# Frequency of user usage

## Imports

In [26]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm

## Import `interaction`

In [2]:
df_interactions = pd.read_csv('../raw_data/interaction_events.csv', low_memory=False)

In [3]:
df_interactions.shape

(16675373, 11)

In [4]:
df_interactions.columns

Index(['id', 'user_id', 'type', 'content_type', 'content_id', 'context_type',
       'context_id', 'thema_prio', 'url', 'created_at', 'updated_at'],
      dtype='object')

#### Old calcul way

In [None]:
# def create_temporal_engagement_df_optimized(df_interactions):
#     """
#     Version encore plus optimisée utilisant groupby + apply
#     """
#     print("🚀 Version optimisée avec groupby + apply...")
    
#     # Préparation
#     df_interactions['created_at'] = pd.to_datetime(df_interactions['created_at'])
#     df_interactions = df_interactions.dropna(subset=['user_id', 'created_at'])
    
#     # Définition des périodes
#     now = datetime.now()
#     semaines = [(now - timedelta(weeks=i+1), now - timedelta(weeks=i)) for i in range(12)]
#     mois = [(now - timedelta(days=30*(i+1)), now - timedelta(days=30*i)) for i in range(12)]
#     annees = [(now - timedelta(days=365*(i+1)), now - timedelta(days=365*i)) for i in range(3)]
    
#     def process_user_group(group):
#         """Traite un groupe d'interactions pour un utilisateur"""
#         dates = group['created_at']
        
#         result = {}
        
#         # Semaines
#         for i, (debut, fin) in enumerate(semaines):
#             result[f'nb_actions_semaine_{i}'] = dates[(dates >= debut) & (dates < fin)].count()
        
#         # Mois
#         for i, (debut, fin) in enumerate(mois):
#             result[f'nb_actions_mois_{i}'] = dates[(dates >= debut) & (dates < fin)].count()
        
#         # Années
#         for i, (debut, fin) in enumerate(annees):
#             result[f'nb_actions_annee_{i}'] = dates[(dates >= debut) & (dates < fin)].count()
        
#         # Date dernière action
#         result['date_derniere_action'] = dates.max()
        
#         print(len(result))
        
#         return pd.Series(result)
    
#     # Application avec groupby + apply
#     df_engagement = df_interactions.groupby('user_id').apply(process_user_group).reset_index()
    
#     print("✅ Calcul optimisé terminé !")
#     return df_engagement



#### Define a function to apply

In [34]:

def create_temporal_engagement_df_optimized(df_interactions):
    """
    Create a temporal user engagement DataFrame from interaction data.
    
    This function analyzes user interactions to generate engagement metrics across
    different time periods (weeks, months, years) and calculates usage statistics
    for each user.
    
    Parameters
    ----------
    df_interactions : pandas.DataFrame
        DataFrame containing user interactions with at minimum the columns:
        - 'user_id' : Unique user identifier
        - 'created_at' : Date/time of interaction (will be converted to datetime)
    
    Returns
    -------
    pandas.DataFrame
        DataFrame with one row per unique user containing:
        
        Identification columns:
        - 'id' : User identifier (formerly user_id)
        - 'join_date' : Date of first interaction
        - 'last_action_date' : Date of last interaction
        - 'total_interactions' : Total number of interactions
        
        Weekly metrics (12 columns):
        - 'week_minus_0' to 'week_minus_11' : Number of actions per week
          (0 = current week, 11 = 11 weeks ago)
        
        Monthly metrics (12 columns):
        - 'month_minus_0' to 'month_minus_11' : Number of actions per month
          (0 = current month, 11 = 11 months ago)
        
        Yearly metrics (3 columns):
        - 'year_minus_0' to 'year_minus_2' : Number of actions per year
          (0 = current year, 2 = 2 years ago)
    
    Notes
    -----
    - Time periods are calculated retrospectively from datetime.now()
    - Weeks = 7 days, months = 30 days, years = 365 days
    - Rows with missing user_id or created_at are dropped
    - Function displays progress bar via tqdm
    - Execution time is measured and displayed
    
    Examples
    --------
    >>> df_interactions = pd.DataFrame({
    ...     'user_id': [1, 1, 2, 2, 3],
    ...     'created_at': ['2024-01-01', '2024-01-15', '2024-02-01', '2024-02-10', '2024-03-01']
    ... })
    >>> df_engagement = create_temporal_engagement_df_optimized(df_interactions)
    🚀 Optimized version with groupby + apply...
    📊 Unique users: 3
    📊 Total interactions: 5
    ⚡ Processing...
    ✅ Optimized calculation completed in 0.12 seconds!
    
    Raises
    ------
    KeyError
        If 'user_id' or 'created_at' columns are missing
    ValueError
        If datetime conversion fails for 'created_at'
    
    See Also
    --------
    pandas.DataFrame.groupby : Grouping method used for aggregation
    tqdm.pandas : Progress bar for pandas operations 
    """
#     import time
#     from tqdm import tqdm
    
    # Activate tqdm for pandas
    tqdm.pandas()
    
    start_time = time.time()
    
    # Prepare
    df_interactions['created_at'] = pd.to_datetime(df_interactions['created_at'])
    df_interactions = df_interactions.dropna(subset=['user_id', 'created_at'])
    
    print(f"📊 Unique users : {df_interactions['user_id'].nunique()}")
    print(f"📊 Total interactions : {len(df_interactions)}")
    
    # Define periods
    now = datetime.now()
    weeks = [(now - timedelta(weeks=i+1), now - timedelta(weeks=i)) for i in range(12)]
    months = [(now - timedelta(days=30*(i+1)), now - timedelta(days=30*i)) for i in range(12)]
    years = [(now - timedelta(days=365*(i+1)), now - timedelta(days=365*i)) for i in range(3)]
    
    def process_user_group(group):
        """
        Process interaction group for a single user to calculate temporal engagement metrics.
        
        Parameters
        ----------
        group : pandas.DataFrame
            Group of interactions for one user containing 'created_at' column.
            
        Returns
        -------
        pandas.Series
            Series with engagement metrics: weekly/monthly/yearly counts,
            join_date, last_action_date, and total_interactions.
        """
        dates = group['created_at']
        
        result = {}
        
        # Weekks
        for i, (debut, fin) in enumerate(weeks):
            result[f'week_minus_{i}'] = dates[(dates >= debut) & (dates < fin)].count()
        
        # Months
        for i, (debut, fin) in enumerate(months):
            result[f'month_minus_{i}'] = dates[(dates >= debut) & (dates < fin)].count()
        
        # Years
        for i, (debut, fin) in enumerate(years):
            result[f'year_minus_{i}'] = dates[(dates >= debut) & (dates < fin)].count()
        
        # Complementary datas
        result['join_date'] = dates.min()  # First interaction
        result['last_action_date'] = dates.max()  # Last action
        result['total_interactions'] = len(dates)  # Total number of interactions
        
        return pd.Series(result)
    
    print("⚡ Run in progress...")
    df_engagement = df_interactions.groupby('user_id').progress_apply(process_user_group).reset_index()
    
    df_engagement = df_engagement.rename(columns={'user_id': 'id'})
    
    # Reorder columns
    column_order = ['id', 'join_date', 'last_action_date', 'total_interactions']
    column_order.extend([f'week_minus_{i}' for i in range(12)])
    column_order.extend([f'month_minus_{i}' for i in range(12)])
    column_order.extend([f'year_minus_{i}' for i in range(3)])
    
    df_engagement = df_engagement[column_order]
    
    # Total execution time
    end_time = time.time()
    execution_time = end_time - start_time
    
    print(f"✅ Time elapsed to run processing {execution_time:.2f} secondes!")
    
    return df_engagement

#### Let's run on df

In [28]:
df_engagement = create_temporal_engagement_df_optimized(df_interactions)

🚀 Version optimisée avec groupby + apply...
📊 Nombre d'utilisateurs uniques : 198937
📊 Nombre d'interactions : 16675373
⚡ Traitement en cours...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 198937/198937 [45:01<00:00, 73.65it/s]


✅ Calcul optimisé terminé en 2704.17 secondes!
⚡ Soit 45.1 minutes pour 198937 utilisateurs
CPU times: user 44min 22s, sys: 1min 1s, total: 45min 24s
Wall time: 45min 8s


In [25]:
df_engagement.shape

(198937, 31)

In [11]:
df_engagement.columns

Index(['user_id', 'nb_actions_semaine_0', 'nb_actions_semaine_1',
       'nb_actions_semaine_2', 'nb_actions_semaine_3', 'nb_actions_semaine_4',
       'nb_actions_semaine_5', 'nb_actions_semaine_6', 'nb_actions_semaine_7',
       'nb_actions_semaine_8', 'nb_actions_semaine_9', 'nb_actions_semaine_10',
       'nb_actions_semaine_11', 'nb_actions_mois_0', 'nb_actions_mois_1',
       'nb_actions_mois_2', 'nb_actions_mois_3', 'nb_actions_mois_4',
       'nb_actions_mois_5', 'nb_actions_mois_6', 'nb_actions_mois_7',
       'nb_actions_mois_8', 'nb_actions_mois_9', 'nb_actions_mois_10',
       'nb_actions_mois_11', 'nb_actions_annee_0', 'nb_actions_annee_1',
       'nb_actions_annee_2', 'date_derniere_action'],
      dtype='object')

### Import users_cleaned database

In [35]:
!pwd

/home/chris/code/ChrisLPH/etreprof_lewagon/notebooks


In [36]:
df_users = pd.read_csv("../data/users_cleaned.csv", low_memory=False)

In [37]:
%%time
df_users_enriched = df_users.merge(df_engagement, on='id', how='left')

CPU times: user 94.8 ms, sys: 97.9 ms, total: 193 ms
Wall time: 190 ms


In [38]:
df_users_enriched.head()

Unnamed: 0,id,statut_infolettre,statut_mailchimp,code_postal,departement,academie,anciennete,created_at,degre,maternelle,...,month_minus_5,month_minus_6,month_minus_7,month_minus_8,month_minus_9,month_minus_10,month_minus_11,year_minus_0,year_minus_1,year_minus_2
0,1,1,subscribed,78770.0,78.0,Versailles,24.0,2016-12-31,1,1,...,13.0,18.0,22.0,8.0,8.0,12.0,4.0,239.0,132.0,27.0
1,2,1,unsubscribed,31130.0,31.0,Toulouse,9.0,2016-12-31,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,3,1,unsubscribed,,,,8.0,2017-01-11,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,4,1,subscribed,93260.0,93.0,Créteil,12.0,2017-01-13,1,1,...,35.0,12.0,34.0,39.0,59.0,64.0,47.0,449.0,746.0,307.0
4,5,1,subscribed,75020.0,75.0,Paris,8.0,2017-01-13,2,0,...,20.0,11.0,16.0,21.0,13.0,10.0,13.0,188.0,189.0,117.0


In [39]:
df_users_enriched.shape

(198889, 71)

In [40]:
df_users_enriched.to_csv("../data/users_cleaned_and_frequency.csv", index=False)