# Import contents usage in users df

In [76]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
from tqdm import tqdm

## Import and clean contents

In [8]:
df_contents = pd.read_csv("../raw_data/contents.csv", low_memory = False)

In [9]:
df_contents.columns

Index(['id', 'title', 'type', 'markdown', 'categories', 'niveaux', 'auteur',
       'categorie', 'categorie_principale', 'created_at', 'date_debut',
       'description', 'difficulte', 'downloads_count', 'duree_heures',
       'duree_minutes', 'duree_secondes', 'etat', 'json_categories',
       'json_centre_interet', 'json_niveau', 'json_niveau_detail',
       'landing_url', 'nb_abonnes', 'nb_commentaires', 'nb_presents', 'niveau',
       'page_view_count', 'published_at', 'score_downvote', 'score_upvote',
       'source_url', 'sous_categorie', 'statut', 'titre', 'updated_at', 'url',
       'transition_ecologique', 'sante_mentale', 'ecole_inclusive', 'cps',
       'reussite_tous_eleves'],
      dtype='object')

In [19]:
df_contents.shape

(4898, 42)

In [20]:
df_contents = df_contents[(df_contents.markdown.str.len() > 5) & (df_contents.type.isin(['article',
 'guide_pratique', 'fiche_outils', 'contenu']) ) & (df_contents.markdown.notna())]

In [21]:
df_contents.shape

(1416, 42)

In [23]:
df_contents_lite = df_contents[['id', 'type', 'transition_ecologique', 'sante_mentale', 'ecole_inclusive', 'cps',
       'reussite_tous_eleves']]

In [25]:
df_contents_lite.columns

Index(['id', 'type', 'transition_ecologique', 'sante_mentale',
       'ecole_inclusive', 'cps', 'reussite_tous_eleves'],
      dtype='object')

In [26]:
df_contents_lite['master_theme'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_contents_lite['master_theme'] = None


In [27]:
df_contents_lite.columns

Index(['id', 'type', 'transition_ecologique', 'sante_mentale',
       'ecole_inclusive', 'cps', 'reussite_tous_eleves', 'master_theme'],
      dtype='object')

## Import `interactions` to map contents with users

### Import of database

In [28]:
df_interactions = pd.read_csv('../raw_data/interaction_events.csv', low_memory=False)

In [29]:
df_interactions.shape

(16675373, 11)

### Explore database

In [30]:
df_interactions.columns

Index(['id', 'user_id', 'type', 'content_type', 'content_id', 'context_type',
       'context_id', 'thema_prio', 'url', 'created_at', 'updated_at'],
      dtype='object')

In [31]:
df_interactions.type.unique()

array(['click', 'subscription-ended', 'subscribed',
       'subscription-completed', 'submitted', 'attended', 'download',
       'session', 'connexion', 'inscription', 'contenu_vote',
       'contenu_favoris', 'comment_posted', 'register', 'completed',
       'page_view', 'opened_mail', 'click_mail'], dtype=object)

In [49]:
df_interactions.sample(30)

Unnamed: 0,id,user_id,type,content_type,content_id,context_type,context_id,thema_prio,url,created_at,updated_at
3013141,12043624,36166,opened_mail,onboarding,36dfa826b3,,,,Lancement Film ÊtrePROF - 28/09/2022,2022-09-28 18:04:28,2023-12-01 00:13:19
13477047,22507600,107000,opened_mail,infolettre_hebdo,7422513d98,,,,14/12/24 - ÉLÉMENTAIRE [NEWS_HEBDO],2024-12-14 10:00:40,2024-12-15 05:40:50
5703213,14733696,76702,opened_mail,onboarding,bc9eacbd79,,,,[ENQUETE_FORMATION_CONTINUE]-MAIL 1-230308,2023-03-19 17:21:05,2023-12-01 05:10:50
4015944,13046427,48575,opened_mail,infolettre_hebdo,b350c35345,,,,04/06/2022 - ELEMENTAIRE [NEWS_HEBDO],2022-06-04 14:46:05,2023-12-01 01:55:26
8414914,17445397,43709,session,opened_mail,a954c9e011,infolettre_hebdo,,,06/01/24 - MATERNELLE [NEWS_HEBDO],2024-01-14 02:48:00,2024-01-20 22:32:30
12002464,21032986,46582,download,fiche-outils,540,Contenu,4278.0,,/ressources/4278/7-activites-briseglace-pour-u...,2024-09-07 11:11:59,
16494112,25528714,130344,session,opened_mail,72764ec12f,onboarding,,,[OBSERVATOIRE] BAROMÈTRE IA MAIL 2 - 250521,2025-06-28 10:49:11,2025-06-29 10:59:04
4762474,13792957,58828,opened_mail,infolettre_hebdo,aba3ead9cf,,,,17/06/23 - ÉLÉMENTAIRE [NEWS_HEBDO],2023-06-18 22:01:07,2023-12-01 03:15:34
823961,9854444,2301,session,opened_mail,7b70823d83,infolettre_hebdo,,,22/08/23 - ÉLÉMENTAIRE [NEWS_HEBDO] - ATELIERS,2023-08-22 12:44:42,2023-11-30 20:11:36
14434245,23467277,49525,opened_mail,onboarding,739abdf426,,,,[MARKETING] CPS - 250219,2025-02-23 15:58:38,2025-03-02 01:08:05


In [52]:
df_interactions[df_interactions.content_id == "4840"]

Unnamed: 0,id,user_id,type,content_type,content_id,context_type,context_id,thema_prio,url,created_at,updated_at
10551507,19582011,68,page_view,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2024-06-10 11:25:35,
10551512,19582016,68,page_view,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2024-06-10 11:29:38,
10551513,19582017,68,page_view,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2024-06-10 11:29:47,
10551717,19582221,68,contenu_vote,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2024-06-10 14:08:20,
10553226,19583730,20179,page_view,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2024-06-11 07:37:06,
...,...,...,...,...,...,...,...,...,...,...,...
16673185,25707787,107935,page_view,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2025-07-10 11:30:02,
16674140,25708742,33768,page_view,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2025-07-10 13:06:04,
16674364,25708966,143133,page_view,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2025-07-10 13:49:05,
16674365,25708967,143133,session,Contenu,4840,article,,,/ressources/4840/6-facons-dutiliser-l-ia-dans-...,2025-07-10 13:49:06,


In [39]:
df =pd.DataFrame(df_interactions.content_type.value_counts())

In [40]:
df

Unnamed: 0,content_type
opened_mail,6217444
infolettre_hebdo,5753740
onboarding,2100062
kit,673897
Contenu,398288
...,...
admin-media,1
admin-textes-editables,1
admin-users,1
admin-objectifs,1


In [45]:
df[df.content_type<=1000]

Unnamed: 0,content_type
Live,875
compte-profil-informations,736
parcours-id-slug-termine,718
ateliers-id-slug,708
compte-profil-services,689
...,...
admin-media,1
admin-textes-editables,1
admin-users,1
admin-objectifs,1


In [46]:
df[df.content_type<=1000].index.tolist()

['Live',
 'compte-profil-informations',
 'parcours-id-slug-termine',
 'ateliers-id-slug',
 'compte-profil-services',
 'preparer-noel-ecole',
 'parcours-id-slug',
 'objectif_cap_rentree_2021-id-slug-difficulte',
 'compte-profil-favoris',
 'qui-sommes-nous',
 'parcours-id-slug-questionnaire-questionnaireId',
 'histoire-en-maternelle-alma-studio-seance-pedagogique-nougat',
 'tutos',
 'contact',
 'histoire-en-maternelle-alma-studio-seance-pedagogique-gigi-e',
 'outils-numeriques-enseignants-liste',
 'compte-droits-insuffisants',
 'commentaire',
 'histoire-en-maternelle-alma-studio-seance-pedagogique-peche-',
 'devenir-mentors',
 'compte-profil',
 'objectif_cap_rentree_2021-id-slug-difficulte-contenu-contenu',
 'admin-contenus-modifier-id',
 'article',
 'admin-membres',
 'objectif_cap_rentree_2021-landing',
 'outils-numeriques-enseignants-id-slug',
 'newsletter',
 'admin-ateliers-modifier-id',
 'evenement',
 'compte-profil-abonnement',
 'compte-quitter',
 'admin-ateliers',
 'outils-numeriqu

In [33]:
df_interactions.context_id.unique()

array([  nan,  108.,  102., ..., 1036., 1037., 1038.])

In [34]:
df_interactions.shape

(16675373, 11)

### Filter database before joining content

In [35]:
df_contents.type.unique()

array(['article', 'guide_pratique', 'fiche_outils'], dtype=object)

In [53]:
df_interactions_filtered = df_interactions[df_interactions.content_type.isin(['contenu', 'guide-pratique', 'fiche-outils'])]

In [54]:
df_interactions_filtered.shape

(430048, 11)

In [55]:
df_interactions_filtered = df_interactions_filtered[
    ((df_interactions_filtered.content_type == 'contenu') & (df_interactions_filtered.type == "page_view")) |
    ((df_interactions_filtered.content_type == 'guide-pratique') & (df_interactions_filtered.type == "download")) |
    ((df_interactions_filtered.content_type == 'fiche-outils') & (df_interactions_filtered.type == "download"))
]

In [56]:
df_interactions_filtered.shape

(423845, 11)

## Create `users_content`

#### Are there some same id for different types ? 

In [70]:
df_contents_lite.columns

Index(['id', 'type', 'transition_ecologique', 'sante_mentale',
       'ecole_inclusive', 'cps', 'reussite_tous_eleves', 'master_theme'],
      dtype='object')

In [75]:
df_contents_lite.groupby('id')['type'].nunique()[df_contents_lite.groupby('id')['type'].nunique() > 1].index.tolist()

[]

**Answer** : No !

####    

In [57]:
df_interactions_filtered.columns

Index(['id', 'user_id', 'type', 'content_type', 'content_id', 'context_type',
       'context_id', 'thema_prio', 'url', 'created_at', 'updated_at'],
      dtype='object')

In [79]:
import pandas as pd
from tqdm import tqdm

# 1. CONTENT TYPE FEATURES
# Count interactions by user and content type
content_type_features = df_interactions_filtered.groupby(['user_id', 'content_type']).size().unstack(fill_value=0)
content_type_features.columns = [f'nb_{col.replace("-", "_")}' for col in content_type_features.columns]

# 2. PRIORITY CHALLENGE FEATURES
# Merge with df_contents_lite to get priority challenges and themes
# Fix type mismatch: convert content_id to int to match df_contents_lite id column
df_interactions_filtered['content_id'] = pd.to_numeric(df_interactions_filtered['content_id'], errors='coerce')

df_interactions_with_content = df_interactions_filtered.merge(
    df_contents_lite[['id', 'transition_ecologique', 'sante_mentale', 'ecole_inclusive', 'cps', 'reussite_tous_eleves', 'master_theme']], 
    left_on='content_id', 
    right_on='id', 
    how='left'
)

# Function to count interactions by priority challenge
def count_priority_challenge_interactions(df_merged):
    priority_features = pd.DataFrame(index=df_merged['user_id'].unique())
    priority_features.index.name = 'user_id'  # Ensure index has a name
    
    challenges = ['transition_ecologique', 'sante_mentale', 'ecole_inclusive', 'cps', 'reussite_tous_eleves']
    
    for challenge in tqdm(challenges, desc="Processing priority challenges"):
        # Filter content that matches this challenge (value = 1)
        challenge_interactions = df_merged[df_merged[challenge] == 1]
        
        # Count by user_id
        challenge_counts = challenge_interactions.groupby('user_id').size()
        
        # Add to DataFrame (fill_value=0 for users without interactions on this challenge)
        priority_features[f'nb_{challenge}'] = challenge_counts.reindex(priority_features.index, fill_value=0)
    
    return priority_features.fillna(0).astype(int)

priority_features = count_priority_challenge_interactions(df_interactions_with_content)

# 3. COMBINE INTO FINAL MATRIX
# Start with unique user_ids
df_user_features = pd.DataFrame({'user_id': df_interactions_filtered['user_id'].unique()})

# Merge with content type features
df_user_features = df_user_features.merge(
    content_type_features.reset_index(), 
    on='user_id', 
    how='left'
).fillna(0)

# Merge with priority challenge features
df_user_features = df_user_features.merge(
    priority_features.reset_index(), 
    on='user_id', 
    how='left'
).fillna(0)

# 4. THEME FEATURES
# Filter interactions that have a defined master_theme (not None/NaN)
theme_interactions = df_interactions_with_content.dropna(subset=['master_theme'])

if not theme_interactions.empty:
    # Count interactions by user and theme
    theme_features = theme_interactions.groupby(['user_id', 'master_theme']).size().unstack(fill_value=0)
    theme_features.columns = [f'nb_theme_{col}' for col in theme_features.columns]
    
    # Merge with main matrix
    df_user_features = df_user_features.merge(
        theme_features.reset_index(), 
        on='user_id', 
        how='left'
    ).fillna(0)

# 5. BONUS FEATURES
# Total interactions per user
df_user_features['total_interactions'] = df_interactions_filtered.groupby('user_id').size().values

# Content diversity (number of unique contents consulted)
df_user_features['diversite_contenus'] = df_interactions_filtered.groupby('user_id')['content_id'].nunique().values

# Convert all to integers
numeric_columns = df_user_features.select_dtypes(include=['float64']).columns
df_user_features[numeric_columns] = df_user_features[numeric_columns].astype(int)

Processing priority challenges: 100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 35.88it/s]


In [81]:
df_user_features

Unnamed: 0,user_id,nb_fiche_outils,nb_guide_pratique,nb_transition_ecologique,nb_sante_mentale,nb_ecole_inclusive,nb_cps,nb_reussite_tous_eleves,total_interactions,diversite_contenus
0,1,3,1,0,0,0,0,0,4,4
1,68,1761,291,81,68,169,79,294,54,40
2,5089,0,1,0,0,0,0,0,1,1
3,91525,1,0,0,0,0,0,0,103,67
4,91550,0,2,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
80328,210712,2,0,0,0,0,0,1,1,1
80329,210714,2,0,0,0,0,0,0,2,2
80330,210717,2,0,0,0,0,0,0,2,2
80331,210698,1,0,0,0,0,0,0,2,2


In [85]:
df_user_features.shape

(80333, 10)

In [82]:
df_contents_lite.columns

Index(['id', 'type', 'transition_ecologique', 'sante_mentale',
       'ecole_inclusive', 'cps', 'reussite_tous_eleves', 'master_theme'],
      dtype='object')

In [None]:
array(['click', 'subscription-ended', 'subscribed',
       'subscription-completed', 'submitted', 'attended', 'download',
       'session', 'connexion', 'inscription', 'contenu_vote',
       'contenu_favoris', 'comment_posted', 'register', 'completed',
       'page_view', 'opened_mail', 'click_mail'], dtype=object)

In [124]:
# Créer DataFrame avec user_id uniques
df_engagement = pd.DataFrame({'user_id': df_interactions['user_id'].unique()})

# Compter chaque type d'interaction par user
engagement_counts = df_interactions.groupby(['user_id', 'type']).size().unstack(fill_value=0)

# Ajouter seulement les colonnes qui nous intéressent (avec 0 si elles n'existent pas)
df_engagement['nb_vote'] = engagement_counts.get('contenu_vote', 0).values if 'contenu_vote' in engagement_counts.columns else 0
df_engagement['nb_comments'] = engagement_counts.get('comment_posted', 0).values if 'comment_posted' in engagement_counts.columns else 0
df_engagement['nb_opened_mail'] = engagement_counts.get('opened_mail', 0).values if 'opened_mail' in engagement_counts.columns else 0
df_engagement['nb_clicked_mail'] = engagement_counts.get('click_mail', 0).values if 'click_mail' in engagement_counts.columns else 0

In [130]:
df_engagement

Unnamed: 0,user_id,nb_vote,nb_comments,nb_opened_mail,nb_clicked_mail
0,69952,0,0,77,1
1,68886,0,0,1,0
2,1,0,0,1,0
3,68,31,0,284,12
4,70138,0,0,256,0
...,...,...,...,...,...
198932,210716,0,0,0,0
198933,210717,0,0,0,0
198934,210718,0,0,0,0
198935,210719,0,0,0,0


In [126]:
df_user_features

Unnamed: 0,user_id,nb_fiche_outils,nb_guide_pratique,nb_transition_ecologique,nb_sante_mentale,nb_ecole_inclusive,nb_cps,nb_reussite_tous_eleves,total_interactions,diversite_contenus
0,1,3,1,0,0,0,0,0,4,4
1,68,1761,291,81,68,169,79,294,54,40
2,5089,0,1,0,0,0,0,0,1,1
3,91525,1,0,0,0,0,0,0,103,67
4,91550,0,2,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
80328,210712,2,0,0,0,0,0,1,1,1
80329,210714,2,0,0,0,0,0,0,2,2
80330,210717,2,0,0,0,0,0,0,2,2
80331,210698,1,0,0,0,0,0,0,2,2


In [131]:
print("Colonnes df_engagement:", df_engagement.columns.tolist())
print("Colonnes df_user_features:", df_user_features.columns.tolist())
print("Colonnes communes:", set(df_engagement.columns) & set(df_user_features.columns))

Colonnes df_engagement: ['user_id', 'nb_vote', 'nb_comments', 'nb_opened_mail', 'nb_clicked_mail']
Colonnes df_user_features: ['user_id', 'nb_fiche_outils', 'nb_guide_pratique', 'nb_transition_ecologique', 'nb_sante_mentale', 'nb_ecole_inclusive', 'nb_cps', 'nb_reussite_tous_eleves', 'total_interactions', 'diversite_contenus']
Colonnes communes: {'user_id'}


In [132]:
df_users_featured = df_user_features.merge(
    df_engagement, 
    on='user_id', 
    how='outer'
).fillna(0)

In [133]:
df_users_featured.shape

(198937, 14)

In [134]:
df_user_final = pd.read_csv("../data/users_cleaned_and_frequency.csv", low_memory=False)

In [135]:
df_users_featured.columns

Index(['user_id', 'nb_fiche_outils', 'nb_guide_pratique',
       'nb_transition_ecologique', 'nb_sante_mentale', 'nb_ecole_inclusive',
       'nb_cps', 'nb_reussite_tous_eleves', 'total_interactions',
       'diversite_contenus', 'nb_vote', 'nb_comments', 'nb_opened_mail',
       'nb_clicked_mail'],
      dtype='object')

In [136]:
df_users_featured = df_users_featured.rename(columns={'user_id': 'id'})

In [137]:
df_users_featured.columns

Index(['id', 'nb_fiche_outils', 'nb_guide_pratique',
       'nb_transition_ecologique', 'nb_sante_mentale', 'nb_ecole_inclusive',
       'nb_cps', 'nb_reussite_tous_eleves', 'total_interactions',
       'diversite_contenus', 'nb_vote', 'nb_comments', 'nb_opened_mail',
       'nb_clicked_mail'],
      dtype='object')

In [138]:
df_user_final.shape

(198889, 71)

In [139]:
df_user_final.columns

Index(['id', 'statut_infolettre', 'statut_mailchimp', 'code_postal',
       'departement', 'academie', 'anciennete', 'created_at', 'degre',
       'maternelle', 'elementaire', 'college', 'lycee', 'lycee_pro', 'autre',
       'type_etab', 'discipline', 'niveau_tps', 'niveau_ps', 'niveau_ms',
       'niveau_gs', 'niveau_cp', 'niveau_ce1', 'niveau_ce2', 'niveau_cm1',
       'niveau_cm2', 'niveau_6e', 'niveau_5e', 'niveau_4e', 'niveau_3e',
       'niveau_2nde', 'niveau_1ere', 'niveau_terminale', 'niveau_cap',
       'niveau_bac_pro', 'niveau_post_bac', 'niveau_segpa', 'niveau_ash',
       'niveau_direction', 'niveau_formateur', 'niveau_documentaliste',
       'join_date', 'last_action_date', 'total_interactions', 'week_minus_0',
       'week_minus_1', 'week_minus_2', 'week_minus_3', 'week_minus_4',
       'week_minus_5', 'week_minus_6', 'week_minus_7', 'week_minus_8',
       'week_minus_9', 'week_minus_10', 'week_minus_11', 'month_minus_0',
       'month_minus_1', 'month_minus_2', 'month_m

In [140]:
df_users_featured.shape

(198937, 14)

In [141]:
df_complete = df_users_featured.merge(
    df_user_final,  
    on='id', 
    how='right'
)


In [142]:
df_complete.shape

(198889, 84)

In [143]:
df_complete.columns

Index(['id', 'nb_fiche_outils', 'nb_guide_pratique',
       'nb_transition_ecologique', 'nb_sante_mentale', 'nb_ecole_inclusive',
       'nb_cps', 'nb_reussite_tous_eleves', 'total_interactions_x',
       'diversite_contenus', 'nb_vote', 'nb_comments', 'nb_opened_mail',
       'nb_clicked_mail', 'statut_infolettre', 'statut_mailchimp',
       'code_postal', 'departement', 'academie', 'anciennete', 'created_at',
       'degre', 'maternelle', 'elementaire', 'college', 'lycee', 'lycee_pro',
       'autre', 'type_etab', 'discipline', 'niveau_tps', 'niveau_ps',
       'niveau_ms', 'niveau_gs', 'niveau_cp', 'niveau_ce1', 'niveau_ce2',
       'niveau_cm1', 'niveau_cm2', 'niveau_6e', 'niveau_5e', 'niveau_4e',
       'niveau_3e', 'niveau_2nde', 'niveau_1ere', 'niveau_terminale',
       'niveau_cap', 'niveau_bac_pro', 'niveau_post_bac', 'niveau_segpa',
       'niveau_ash', 'niveau_direction', 'niveau_formateur',
       'niveau_documentaliste', 'join_date', 'last_action_date',
       'total_inter

In [144]:
df_complete.to_csv("../data/users_cleaned_with_frequency_engagement_subjects.csv", index=False)