In [19]:
import os

# Liste les dossiers et fichiers dans /kaggle/input/
print("Contenu de /kaggle/input/:")
print(os.listdir('/kaggle/input/'))
data_dir = '/kaggle/input/'  
for item in os.listdir(data_dir):
    path = os.path.join(data_dir, item)
    if os.path.isdir(path):
        print(f"\nContenu de {path}:")
        print(os.listdir(path))

Contenu de /kaggle/input/:
['datasetmovielens1m']

Contenu de /kaggle/input/datasetmovielens1m:
['ml-1m']


In [20]:
import pandas as pd
import os

data_path = "/kaggle/input/datasetmovielens1m/ml-1m/"

# Chargement des fichiers avec encodage latin-1
ratings = pd.read_csv(os.path.join(data_path, "ratings.dat"),
                      sep="::",
                      engine='python',
                      names=["UserID", "MovieID", "Rating", "Timestamp"],
                      encoding='latin-1')

movies = pd.read_csv(os.path.join(data_path, "movies.dat"),
                     sep="::",
                     engine='python',
                     names=["MovieID", "Title", "Genres"],
                     encoding='latin-1')

users = pd.read_csv(os.path.join(data_path, "users.dat"),
                    sep="::",
                    engine='python',
                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
                    encoding='latin-1')

# Vérifier le chargement
print("Ratings :")
print(ratings.head())

print("\n Movies :")
print(movies.head())

print("\n Users :")
print(users.head())

Ratings :
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

 Movies :
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

 Users :
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


In [21]:
# === Vérification du nettoyage ===
print("=== STRUCTURE DES FICHIERS ===")
print("\nRatings:")
print(ratings.info())
print("\nMovies:")
print(movies.info())
print("\nUsers:")
print(users.info())

print("\n=== VALEURS MANQUANTES ===")
print("Ratings:\n", ratings.isnull().sum())
print("\nMovies:\n", movies.isnull().sum())
print("\nUsers:\n", users.isnull().sum())

print("\n=== DOUBLONS ===")
print("Ratings:", ratings.duplicated().sum())
print("Movies:", movies.duplicated().sum())
print("Users:", users.duplicated().sum())

print("\n=== VALEURS UNIQUES ===")
print("Ratings uniques:", ratings['Rating'].unique())
print("Genres (exemples):", movies['Genres'].unique()[:10])
print("Gender uniques:", users['Gender'].unique())

print("\n=== APERÇU DES DONNÉES ===")
print("Ratings:\n", ratings.head())
print("\nMovies:\n", movies.head())
print("\nUsers:\n", users.head())

=== STRUCTURE DES FICHIERS ===

Ratings:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB
None

Movies:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None

Users:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID  

In [22]:
# Fusion des trois fichiers
merged_df = pd.merge(ratings, users, on='UserID')      
merged_df = pd.merge(merged_df, movies, on='MovieID') 

# Afficher un aperçu
print(" Aperçu du DataFrame fusionné ")
print(merged_df.head())

# Dimensions du DataFrame final
print("\nDimensions :", merged_df.shape)

 Aperçu du DataFrame fusionné 
   UserID  MovieID  Rating  Timestamp Gender  Age  Occupation Zip-code  \
0       1     1193       5  978300760      F    1          10    48067   
1       1      661       3  978302109      F    1          10    48067   
2       1      914       3  978301968      F    1          10    48067   
3       1     3408       4  978300275      F    1          10    48067   
4       1     2355       5  978824291      F    1          10    48067   

                                    Title                        Genres  
0  One Flew Over the Cuckoo's Nest (1975)                         Drama  
1        James and the Giant Peach (1996)  Animation|Children's|Musical  
2                     My Fair Lady (1964)               Musical|Romance  
3                  Erin Brockovich (2000)                         Drama  
4                    Bug's Life, A (1998)   Animation|Children's|Comedy  

Dimensions : (1000209, 10)


In [23]:
print("Colonnes du DataFrame fusionné :")
print(merged_df.columns)

Colonnes du DataFrame fusionné :
Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Gender', 'Age',
       'Occupation', 'Zip-code', 'Title', 'Genres'],
      dtype='object')


In [24]:
import pandas as pd

# Ton DataFrame fusionné
df = merged_df

# Colonnes principales
user_col = 'UserID'
item_col = 'MovieID'
rating_col = 'Rating'

#  Statistiques générales 
num_columns = df.shape[1]
num_users = df[user_col].nunique()
num_items = df[item_col].nunique()
num_interactions = df[rating_col].count()

# Colonnes contextuelles (toutes sauf User, Item, Rating)
contextual_columns = [col for col in df.columns if col not in [user_col, item_col, rating_col]]
num_contextual_dimensions = len(contextual_columns)

# === Statistiques des items ===
interactions_per_item = df.groupby(item_col)[rating_col].count()
ratings_stats = df.groupby(item_col)[rating_col].agg(['mean', 'min', 'max', 'std'])

# === Affichage ===
print(" Statistiques générales du dataset fusionné")
print(f"Nombre de colonnes : {num_columns}")
print(f"Nombre d'utilisateurs uniques : {num_users}")
print(f"Nombre de films uniques : {num_items}")
print(f"Nombre total d'interactions (notes) : {num_interactions}")
print(f"Nombre de dimensions contextuelles : {num_contextual_dimensions}")
print(f"Colonnes contextuelles : {contextual_columns}\n")

print(" Statistiques des films")
print("Exemple du nombre d'interactions par film :")
print(interactions_per_item.head(), "\n")

print("Exemple des statistiques des notes par film :")
print(ratings_stats.head())

 Statistiques générales du dataset fusionné
Nombre de colonnes : 10
Nombre d'utilisateurs uniques : 6040
Nombre de films uniques : 3706
Nombre total d'interactions (notes) : 1000209
Nombre de dimensions contextuelles : 7
Colonnes contextuelles : ['Timestamp', 'Gender', 'Age', 'Occupation', 'Zip-code', 'Title', 'Genres']

 Statistiques des films
Exemple du nombre d'interactions par film :
MovieID
1    2077
2     701
3     478
4     170
5     296
Name: Rating, dtype: int64 

Exemple des statistiques des notes par film :
             mean  min  max       std
MovieID                              
1        4.146846    1    5  0.852349
2        3.201141    1    5  0.983172
3        3.016736    1    5  1.071712
4        2.729412    1    5  1.013381
5        3.006757    1    5  1.025086


In [25]:
# Vérification de la propreté du DataFrame fusionné 

print(" Dimensions du DataFrame fusionné ")
print(merged_df.shape)

print("\n Présence de valeurs manquantes (NaN) ")
print(merged_df.isnull().sum())

print("\n Nombre total de valeurs manquantes ")
print(merged_df.isnull().sum().sum())

print("\n Présence de doublons ")
num_duplicates = merged_df.duplicated().sum()
print(f"Nombre de lignes dupliquées : {num_duplicates}")

if num_duplicates > 0:
    print("\nExemples de doublons :")
    print(merged_df[merged_df.duplicated()].head())
else:
    print("Aucun doublon trouvé.")

print("\n Types de données ")
print(merged_df.dtypes)

print("\n Aperçu des 5 premières lignes ")
print(merged_df.head())


 Dimensions du DataFrame fusionné 
(1000209, 10)

 Présence de valeurs manquantes (NaN) 
UserID        0
MovieID       0
Rating        0
Timestamp     0
Gender        0
Age           0
Occupation    0
Zip-code      0
Title         0
Genres        0
dtype: int64

 Nombre total de valeurs manquantes 
0

 Présence de doublons 
Nombre de lignes dupliquées : 0
Aucun doublon trouvé.

 Types de données 
UserID         int64
MovieID        int64
Rating         int64
Timestamp      int64
Gender        object
Age            int64
Occupation     int64
Zip-code      object
Title         object
Genres        object
dtype: object

 Aperçu des 5 premières lignes 
   UserID  MovieID  Rating  Timestamp Gender  Age  Occupation Zip-code  \
0       1     1193       5  978300760      F    1          10    48067   
1       1      661       3  978302109      F    1          10    48067   
2       1      914       3  978301968      F    1          10    48067   
3       1     3408       4  978300275      F   

In [26]:
# # Supprimer les utilisateurs avec âge extrême dans le DataFrame fusionné
# merged_df = merged_df[(merged_df['Age'] >= 3) & (merged_df['Age'] <= 100)]

# # Vérifier
# print("Dimensions après suppression des âges extrêmes :", merged_df.shape)
# print("Aperçu :")
# print(merged_df[['UserID', 'Age']].head(10))

In [27]:
# Vérifier combien de lignes seraient supprimées
num_extreme_ages = merged_df[(merged_df['Age'] < 3) | (merged_df['Age'] > 100)].shape[0]
print(f"Lignes avec âges extrêmes (<3 ou >100) : {num_extreme_ages}")

# Appliquer le filtrage uniquement si nécessaire
merged_df = merged_df[(merged_df['Age'] >= 3) & (merged_df['Age'] <= 100)]

# Vérifier le résultat
print("Dimensions après suppression des âges extrêmes :", merged_df.shape)
print("Aperçu :")
print(merged_df[['UserID', 'Age']].head(10))

Lignes avec âges extrêmes (<3 ou >100) : 27211
Dimensions après suppression des âges extrêmes : (972998, 10)
Aperçu :
    UserID  Age
53       2   56
54       2   56
55       2   56
56       2   56
57       2   56
58       2   56
59       2   56
60       2   56
61       2   56
62       2   56


In [28]:
# Remplacer les séparateurs '|' par des virgules 
merged_df['Genres'] = merged_df['Genres'].str.replace('|', ', ', regex=False)

# Vérifier le résultat
print(merged_df[['MovieID', 'Title', 'Genres']].head(10))

    MovieID                                             Title  \
53     1357                                      Shine (1996)   
54     3068                               Verdict, The (1982)   
55     1537          Shall We Dance? (Shall We Dansu?) (1996)   
56      647                         Courage Under Fire (1996)   
57     2194                          Untouchables, The (1987)   
58      648                        Mission: Impossible (1996)   
59     2268                            Few Good Men, A (1992)   
60     2628  Star Wars: Episode I - The Phantom Menace (1999)   
61     1103                      Rebel Without a Cause (1955)   
62     2916                               Total Recall (1990)   

                                 Genres  
53                       Drama, Romance  
54                                Drama  
55                               Comedy  
56                           Drama, War  
57                 Action, Crime, Drama  
58           Action, Adventure, 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Genres'] = merged_df['Genres'].str.replace('|', ', ', regex=False)


In [29]:
def age_to_group(age):
    if age <= 17:
        return "Under 18"
    elif 18 <= age <= 24:
        return "18-24"
    elif 25 <= age <= 34:
        return "25-34"
    elif 35 <= age <= 44:
        return "35-44"
    elif 45 <= age <= 49:
        return "45-49"
    elif 50 <= age <= 55:
        return "50-55"
    else:
        return "56+"

merged_df['AgeGroup'] = merged_df['Age'].apply(age_to_group)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['AgeGroup'] = merged_df['Age'].apply(age_to_group)


In [30]:
occupation_map = {
    0:"other",1:"academic/educator",2:"artist",3:"clerical/admin",
    4:"college/grad student",5:"customer service",6:"doctor/health care",
    7:"executive/managerial",8:"farmer",9:"homemaker",10:"K-12 student",
    11:"lawyer",12:"programmer",13:"retired",14:"sales/marketing",
    15:"scientist",16:"self-employed",17:"technician/engineer",
    18:"tradesman/craftsman",19:"unemployed",20:"writer"
}
merged_df['Occupation'] = merged_df['Occupation'].map(occupation_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Occupation'] = merged_df['Occupation'].map(occupation_map)


In [31]:
merged_df['Gender'] = merged_df['Gender'].map({'M':'Male','F':'Female'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Gender'] = merged_df['Gender'].map({'M':'Male','F':'Female'})


In [32]:
# Afficher les 5 premières lignes
print(merged_df.head())

    UserID  MovieID  Rating  Timestamp Gender  Age     Occupation Zip-code  \
53       2     1357       5  978298709   Male   56  self-employed    70072   
54       2     3068       4  978299000   Male   56  self-employed    70072   
55       2     1537       4  978299620   Male   56  self-employed    70072   
56       2      647       3  978299351   Male   56  self-employed    70072   
57       2     2194       4  978299297   Male   56  self-employed    70072   

                                       Title                Genres AgeGroup  
53                              Shine (1996)        Drama, Romance      56+  
54                       Verdict, The (1982)                 Drama      56+  
55  Shall We Dance? (Shall We Dansu?) (1996)                Comedy      56+  
56                 Courage Under Fire (1996)            Drama, War      56+  
57                  Untouchables, The (1987)  Action, Crime, Drama      56+  


In [33]:
# Supprimer la colonne Zip-code
merged_df = merged_df.drop(columns=['Zip-code','Age'])

# Vérifier que la colonne est bien supprimée
print(merged_df.columns)

Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Gender', 'Occupation',
       'Title', 'Genres', 'AgeGroup'],
      dtype='object')


In [34]:
import pandas as pd

# Ton DataFrame final après nettoyage et mapping
df = merged_df

# Colonnes principales
user_col = 'UserID'
item_col = 'MovieID'
rating_col = 'Rating'

# Statistiques générales
num_columns = df.shape[1]
num_users = df[user_col].nunique()
num_items = df[item_col].nunique()
num_interactions = df[rating_col].count()

# Colonnes contextuelles (tout sauf UserID, MovieID, Rating)
contextual_columns = [col for col in df.columns if col not in [user_col, item_col, rating_col]]
num_contextual_dimensions = len(contextual_columns)

# Statistiques des films
interactions_per_item = df.groupby(item_col)[rating_col].count()
ratings_stats = df.groupby(item_col)[rating_col].agg(['mean', 'min', 'max', 'std'])

# Affichage
print("Statistiques générales du dataset fusionné ")
print(f"Nombre de colonnes : {num_columns}")
print(f"Nombre d'utilisateurs uniques : {num_users}")
print(f"Nombre de films uniques : {num_items}")
print(f"Nombre total d'interactions (notes) : {num_interactions}")
print(f"Nombre de dimensions contextuelles : {num_contextual_dimensions}")
print(f"Colonnes contextuelles : {contextual_columns}\n")

print(" Statistiques des films")
print("Exemple du nombre d'interactions par film :")
print(interactions_per_item.head(), "\n")

print("Exemple des statistiques des notes par film :")
print(ratings_stats.head())

Statistiques générales du dataset fusionné 
Nombre de colonnes : 9
Nombre d'utilisateurs uniques : 5818
Nombre de films uniques : 3702
Nombre total d'interactions (notes) : 972998
Nombre de dimensions contextuelles : 6
Colonnes contextuelles : ['Timestamp', 'Gender', 'Occupation', 'Title', 'Genres', 'AgeGroup']

 Statistiques des films
Exemple du nombre d'interactions par film :
MovieID
1    1965
2     665
3     460
4     167
5     281
Name: Rating, dtype: int64 

Exemple des statistiques des notes par film :
             mean  min  max       std
MovieID                              
1        4.159796    1    5  0.844843
2        3.190977    1    5  0.983865
3        2.997826    1    5  1.065397
4        2.718563    1    5  1.017238
5        2.985765    1    5  1.010557


In [35]:
output_path = '/kaggle/working/merged_dataset_final.csv'
merged_df.to_csv(output_path, index=False)
print(f"Dataset sauvegardé en CSV : {output_path}")

Dataset sauvegardé en CSV : /kaggle/working/merged_dataset_final.csv


In [36]:
# Nombre de ratings par utilisateur
ratings_per_user = merged_df.groupby('UserID')['Rating'].count()

# Minimum de ratings par utilisateur
min_ratings = ratings_per_user.min()
print(f"Chaque utilisateur a au minimum {min_ratings} ratings")

Chaque utilisateur a au minimum 20 ratings
