In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# afficher toutes les colonnes
pd.set_option('display.max_columns', None)

In [20]:
def read_ratings(data) -> pd.DataFrame:
    temp = pd.DataFrame(LabelEncoder().fit_transform(data["movieId"]))
    data["movieId"] = temp
    return data

def read_movies(data) -> pd.DataFrame:
    genres = data["genres"].str.get_dummies(sep="|")
    result_df = pd.concat([data[["movieId", "title"]], genres], axis=1)
    return result_df

In [34]:
movies = pd.read_csv('../data/raw/movies.csv', encoding="utf-8")
ratings = pd.read_csv('../data/raw/ratings.csv', encoding="utf-8")
display(movies.head())
display(ratings.head())

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [40]:
# head of ratings where movieId is 1
display(ratings[ratings["movieId"] == 1].head())

# top 20 biggest values of movieId in ratings
display(pd.Series(ratings["movieId"].unique()).sort_values(ascending=False).head(20))


Unnamed: 0,userId,movieId,rating,timestamp
236,3,1,4.0,944919407
517,6,1,5.0,858275452
817,8,1,4.0,833981871
922,10,1,4.0,943497887
960,11,1,4.5,1230858821


26660    131262
23072    131260
19675    131258
23927    131256
23926    131254
23925    131252
23924    131250
23923    131248
23922    131243
23921    131241
23920    131239
26686    131237
22851    131231
25343    131180
25145    131176
25144    131174
25995    131172
24692    131170
22979    131168
22096    131166
dtype: int64

In [22]:
movies = read_movies(movies)
ratings = read_ratings(ratings)
display(movies.head())
display(ratings.head())

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,3.5,1112486027
1,1,28,3.5,1112484676
2,1,31,3.5,1112484819
3,1,46,3.5,1112484727
4,1,49,3.5,1112484580


In [23]:
movie_ratings = ratings.merge(movies, on="movieId", how="inner")
display(movie_ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,3.5,1112486027,Toy Story,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,28,3.5,1112484676,Persuasion,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
2,1,31,3.5,1112484819,Dangerous Minds,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,46,3.5,1112484727,How to Make an American Quilt,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,1,49,3.5,1112484580,When Night Is Falling,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [24]:
movie_ratings = movie_ratings.drop(
        ["movieId", "timestamp", "title", "rating"], axis=1
    )
display(movie_ratings.head())

Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [26]:
# nombre de ligne dans movie_ratings
print(f"Nombre de ligne dans movie_ratings: {movie_ratings.shape[0]}")

# nombre de valeurs différentes dans la colonne userId de movie_ratings
print(f"Nombre de valeurs différentes dans la colonne userId de movie_ratings: {movie_ratings['userId'].nunique()}")

Nombre de ligne dans movie_ratings: 17123169
Nombre de valeurs différentes dans la colonne userId de movie_ratings: 138485


In [27]:
# Calculate user_matrix
user_matrix = movie_ratings.groupby("userId").agg(
    "mean",
)
display(user_matrix.head())

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.129412,0.123529,0.029412,0.082353,0.352941,0.141176,0.029412,0.505882,0.041176,0.017647,0.076471,0.005882,0.041176,0.047059,0.205882,0.076471,0.182353,0.041176,0.023529
2,0.0,0.083333,0.1,0.033333,0.066667,0.383333,0.033333,0.033333,0.483333,0.066667,0.016667,0.116667,0.0,0.066667,0.05,0.233333,0.083333,0.233333,0.05,0.016667
3,0.0,0.127072,0.088398,0.022099,0.055249,0.298343,0.071823,0.027624,0.475138,0.044199,0.027624,0.116022,0.0,0.038674,0.044199,0.132597,0.116022,0.176796,0.049724,0.01105
4,0.0,0.178571,0.142857,0.071429,0.071429,0.357143,0.035714,0.035714,0.571429,0.035714,0.0,0.035714,0.035714,0.107143,0.071429,0.107143,0.0,0.178571,0.035714,0.107143
5,0.0,0.142857,0.095238,0.079365,0.126984,0.380952,0.015873,0.063492,0.555556,0.063492,0.0,0.047619,0.015873,0.079365,0.063492,0.142857,0.031746,0.238095,0.031746,0.015873


In [30]:
# nombre de ligne dans user_matrix
print(f"Nombre de ligne dans user_matrix: {user_matrix.shape[0]}")

Nombre de ligne dans user_matrix: 138485


In [31]:
display(movies.head())

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
movies_matrix = movies.drop(
    "title", axis=1
)
# movieId est la clé primaire de la table movies
movies_matrix = movies_matrix.set_index("movieId")
display(movies_matrix.head())

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
def read_ratings(data) -> pd.DataFrame:
    temp = pd.DataFrame(LabelEncoder().fit_transform(data["movieId"]))
    data["movieId"] = temp
    return data

def read_movies(data) -> pd.DataFrame:
    genres = data["genres"].str.get_dummies(sep="|")
    result_df = pd.concat([data[["movieId", "title"]], genres], axis=1)
    return result_df

In [3]:
# voir les ligne 15 à 45
sub = movies[15:45]

In [4]:
sub

Unnamed: 0,movieId,title,genres
15,16,Casino (1995),Crime|Drama
16,17,Sense and Sensibility (1995),Drama|Romance
17,18,Four Rooms (1995),Comedy
18,19,Ace Ventura: When Nature Calls (1995),Comedy
19,20,Money Train (1995),Action|Comedy|Crime|Drama|Thriller
20,21,Get Shorty (1995),Comedy|Crime|Thriller
21,22,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller
22,23,Assassins (1995),Action|Crime|Thriller
23,24,Powder (1995),Drama|Sci-Fi
24,25,Leaving Las Vegas (1995),Drama|Romance


In [7]:
import re

sub['year'] = sub['title'].apply(lambda x: int(re.search(r'\((\d{4})\)(?!.*\(\d{4}\))', x).group(1)) if re.search(r'\((\d{4})\)(?!.*\(\d{4}\))', x) else None)
sub['title'] = sub['title'].apply(lambda x: re.sub(r'\s*\((\d{4})\)(?!.*\(\d{4}\))', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['year'] = sub['title'].apply(lambda x: int(re.search(r'\((\d{4})\)(?!.*\(\d{4}\))', x).group(1)) if re.search(r'\((\d{4})\)(?!.*\(\d{4}\))', x) else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['title'] = sub['title'].apply(lambda x: re.sub(r'\s*\((\d{4})\)(?!.*\(\d{4}\))', '', x))


In [8]:
sub

Unnamed: 0,movieId,title,genres,year
15,16,Casino,Crime|Drama,1995
16,17,Sense and Sensibility,Drama|Romance,1995
17,18,Four Rooms,Comedy,1995
18,19,Ace Ventura: When Nature Calls,Comedy,1995
19,20,Money Train,Action|Comedy|Crime|Drama|Thriller,1995
20,21,Get Shorty,Comedy|Crime|Thriller,1995
21,22,Copycat,Crime|Drama|Horror|Mystery|Thriller,1995
22,23,Assassins,Action|Crime|Thriller,1995
23,24,Powder,Drama|Sci-Fi,1995
24,25,Leaving Las Vegas,Drama|Romance,1995


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

def read_ratings(ratings_csv, data_dir="raw") -> pd.DataFrame:
    """
    Reads a ratings.csv from the data/raw folder.

    Parameters
    -------
    ratings_csv : str
        The csv file that will be read. Must be corresponding to a rating file.

    Returns
    -------
    pd.DataFrame
        The ratings DataFrame. Its columns are, in order:
        "userId", "movieId", "rating" and "timestamp".
    """
    data = pd.read_csv(os.path.join(data_dir, ratings_csv))

    temp = pd.DataFrame(LabelEncoder().fit_transform(data["movieId"]))
    data["movieId"] = temp
    return data


def read_movies(movies_csv, data_dir="raw") -> pd.DataFrame:
    """
    Lit un fichier movies.csv du dossier data/raw, le traite et extrait l'année du titre.

    Paramètres
    ----------
    movies_csv : str
        Le nom du fichier CSV à lire. Doit correspondre à un fichier de films.

    data_dir : str, optionnel
        Le répertoire contenant le fichier CSV. Par défaut "raw".

    Retourne
    --------
    pd.DataFrame
        Le DataFrame des films. Ses colonnes sont 'movieId', 'title', 'year', et des colonnes binaires
        représentant les genres de films. La colonne 'year' est extraite du titre et ajoutée au DataFrame.
    """
    # Lire le fichier CSV
    df = pd.read_csv(os.path.join(data_dir, movies_csv))

    # Extraire l'année du titre et créer une nouvelle colonne 'year'
    df["year"] = df["title"].str.extract(r"\((\d{4})\)$")
    df["title"] = df["title"].str.replace(r"\s*\(\d{4}\)$", "", regex=True)

    # Convertir la colonne 'year' en entier
    df["year"] = pd.to_numeric(df["year"], errors="coerce")

    # Sauvegarder le CSV en écrasant l'ancien
    df.to_csv(os.path.join(data_dir, movies_csv), index=False)

    # Diviser la colonne 'genres' en genres individuels
    genres = df["genres"].str.get_dummies(sep="|")

    # Concatenate the original movieId and title columns with the binary genre columns
    result_df = pd.concat([df[["movieId", "title"]], genres], axis=1)
    return result_df


def create_user_matrix(ratings, movies):
    # merge the 2 tables together
    movie_ratings = ratings.merge(movies, on="movieId", how="inner")

    display(movie_ratings.head())

    # Drop useless features
    movie_ratings = movie_ratings.drop(
        ["movieId", "timestamp", "title", "rating"], axis=1
    )

    # Calculate user_matrix
    user_matrix = movie_ratings.groupby("userId").agg(
        "mean",
    )

    return user_matrix




In [4]:

# read user_ratings and movies tables
user_ratings = read_ratings("ratings.csv")
display(user_ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,3.5,1112486027
1,1,28,3.5,1112484676
2,1,31,3.5,1112484819
3,1,46,3.5,1112484727
4,1,49,3.5,1112484580


In [5]:
movies = read_movies("movies.csv")
display(movies.head())

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
user_matrix = create_user_matrix(user_ratings, movies)
display(user_matrix.head())

Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,3.5,1112486027,Toy Story,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,28,3.5,1112484676,Persuasion,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,31,3.5,1112484819,Dangerous Minds,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,46,3.5,1112484727,How to Make an American Quilt,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,49,3.5,1112484580,When Night Is Falling,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.129412,0.123529,0.029412,0.082353,0.352941,0.141176,0.029412,0.505882,0.041176,0.017647,0.076471,0.005882,0.041176,0.047059,0.205882,0.076471,0.182353,0.041176,0.023529
2,0.0,0.083333,0.1,0.033333,0.066667,0.383333,0.033333,0.033333,0.483333,0.066667,0.016667,0.116667,0.0,0.066667,0.05,0.233333,0.083333,0.233333,0.05,0.016667
3,0.0,0.127072,0.088398,0.022099,0.055249,0.298343,0.071823,0.027624,0.475138,0.044199,0.027624,0.116022,0.0,0.038674,0.044199,0.132597,0.116022,0.176796,0.049724,0.01105
4,0.0,0.178571,0.142857,0.071429,0.071429,0.357143,0.035714,0.035714,0.571429,0.035714,0.0,0.035714,0.035714,0.107143,0.071429,0.107143,0.0,0.178571,0.035714,0.107143
5,0.0,0.142857,0.095238,0.079365,0.126984,0.380952,0.015873,0.063492,0.555556,0.063492,0.0,0.047619,0.015873,0.079365,0.063492,0.142857,0.031746,0.238095,0.031746,0.015873


In [None]:
movies = movies.drop("title", axis=1)
display(movies.head())

In [None]:
# movies.to_csv("data/processed/movie_matrix.csv", index=False)


In [None]:
# user_matrix.to_csv("data/processed/user_matrix.csv")
