In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
ratings = pd.read_csv('../data/raw/ratings.csv')
movies = pd.read_csv('../data/raw/movies.csv')

display(ratings.head())
display(movies.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [3]:
movies.isna().sum()

movieId      0
title        0
genres       0
year       229
dtype: int64

In [4]:
movies.dtypes

movieId      int64
title       object
genres      object
year       float64
dtype: object

In [5]:
movies['year'] = pd.to_numeric(movies['year'], errors='coerce').astype('Int64')
movies.dtypes


movieId     int64
title      object
genres     object
year        Int64
dtype: object

In [7]:
movies.to_csv('../data/processed/movies.csv', index=False)

In [8]:
movies_processed = pd.read_csv('../data/processed/movies.csv')
movies_processed.head()
movies_processed.dtypes


movieId      int64
title       object
genres      object
year       float64
dtype: object

In [10]:
links = pd.read_csv('../data/raw/links.csv')
links.head()


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [17]:
genome_scores = pd.read_csv('../data/raw/genome-scores.csv')
display(genome_scores.head())
display(genome_scores.dtypes)


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


movieId        int64
tagId          int64
relevance    float64
dtype: object

In [18]:
genome_scores = pd.read_csv('../data/raw/genome-scores.csv', dtype={"movieId": "int64", "tagId": "int64", "relevance": "float64"})
display(genome_scores.head())
display(genome_scores.dtypes)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


movieId        int64
tagId          int64
relevance    float64
dtype: object

In [19]:
# sum of relevance
genome_scores["relevance"].sum()


np.float64(1363992.6214999978)

In [20]:
# nb de ligne ou relevance est egal a 0
genome_scores[genome_scores["relevance"] == 0].shape[0]


0

In [21]:
genome_scores.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 268.0 MB


In [7]:
# convert the timestamp to datetime
# userId	movieId	rating	timestamp
# 0	1	2	3.5	1112486027
# 1	1	29	3.5	1112484676
# 2	1	32	3.5	1112484819
# 3	1	47	3.5	1112484727
# 4	1	50	3.5	1112484580

# unit does not seem to be seconds
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")

# 20 lignes au hasard
ratings.sample(20)


Unnamed: 0,userId,movieId,rating,timestamp
12792112,88407,1658,4.0,2001-08-10 04:15:42
9266741,64060,6636,4.5,2003-10-22 05:44:37
9382633,64901,2985,4.5,2009-12-07 16:31:59
12013417,82938,266,5.0,1996-05-20 14:15:17
7745084,53351,898,5.0,1999-10-03 09:53:27
16150162,111793,2353,4.0,2006-07-22 21:51:07
11547327,79703,8337,4.0,2004-11-17 17:01:10
17354239,120006,57368,3.5,2014-09-13 22:27:45
9021764,62380,32587,3.5,2008-03-05 23:53:23
94209,654,1200,3.0,2000-05-22 17:20:11


In [22]:
genres = movies["genres"].str.get_dummies(sep="|")

new_movies = pd.concat([movies[["movieId", "title"]], genres], axis=1)

new_movies.head()


Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# merge the 2 tables together
movie_ratings = ratings.merge(new_movies, on="movieId", how="inner")

movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,2,3.5,1112486027,Jumanji (1995),0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,29,3.5,1112484676,"City of Lost Children, The (CitÃ© des enfants ...",0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
2,1,32,3.5,1112484819,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0
3,1,47,3.5,1112484727,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,3.5,1112484580,"Usual Suspects, The (1995)",0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [31]:
movie_ratings.columns


Index(['userId', 'movieId', 'rating', 'timestamp', 'title',
       '(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [29]:
# Calculate user_matrix
user_matrix = movie_ratings.drop(
        ["movieId", "timestamp", "title", "rating"], axis=1
    ).groupby(
        "userId"
    ).agg(
        "mean"
    )

user_matrix.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.377143,0.417143,0.057143,0.108571,0.234286,0.12,0.0,0.245714,0.394286,0.0,0.257143,0.011429,0.017143,0.102857,0.062857,0.228571,0.24,0.051429,0.022857
2,0.0,0.311475,0.278689,0.016393,0.016393,0.163934,0.016393,0.0,0.311475,0.016393,0.016393,0.295082,0.016393,0.032787,0.065574,0.098361,0.377049,0.311475,0.065574,0.032787
3,0.0,0.326203,0.26738,0.02139,0.053476,0.278075,0.112299,0.005348,0.31016,0.106952,0.005348,0.171123,0.0,0.032086,0.058824,0.085561,0.497326,0.26738,0.032086,0.016043
4,0.0,0.464286,0.214286,0.071429,0.142857,0.392857,0.214286,0.0,0.285714,0.107143,0.0,0.0,0.0,0.071429,0.107143,0.142857,0.178571,0.464286,0.035714,0.035714
5,0.0,0.272727,0.318182,0.090909,0.166667,0.363636,0.106061,0.0,0.409091,0.166667,0.0,0.015152,0.045455,0.121212,0.030303,0.242424,0.151515,0.227273,0.015152,0.030303


In [33]:
user_matrix.describe()


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
count,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0,138493.0
mean,9e-06,0.294642,0.237483,0.060892,0.089467,0.370534,0.172769,0.008376,0.439358,0.105169,0.009364,0.063853,0.027612,0.045306,0.075188,0.197186,0.159425,0.279579,0.055962,0.021902
std,0.000574,0.140114,0.108887,0.062554,0.079895,0.126901,0.08138,0.019349,0.134915,0.064291,0.021865,0.070444,0.037702,0.047525,0.048622,0.098628,0.106614,0.119516,0.043425,0.026178
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.192308,0.16,0.022222,0.039216,0.28777,0.118006,0.0,0.347826,0.0625,0.0,0.026455,0.0,0.01548,0.043988,0.131579,0.091463,0.196347,0.030303,0.0
50%,0.0,0.285714,0.227273,0.047619,0.073171,0.361582,0.165605,0.0,0.428571,0.095238,0.0,0.048077,0.015152,0.035853,0.06993,0.183908,0.142202,0.272727,0.048951,0.016736
75%,0.0,0.382979,0.30303,0.082734,0.117647,0.44,0.218487,0.010417,0.52381,0.135965,0.011628,0.081633,0.040816,0.061538,0.098592,0.246575,0.2,0.35443,0.075,0.033333
max,0.1,0.96,0.928571,1.0,1.0,1.0,0.956522,0.942857,1.0,0.793427,0.772727,1.0,0.607143,0.952381,0.787879,1.0,1.0,0.956522,0.91689,0.822222


In [49]:
import numpy as np

genres = "01100000000000001000"
genres = [int(genre) for genre in genres]
genres1 = np.array(genres)
genres2 = np.array(genres).reshape(-1, 1)
genres3 = np.array(genres).reshape(1, -1)


print(genres1)
print(genres2)
print(genres3)

[0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
[[0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]]
[[0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]


In [46]:
users = pd.read_csv('../data/processed/user_matrix.csv')
users.head()

Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,0.129412,0.123529,0.029412,0.082353,0.352941,0.141176,0.029412,0.505882,...,0.017647,0.076471,0.005882,0.041176,0.047059,0.205882,0.076471,0.182353,0.041176,0.023529
1,2,0.0,0.083333,0.1,0.033333,0.066667,0.383333,0.033333,0.033333,0.483333,...,0.016667,0.116667,0.0,0.066667,0.05,0.233333,0.083333,0.233333,0.05,0.016667
2,3,0.0,0.127072,0.088398,0.022099,0.055249,0.298343,0.071823,0.027624,0.475138,...,0.027624,0.116022,0.0,0.038674,0.044199,0.132597,0.116022,0.176796,0.049724,0.01105
3,4,0.0,0.178571,0.142857,0.071429,0.071429,0.357143,0.035714,0.035714,0.571429,...,0.0,0.035714,0.035714,0.107143,0.071429,0.107143,0.0,0.178571,0.035714,0.107143
4,5,0.0,0.142857,0.095238,0.079365,0.126984,0.380952,0.015873,0.063492,0.555556,...,0.0,0.047619,0.015873,0.079365,0.063492,0.142857,0.031746,0.238095,0.031746,0.015873


In [47]:
users_id = [1]
users = users[users["userId"].isin(users_id)]
users = users.drop("userId", axis=1)

users

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.129412,0.123529,0.029412,0.082353,0.352941,0.141176,0.029412,0.505882,0.041176,0.017647,0.076471,0.005882,0.041176,0.047059,0.205882,0.076471,0.182353,0.041176,0.023529


In [50]:
columns = ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
genres_df = pd.DataFrame(genres3, columns=columns)
genres_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
