In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import sklearn
from sklearn.preprocessing import StandardScaler

List of all genres

In [29]:
genres = ["Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "IMAX", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western", "(no genres listed)"]

----
## Processing movies data
Load movies to pandas dataframe

In [30]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [31]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


Extract year and remove it from title and convert genres to list

In [32]:
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)')
movies_df['title'] = movies_df['title'].str.replace(r'\((\d{4})\)', '', regex=True)
movies_df['genres'] = movies_df['genres'].str.split('|')

One hot encode the genres, this will allow for easier processing later

In [33]:
for genre in genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if genre in x else 0)
movies_df.drop(columns=['genres'], inplace=True)

In [34]:
movies_df.head()

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Mean is the percentage of movies in each genre

In [35]:
movies_df.drop(columns=['movieId']).describe()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
count,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0
mean,0.110384,0.061677,0.052715,0.051607,0.264018,0.079648,0.106902,0.390192,0.043969,0.00403,0.098807,0.002226,0.012091,0.045818,0.118388,0.056026,0.134989,0.026546,0.019364,0.080836
std,0.31337,0.24057,0.223464,0.221234,0.440811,0.27075,0.30899,0.487796,0.205027,0.063357,0.298404,0.047133,0.109293,0.209092,0.323069,0.229972,0.341714,0.160752,0.137802,0.272584
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


----
## Processing ratings data
Load ratings to pandas dataframe

In [36]:
ratings_df = pd.read_csv('../data/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


Drop timestamp since it is not needed for our analysis

In [37]:
ratings_df.drop(columns=['timestamp'], inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [38]:
ratings_df.drop(columns=['userId', 'movieId']).describe()

Unnamed: 0,rating
count,32000200.0
mean,3.540396
std,1.058986
min,0.5
25%,3.0
50%,3.5
75%,4.0
max,5.0


In [39]:
ratings_df_to_merge = ratings_df.drop(columns=['userId'])
ratings_df_to_merge = ratings_df_to_merge.groupby('movieId')['rating'].mean().reset_index()
merged_df = pd.merge(movies_df, ratings_df_to_merge, on='movieId', how='left')

In [40]:
merged_df.head(5)

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,3.897438
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.275758
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,3.139447
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,2.845331
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,3.059602


In [41]:
# get 25%, 50%, 75% percentiles
percentiles = merged_df['rating'].quantile([0.25, 0.75]).values.tolist()

In [42]:
merged_df['low_rating'] = merged_df['rating'].apply(lambda x: 1 if x <= percentiles[0] else 0)
merged_df['medium_rating'] = merged_df['rating'].apply(lambda x: 1 if percentiles[0] < x <= percentiles[1] else 0)
merged_df['high_rating'] = merged_df['rating'].apply(lambda x: 1 if percentiles[1] < x else 0)

In [43]:
merged_df.head()

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating,low_rating,medium_rating,high_rating
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,3.897438,0,0,1
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,3.275758,0,1,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,1,0,0,0,0,0,3.139447,0,1,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,1,0,0,0,0,0,2.845331,0,1,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,3.059602,0,1,0


In [44]:
movie_rec_df = merged_df.drop(columns=['title', 'year', 'rating'])
# set index to movieId
movie_rec_df.set_index('movieId', inplace=True)
movie_rec_df.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),low_rating,medium_rating,high_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


data normalization - standarization using StandardScaler

StandardScaler transforms features to have mean=0 and standard deviation=1


In [45]:
# Create a copy of dataframe for normalization
movie_rec_df_normalized = movie_rec_df.copy()

# Apply StandardScaler to all feature columns (genres)
scaler = StandardScaler()
feature_columns = genres + ['low_rating', 'medium_rating', 'high_rating']
movie_rec_df_normalized[feature_columns] = scaler.fit_transform(movie_rec_df[feature_columns])

# Display normalized data
movie_rec_df_normalized.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),low_rating,medium_rating,high_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.352251,3.900441,4.239117,4.286865,1.669618,-0.294179,-0.345974,-0.799913,4.662987,-0.063614,...,-0.219131,-0.36645,-0.24362,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,-1.010202,1.89452
2,-0.352251,3.900441,-0.235898,4.286865,-0.59894,-0.294179,-0.345974,-0.799913,4.662987,-0.063614,...,-0.219131,-0.36645,-0.24362,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838
3,-0.352251,-0.256381,-0.235898,-0.233271,1.669618,-0.294179,-0.345974,-0.799913,-0.214455,-0.063614,...,-0.219131,2.728885,-0.24362,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838
4,-0.352251,-0.256381,-0.235898,-0.233271,1.669618,-0.294179,-0.345974,1.250135,-0.214455,-0.063614,...,-0.219131,2.728885,-0.24362,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838
5,-0.352251,-0.256381,-0.235898,-0.233271,1.669618,-0.294179,-0.345974,-0.799913,-0.214455,-0.063614,...,-0.219131,-0.36645,-0.24362,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838


In [46]:
test_user_id = 1
test_user_ratings = ratings_df[ratings_df['userId'] == test_user_id].drop(columns=['userId'])
# remove first  row from the test_user_ratings dataframe
dropped_row = test_user_ratings.iloc[3]
dropped_movie_id = dropped_row['movieId']
print(dropped_movie_id)
test_user_ratings = test_user_ratings.drop(test_user_ratings.index[0])
test_user_ratings

30.0


Unnamed: 0,movieId,rating
1,25,1.0
2,29,2.0
3,30,5.0
4,32,5.0
5,34,2.0
...,...,...
136,2985,5.0
137,2997,4.0
138,3030,4.0
139,3078,2.0


In [47]:
test_user_movies = movie_rec_df_normalized.loc[test_user_ratings['movieId']]

# multiply the test_user_movies elements by the test_user_ratings elements
profile = test_user_movies.T.dot(test_user_ratings['rating'].values)
profile[['low_rating', 'medium_rating', 'high_rating']] = profile[['low_rating', 'medium_rating', 'high_rating']] / (2 *profile[['low_rating', 'medium_rating', 'high_rating']].sum())

# normalize all other values
norm = np.linalg.norm(profile)
for genre in genres:
    profile[genre] = profile[genre] / norm


In [48]:
test_user_id = 2
test_user_ratings = ratings_df[ratings_df['userId'] == test_user_id].drop(columns=['userId'])
# remove first  row from the test_user_ratings dataframe
dropped_row = test_user_ratings.iloc[3]
dropped_movie_id = dropped_row['movieId']
print(dropped_movie_id)
test_user_ratings = test_user_ratings.drop(test_user_ratings.index[0])
test_user_movies = movie_rec_df_normalized.loc[test_user_ratings['movieId']]

# multiply the test_user_movies elements by the test_user_ratings elements
profile_2 = test_user_movies.T.dot(test_user_ratings['rating'].values)
profile_2[['low_rating', 'medium_rating', 'high_rating']] = profile[['low_rating', 'medium_rating', 'high_rating']] / (2 *profile_2[['low_rating', 'medium_rating', 'high_rating']].sum())

# normalize all other values
norm = np.linalg.norm(profile)
for genre in genres:
    profile_2[genre] = profile_2[genre] / norm

48.0


In [49]:
profile

Action                0.112658
Adventure             0.178395
Animation            -0.175287
Children             -0.125741
Comedy                0.135043
Crime                 0.125856
Documentary          -0.242476
Drama                 0.509554
Fantasy              -0.012623
Film-Noir             0.071437
Horror               -0.190594
IMAX                 -0.035100
Musical              -0.082205
Mystery               0.103345
Romance               0.253824
Sci-Fi                0.257203
Thriller             -0.047032
War                   0.560367
Western              -0.028008
(no genres listed)   -0.220359
low_rating           -1.016473
medium_rating        -1.449714
high_rating           2.966186
dtype: float64

In [50]:
# build ranking based on the profile
ranking = movie_rec_df_normalized.copy()
ranking['score'] = 0
for genre in genres:
    ranking['score'] += ranking[genre] * profile[genre]
for rating in ['low_rating', 'medium_rating', 'high_rating']:
    ranking['score'] += ranking[rating] * profile[rating]

ranking = ranking.sort_values(by='score', ascending=False)

In [51]:
ranking

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),low_rating,medium_rating,high_rating,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26236,2.838885,3.900441,-0.235898,-0.233271,1.669618,-0.294179,-0.345974,1.250135,-0.214455,-0.063614,...,2.728885,-0.243620,-0.395037,6.055656,-0.140522,-0.296555,-0.563527,-1.010202,1.894520,13.813112
212761,2.838885,-0.256381,-0.235898,-0.233271,-0.598940,3.399292,-0.345974,1.250135,-0.214455,-0.063614,...,2.728885,-0.243620,-0.395037,6.055656,-0.140522,-0.296555,-0.563527,-1.010202,1.894520,13.230046
897,-0.352251,3.900441,-0.235898,-0.233271,-0.598940,-0.294179,-0.345974,1.250135,-0.214455,-0.063614,...,2.728885,-0.243620,-0.395037,6.055656,-0.140522,-0.296555,-0.563527,-1.010202,1.894520,13.147254
75994,-0.352251,3.900441,-0.235898,-0.233271,-0.598940,-0.294179,-0.345974,1.250135,-0.214455,-0.063614,...,2.728885,-0.243620,-0.395037,6.055656,-0.140522,-0.296555,-0.563527,-1.010202,1.894520,13.147254
140295,2.838885,3.900441,4.239117,-0.233271,-0.598940,-0.294179,-0.345974,1.250135,-0.214455,-0.063614,...,-0.366450,4.104752,-0.395037,6.055656,-0.140522,-0.296555,-0.563527,-1.010202,1.894520,13.055090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4456,-0.352251,-0.256381,4.239117,-0.233271,-0.598940,-0.294179,-0.345974,-0.799913,-0.214455,-0.063614,...,-0.366450,-0.243620,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838,-5.163098
72131,-0.352251,-0.256381,-0.235898,-0.233271,-0.598940,-0.294179,2.890393,-0.799913,-0.214455,-0.063614,...,-0.366450,-0.243620,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838,-5.276869
58879,-0.352251,-0.256381,-0.235898,-0.233271,-0.598940,-0.294179,2.890393,-0.799913,-0.214455,-0.063614,...,-0.366450,-0.243620,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838,-5.276869
4457,-0.352251,-0.256381,-0.235898,-0.233271,-0.598940,-0.294179,2.890393,-0.799913,-0.214455,-0.063614,...,-0.366450,-0.243620,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838,-5.276869


In [52]:
recommended_movies = ranking.join(movies_df.set_index('movieId'), on='movieId', how='left', rsuffix='_movie')
recommended_movies = recommended_movies[['title', 'year', 'score']]
recommended_movies.head(10)

Unnamed: 0_level_0,title,year,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26236,"White Sun of the Desert, The (Beloe solntse pu...",1970,13.813112
212761,Kawashima Yoshiko: The Last Princess of Manchu...,1990,13.230046
897,For Whom the Bell Tolls,1943,13.147254
75994,"Buccaneer, The",1958,13.147254
140295,Mobile Suit Gundam III: Encounters in Space,1982,13.05509
49530,Blood Diamond,2006,13.048297
8690,Slaughterhouse-Five,1972,13.044791
53226,"Great War, The (Grande guerra, La)",1959,13.027441
2890,Three Kings,1999,13.027441
7215,To Have and Have Not,1944,13.009618


In [53]:
# get dropped movie position in the ranking
dropped_movie_position = ranking[ranking.index == dropped_movie_id]
dropped_movie_position

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),low_rating,medium_rating,high_rating,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48,-0.352251,-0.256381,4.239117,4.286865,-0.59894,-0.294179,-0.345974,1.250135,-0.214455,-0.063614,...,2.728885,-0.24362,-0.395037,-0.165135,-0.140522,-0.296555,-0.563527,0.989901,-0.527838,-3.269905
