In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import sklearn



List of all genres

In [3]:
genres = ["Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "IMAX", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western", "(no genres listed)"]

----
## Processing movies data
Load movies to pandas dataframe

In [4]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


Extract year and remove it from title and convert genres to list

In [6]:
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)')
movies_df['title'] = movies_df['title'].str.replace(r'\((\d{4})\)', '', regex=True)
movies_df['genres'] = movies_df['genres'].str.split('|')

One hot encode the genres, this will allow for easier processing later

In [7]:
for genre in genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if genre in x else 0)
movies_df.drop(columns=['genres'], inplace=True)

In [8]:
movies_df.head()

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Mean is the percentage of movies in each genre

In [9]:
movies_df.drop(columns=['movieId']).describe()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
count,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0,87585.0
mean,0.110384,0.061677,0.052715,0.051607,0.264018,0.079648,0.106902,0.390192,0.043969,0.00403,0.098807,0.002226,0.012091,0.045818,0.118388,0.056026,0.134989,0.026546,0.019364,0.080836
std,0.31337,0.24057,0.223464,0.221234,0.440811,0.27075,0.30899,0.487796,0.205027,0.063357,0.298404,0.047133,0.109293,0.209092,0.323069,0.229972,0.341714,0.160752,0.137802,0.272584
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


----
## Processing ratings data
Load ratings to pandas dataframe

In [10]:
ratings_df = pd.read_csv('../data/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


Drop timestamp since it is not needed for our analysis

In [11]:
ratings_df.drop(columns=['timestamp'], inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [12]:
ratings_df.drop(columns=['userId', 'movieId']).describe()

Unnamed: 0,rating
count,32000200.0
mean,3.540396
std,1.058986
min,0.5
25%,3.0
50%,3.5
75%,4.0
max,5.0


In [13]:
ratings_df_to_merge = ratings_df.drop(columns=['userId'])
ratings_df_to_merge = ratings_df_to_merge.groupby('movieId')['rating'].mean().reset_index()
merged_df = pd.merge(movies_df, ratings_df_to_merge, on='movieId', how='left')

In [14]:
merged_df.head(5)

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,3.897438
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.275758
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,3.139447
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,2.845331
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,3.059602


In [28]:
# get 25%, 50%, 75% percentiles
percentiles = merged_df['rating'].quantile([0.25, 0.75]).values.tolist()

In [29]:
merged_df['low_rating'] = merged_df['rating'].apply(lambda x: 1 if x <= percentiles[0] else 0)
merged_df['medium_rating'] = merged_df['rating'].apply(lambda x: 1 if percentiles[0] < x <= percentiles[1] else 0)
merged_df['high_rating'] = merged_df['rating'].apply(lambda x: 1 if percentiles[1] < x else 0)

In [30]:
merged_df.head()

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating,low_rating,medium_rating,high_rating
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,3.897438,0,0,1
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,3.275758,0,1,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,1,0,0,0,0,0,3.139447,0,1,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,1,0,0,0,0,0,2.845331,0,1,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,3.059602,0,1,0


In [33]:
movie_rec_df = merged_df.drop(columns=['title', 'year', 'rating'])
# set index to movieId
movie_rec_df.set_index('movieId', inplace=True)
movie_rec_df.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),low_rating,medium_rating,high_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [101]:
test_user_id = 1
test_user_ratings = ratings_df[ratings_df['userId'] == test_user_id].drop(columns=['userId'])
# remove first  row from the test_user_ratings dataframe
dropped_row = test_user_ratings.iloc[3]
dropped_movie_id = dropped_row['movieId']
print(dropped_movie_id)
test_user_ratings = test_user_ratings.drop(test_user_ratings.index[0])
test_user_ratings

30.0


Unnamed: 0,movieId,rating
1,25,1.0
2,29,2.0
3,30,5.0
4,32,5.0
5,34,2.0
...,...,...
136,2985,5.0
137,2997,4.0
138,3030,4.0
139,3078,2.0


In [102]:
test_user_movies = movie_rec_df.loc[test_user_ratings['movieId']]

# multiply the test_user_movies elements by the test_user_ratings elements
profile = test_user_movies.T.dot(test_user_ratings['rating'].values)
profile[['low_rating', 'medium_rating', 'high_rating']] = profile[['low_rating', 'medium_rating', 'high_rating']] / (2 *profile[['low_rating', 'medium_rating', 'high_rating']].sum())

# normalize all other values
norm = np.linalg.norm(profile)
for genre in genres:
    profile[genre] = profile[genre] / norm


In [103]:
profile

Action                0.175195
Adventure             0.132520
Animation             0.000000
Children              0.015723
Comedy                0.381836
Crime                 0.139258
Documentary           0.006738
Drama                 0.804102
Fantasy               0.044922
Film-Noir             0.011230
Horror                0.024707
IMAX                  0.000000
Musical               0.000000
Mystery               0.083106
Romance               0.253809
Sci-Fi                0.150488
Thriller              0.125781
War                   0.163965
Western               0.015723
(no genres listed)    0.000000
low_rating            0.000000
medium_rating         0.051619
high_rating           0.448381
dtype: float64

In [104]:
# build ranking based on the profile
ranking = movie_rec_df.copy()
ranking['score'] = 0
for genre in genres:
    ranking['score'] += ranking[genre] * profile[genre]
for rating in ['low_rating', 'medium_rating', 'high_rating']:
    ranking['score'] += ranking[rating] * profile[rating]

ranking = ranking.sort_values(by='score', ascending=False)

In [105]:
ranking

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),low_rating,medium_rating,high_rating,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26236,1,1,0,0,1,0,0,1,0,0,...,1,0,0,1,0,0,0,0,1,2.359808
172197,1,0,0,0,1,1,0,1,0,0,...,1,0,1,0,0,0,0,0,1,2.328363
4956,1,1,0,0,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,1,2.321624
27344,1,1,1,0,1,0,0,1,1,0,...,1,0,0,0,0,0,0,0,1,2.240765
290347,0,0,1,0,1,1,0,1,0,0,...,1,1,0,0,0,0,0,0,1,2.177874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153855,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0.000000
153812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0.000000
153470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.000000
278128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0.000000


In [106]:
recommended_movies = ranking.join(movies_df.set_index('movieId'), on='movieId', how='left', rsuffix='_movie')
recommended_movies = recommended_movies[['title', 'year', 'score']]
recommended_movies.head(10)

Unnamed: 0_level_0,title,year,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26236,"White Sun of the Desert, The (Beloe solntse pu...",1970,2.359808
172197,Under New Management,2009,2.328363
4956,"Stunt Man, The",1980,2.321624
27344,Revolutionary Girl Utena: Adolescence of Utena...,1999,2.240765
290347,We Are Still Here,2022,2.177874
6902,Interstate 60,2002,2.171136
226208,Labou,2009,2.153167
157565,Rajathandhiram,2015,2.153167
1912,Out of Sight,1998,2.153167
7835,Song of the Thin Man,1947,2.110491


In [107]:
# get dropped movie position in the ranking
dropped_movie_position = ranking[ranking.index == dropped_movie_id]
dropped_movie_position

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),low_rating,medium_rating,high_rating,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1.391741
