In [34]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px

In [35]:
movies, ratings = pd.read_csv('../data/movies.csv'), pd.read_csv('../data/ratings.csv')

movies_df = movies.copy()
ratings_df = ratings.copy()

In [36]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [37]:
movies['year']  = movies['title'].str.extract(r'\((\d{4})\)')

movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [38]:
genres_df = movies['genres'].str.get_dummies('|')

# calculate the most common genres for each movie
movies['most_common_genre'] = genres_df.apply(lambda x: x.idxmax(), axis=1)

# show the resulting DataFrame
movies.head()

Unnamed: 0,movieId,title,genres,year,most_common_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Adventure
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Comedy
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Comedy
4,5,Father of the Bride Part II (1995),Comedy,1995,Comedy


In [39]:
# movies = movies.dropna(subset=['year'], how='any')

In [40]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,27753440.0,27753440.0,27753440.0,27753440.0
mean,141942.0,18488.0,3.530445,1193122000.0
std,81707.4,35102.63,1.066353,216048200.0
min,1.0,1.0,0.5,789652000.0
25%,71176.0,1097.0,3.0,998605300.0
50%,142022.0,2716.0,3.5,1174256000.0
75%,212459.0,7150.0,4.0,1422744000.0
max,283228.0,193886.0,5.0,1537945000.0


In [41]:
movies.describe()

Unnamed: 0,movieId
count,58098.0
mean,111919.516197
std,59862.660956
min,1.0
25%,72437.75
50%,126549.0
75%,161449.5
max,193886.0


In [42]:
movies_by_year = movies.groupby('year').agg({'title':'count'}).reset_index()
movies_by_year.head()

Unnamed: 0,year,title
0,1874,1
1,1878,1
2,1883,1
3,1887,1
4,1888,4


In [43]:
import plotly.graph_objects as go
import plotly_express as px

plot = go.Figure(data=[go.Bar(x=movies_by_year["year"], y=movies_by_year["title"])])

plot.update_layout(
    xaxis = dict(
        rangeselector=dict(
            buttons=list(
                [
                    dict(count=1, step="year", stepmode="backward"),
                ]
            )
        ),
        rangeslider=dict(visible=True),
        title='Year'
    ),
    yaxis = dict(
        range=[0, 3000],
        title='Movie count'
        ),
    title='Total amount of movies per year over time'
)

plot.show()

In [44]:
movies['year'] = movies['year'].astype(int)

ValueError: cannot convert float NaN to integer

In [None]:
movies_cleaned = movies[movies['year'] >= 2000 ]
movies_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30204 entries, 2684 to 58097
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   movieId            30204 non-null  int64 
 1   title              30204 non-null  object
 2   genres             30204 non-null  object
 3   year               30204 non-null  int64 
 4   most_common_genre  30204 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.4+ MB


In [None]:
# movies_cleaned

In [None]:
# #count the number of unique movieid values for each userid
# counts = ratings.groupby('userId')['movieId'].nunique()

# # filter out rows where the userid has ratings in all the movieid columns
# ratings = ratings[~ratings['userId'].isin(counts[counts > 20].index)]


In [None]:
# ratings = ratings[(ratings['rating'] % 1 == 0)&(ratings['rating'] >= 3) & (ratings['rating'] < 5)]
# # ratings = ratings[(ratings['rating'] >= 2) & (ratings['rating'] < 5)]

# ratings.info()


In [None]:
# new_movie_set = movies_cleaned[movies_cleaned['movieId'].isin(ratings['movieId'])]
new_movie_set = movies[movies['movieId'].isin(ratings['movieId'])]


In [None]:
data = pd.merge(ratings, new_movie_set, on='movieId')


In [None]:
movies.loc[:, 'title_no_year'] = movies['title'].apply(lambda x: x.split("(")[0].rstrip())

In [None]:
# import numpy as np
# import pandas as pd



# user_ids = np.unique(ratings_df['userId'])
# movie_ids = np.unique(ratings_df['movieId'])


# pivot_table = np.zeros((len(movie_ids), len(user_ids)))


# for index, row in ratings_df.iterrows():
#     user_index = np.where(user_ids == row['userId'])[0][0]
#     movie_index = np.where(movie_ids == row['movieId'])[0][0]
#     pivot_table[movie_index, user_index] = row['rating']


# pivot_table_df = pd.DataFrame(pivot_table, index=movie_ids, columns=user_ids)


# print(pivot_table_df)


In [None]:
# ratings_fea =  ratings.set_index(['movieId', 'userId'])['rating'].unstack('userId')

In [None]:
# ratings_features = ratings.pivot(columns='userId', index='movieId', values='rating')
# ratings_features = ratings_features.apply(lambda x: x.fillna(0), axis=1)

# ratings_features

In [None]:
# matrix_movies_users = csr_matrix(ratings_features.values)

In [None]:
# matrix_movies_users

---

## 1.3) Recommender system

In [None]:
# model_KNN = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
# model_KNN.fit(ratings_features)

In [None]:
movies

Unnamed: 0,movieId,title,genres,year,most_common_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Adventure
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Comedy
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Comedy
4,5,Father of the Bride Part II (1995),Comedy,1995,Comedy
...,...,...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed),1946,(no genres listed)
58094,193878,Les tribulations d'une caissière (2011),Comedy,2011,Comedy
58095,193880,Her Name Was Mumu (2016),Drama,2016,Drama
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,2017,Adventure


In [None]:
idx = process.extractOne('lord of the rings', movies['title'])
idx

('Lord of the Rings, The (1978)', 90, 2033)

In [None]:
idx = process.extractOne('star wars', movies['title'])
idx

('Star Wars: Episode IV - A New Hope (1977)', 90, 257)

In [None]:
movies_df[movies_df['title'] == 'Star Wars: Episode IV - A New Hope (1977)']

Unnamed: 0,movieId,title,genres
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi


In [203]:
def get_movie(word):
    movie = process.extractOne(word, movies['title'])
    return [movie[0], movie[1]]

movie = get_movie('the fellowship of the ring')
movie

['Lord of the Rings: The Fellowship of the Ring, The (2001)', 90]

In [204]:
def get_movie_Id():
    cleaned = movies[movies['title'] == movie[0]]
    return cleaned['movieId'].values[0]
    
get_movie_Id()

4993

In [206]:
def clean_movies():
    cleaned = movies[movies['title'] == movie[0]]
    genre = cleaned['most_common_genre']
    
    df = movies[movies['most_common_genre'] == genre.values[0]]
    
    return df
clean = clean_movies()
clean.head()

Unnamed: 0,movieId,title,genres,year,most_common_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Adventure
2,8,Tom and Huck (1995),Adventure|Children,1995,Adventure
3,13,Balto (1995),Adventure|Animation|Children,1995,Adventure
4,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,1995,Adventure


In [181]:
# def process_movies():
#     movie_row = movies[movies['title'] == movie[0]]
#     common_genres = movie_row['genres'].str.split('|').str[:2].tolist()

#     # df = movies[(movies['most_common_genre'] == common_genres[0][0])  (movies['most_common_genre'] == common_genres[0][1])]
#     df = movies[(movies['most_common_genre'] == common_genres[0][0]) | (movies['most_common_genre'] == common_genres[0][1])]

#     return df
# clean = process_movies()
# clean

Unnamed: 0,movieId,title,genres,year,most_common_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Adventure
7,8,Tom and Huck (1995),Adventure|Children,1995,Adventure
12,13,Balto (1995),Adventure|Animation|Children,1995,Adventure
28,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,1995,Adventure
...,...,...,...,...,...
58023,193731,The Fencing Master (1992),Adventure|Thriller,1992,Adventure
58029,193743,Sunshine Girl and The Hunt For Black Eyed Kids...,Adventure|Horror|Thriller,2012,Adventure
58044,193773,Bloody Spear at Mount Fuji (1955),Adventure|Drama,1955,Adventure
58046,193777,Almost Home (2014),Adventure|Animation|Sci-Fi,2014,Adventure


In [216]:
def clean_ratings():
    rate = ratings[ratings['movieId'].isin(clean['movieId'])]
    return rate
rate = clean_ratings()

In [217]:
ratings_features = rate.pivot(columns='userId', index='movieId', values='rating').fillna(0)
# ratings_features = ratings_features.apply(lambda x: x.fillna(0), axis=1)

ratings_features

userId,3,4,5,6,10,11,12,13,14,15,...,283217,283218,283219,283220,283221,283222,283224,283226,283227,283228
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.5,4.0,...,5.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [218]:
matrix_movies_users = csr_matrix(ratings_features.values)

In [219]:
model_KNN = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
model_KNN.fit(ratings_features)

In [220]:
movieId = get_movie_Id()
movieId

4993

In [221]:
movie_id = process.extractOne('Lord of the Rings: The Fellowship of the Ring, The (2001)', clean['title'])


In [222]:
movie_id

('Lord of the Rings: The Fellowship of the Ring, The (2001)', 100, 290)

In [223]:
row_idx = ratings_features.index.get_loc(4993)
row = matrix_movies_users[row_idx, :]
row

<1x234012 sparse matrix of type '<class 'numpy.float64'>'
	with 61883 stored elements in Compressed Sparse Row format>

In [224]:
row_idx

290

In [225]:
ratings_features.iloc[290]

userId
3         0.0
4         3.5
5         0.0
6         0.0
10        0.0
         ... 
283222    4.5
283224    0.0
283226    0.0
283227    0.0
283228    0.0
Name: 4993, Length: 234012, dtype: float64

In [226]:
ratings_features.iloc[343]

userId
3         0.0
4         4.0
5         0.0
6         0.0
10        0.0
         ... 
283222    4.5
283224    0.0
283226    0.0
283227    0.0
283228    0.0
Name: 5952, Length: 234012, dtype: float64

In [227]:
clean.iloc[343]

movieId                                                       5952
title                Lord of the Rings: The Two Towers, The (2002)
genres                                           Adventure|Fantasy
year                                                          2002
most_common_genre                                        Adventure
Name: 343, dtype: object

In [228]:
ratings_features.loc[4993]

userId
3         0.0
4         3.5
5         0.0
6         0.0
10        0.0
         ... 
283222    4.5
283224    0.0
283226    0.0
283227    0.0
283228    0.0
Name: 4993, Length: 234012, dtype: float64

In [229]:
def recommender_system(movie_name, dataframe, model, number_recommendations):
    ind = []
    movie_id = process.extractOne(movie_name, clean['title'])[1]
    movie_idx = process.extractOne(movie_name, clean['title'])[2]
  
    print('Movie Selected: ', clean['title'][movie_idx], 'Id: ',movie_id)
    print('Searching for recommendation....')

    row_idx = ratings_features.index.get_loc(4993)
    row = matrix_movies_users[row_idx, :]
    
    distances, indices = model.kneighbors(dataframe[row_idx, :], n_neighbors=number_recommendations+1)
    selected = indices[0]
    selected = selected[selected != row_idx]

    selected_movies = clean.iloc[selected]

    
    return selected_movies


In [230]:

recommendations = recommender_system('the fellowship of the ring', matrix_movies_users,model_KNN, 20)
recommendations

Movie Selected:  Lord of the Rings: The Fellowship of the Ring, The (2001) Id:  90
Searching for recommendation....


Unnamed: 0,movieId,title,genres,year,most_common_genre
343,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,2002,Adventure
250,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,2001,Adventure
359,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,2003,Adventure
276,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,2001,Adventure
277,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,2001,Adventure
428,8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX,2004,Adventure
76,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,1985,Adventure
334,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy,2002,Adventure
70,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,1975,Adventure
713,60007,Doctor Who (1996),Adventure|Sci-Fi,1996,Adventure


In [None]:
movies_df.loc[6768]

movieId                          6877
title      Girls Will Be Girls (2003)
genres                         Comedy
Name: 6768, dtype: object

In [None]:
# def recommender(movie_name, data, model, n_recommendations):
#     model.fit(data)
#     idx = process.extractOne(movie_name,movies['title'])[2]
#     # print(f"Movie selected: {df_movies['title'][idx]}, Index selected: {idx}")
#     print('Movie Selected: ', movies['title'][idx], 'Index: ',idx)
#     print('Searching for recomendation....')
#     distances, indices = model.kneighbors(movies_df[idx], n_neighbors=n_recommendations)
#     for i in indices:
#         print(indices)
#         print(movies_df['title'][i].where(i!=idx))
    

# recommendations = recommender('toy story', matrix_movies_users,model_KNN, 5)
# recommendations

In [None]:
# def recommender_system(movie_name, dataframe, model, number_recommendations):
#     ind = []
#     model.fit(dataframe)
#     idx = process.extractOne(movie_name, movies['title'])[2]
#     # print(f"Movie selected: {df_movies['title'][idx]}, Index selected: {idx}")

#     print('Movie Selected: ', movies['title'][idx], 'Index: ',idx)
#     print('Searching for recomendation....')
#     distances, indices = model.kneighbors(dataframe[idx], n_neighbors=number_recommendations)
#     for i in indices:
#         ind.append(movies['title'][i].where(i!=idx).index)
   
#     selected = pd.Index(ind[0])
#     selected_movies = movies.loc[selected]
#     print(ind)
    
#     return selected_movies 




In [None]:
# recommendations = recommender_system('toy story', matrix_movies_users,model_KNN, 10)
# recommendations