In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px

movies, ratings = pd.read_csv('../data/movies.csv'), pd.read_csv('../data/ratings.csv')


In [3]:
movies_df = movies.assign(genres=movies['genres'].str.split('|')).explode('genres')


In [4]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
58096,193882,Flora (2017),Horror
58096,193882,Flora (2017),Sci-Fi
58097,193886,Leal (2018),Action
58097,193886,Leal (2018),Crime


In [5]:
movies_df[movies_df['title'] =='Lord of the Rings: The Fellowship of the Ring, The (2001)']

Unnamed: 0,movieId,title,genres
4898,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure
4898,4993,"Lord of the Rings: The Fellowship of the Ring,...",Fantasy


In [6]:
movies_df.drop_duplicates()
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106107 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   movieId  106107 non-null  int64 
 1   title    106107 non-null  object
 2   genres   106107 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.2+ MB


In [7]:
movies_df['year']  = movies_df['title'].str.extract(r'\((\d{4})\)')

In [8]:
movies_df = movies_df.dropna(subset=['year', 'genres'], how='any')

In [9]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105711 entries, 0 to 58097
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   movieId  105711 non-null  int64 
 1   title    105711 non-null  object
 2   genres   105711 non-null  object
 3   year     105711 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.0+ MB


In [10]:
movies_df.loc[:, 'year'] = movies_df['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df.loc[:, 'year'] = movies_df['year'].astype(int)
  movies_df.loc[:, 'year'] = movies_df['year'].astype(int)


In [11]:
movies_df

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure,1995
0,1,Toy Story (1995),Animation,1995
0,1,Toy Story (1995),Children,1995
0,1,Toy Story (1995),Comedy,1995
0,1,Toy Story (1995),Fantasy,1995
...,...,...,...,...
58096,193882,Flora (2017),Horror,2017
58096,193882,Flora (2017),Sci-Fi,2017
58097,193886,Leal (2018),Action,2018
58097,193886,Leal (2018),Crime,2018


In [12]:
movies_df['title_no_year'] = movies['title'].apply(lambda x: x.split("(")[0].rstrip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['title_no_year'] = movies['title'].apply(lambda x: x.split("(")[0].rstrip())


In [13]:
movies_df

Unnamed: 0,movieId,title,genres,year,title_no_year
0,1,Toy Story (1995),Adventure,1995,Toy Story
0,1,Toy Story (1995),Animation,1995,Toy Story
0,1,Toy Story (1995),Children,1995,Toy Story
0,1,Toy Story (1995),Comedy,1995,Toy Story
0,1,Toy Story (1995),Fantasy,1995,Toy Story
...,...,...,...,...,...
58096,193882,Flora (2017),Horror,2017,Flora
58096,193882,Flora (2017),Sci-Fi,2017,Flora
58097,193886,Leal (2018),Action,2018,Leal
58097,193886,Leal (2018),Crime,2018,Leal


In [14]:
has_any_nan = movies_df.isna().any().any()
has_any_nan

False

In [15]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105711 entries, 0 to 58097
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   movieId        105711 non-null  int64 
 1   title          105711 non-null  object
 2   genres         105711 non-null  object
 3   year           105711 non-null  int64 
 4   title_no_year  105711 non-null  object
dtypes: int64(2), object(3)
memory usage: 4.8+ MB


In [16]:
# movies_df.to_csv('../data/new_movies_dataset.csv', index=False)

In [17]:
new_movies = pd.read_csv('../data/new_movies_dataset.csv')
new_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105711 entries, 0 to 105710
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   movieId        105711 non-null  int64 
 1   title          105711 non-null  object
 2   genres         105711 non-null  object
 3   year           105711 non-null  int64 
 4   title_no_year  105697 non-null  object
dtypes: int64(2), object(3)
memory usage: 4.0+ MB


In [18]:
new_movies

Unnamed: 0,movieId,title,genres,year,title_no_year
0,1,Toy Story (1995),Adventure,1995,Toy Story
1,1,Toy Story (1995),Animation,1995,Toy Story
2,1,Toy Story (1995),Children,1995,Toy Story
3,1,Toy Story (1995),Comedy,1995,Toy Story
4,1,Toy Story (1995),Fantasy,1995,Toy Story
...,...,...,...,...,...
105706,193882,Flora (2017),Horror,2017,Flora
105707,193882,Flora (2017),Sci-Fi,2017,Flora
105708,193886,Leal (2018),Action,2018,Leal
105709,193886,Leal (2018),Crime,2018,Leal


In [19]:
# Should use this code to return sub genres for recommendations
# genres_df_dict = {g:df for g, df in movies_df.groupby('genres')}

# for key, df in genres_df_dict.items():
#     genres_df_dict[key] = df.to_dict(orient='records')

In [20]:
# len(genres_df_dict)

In [21]:
# genres_df_dict['Action']

In [22]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


In [23]:
# NOTE: taken from ChatGPT.

# Convert movieId & userId to Categorical's to make them work with csr_matrix
movieIds = pd.Categorical(ratings['movieId'])
userIds = pd.Categorical(ratings['userId'])

# Create the csr matrix
mat_movies_users = csr_matrix((ratings['rating'], (movieIds.codes, userIds.codes)))

mat_movies_users.shape

(53889, 283228)