# Movie Recommendation Systems

# Notebook 2: Initial Setup

**1. Setup** - importing libraries and functions

**2. Load Data** - loading datasets

**3. Clean Data** - removing duplicates 

## 1. Setup

In [6]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

import surprise
from surprise import Dataset, Reader, SVD, KNNWithMeans, KNNBasic
from sklearn.model_selection import train_test_split

import ml_metrics as metrics

%matplotlib inline

In [7]:
#Load data
def load_file(df): 
    """
    Reads csv file as a Pandas dataframe.
    
    Parameters
    ----------
    df: name of csv file 
    
    Returns 
    -------
    Dataframe of csv file
    
    ---
    """
    return pd.read_csv(df)

In [8]:
def surprise_algo(algo, trainset_surprise, testset_surprise, userIds): 
    '''
    A list with lists of recommendations for each user; list requires to be stored elsewhere
    
    ---
    Parameters
    
    
    algo: Surprise package algorithm  
    
    trainset_surprise: training data, setup from Surprise (not the same as Pandas dataframe setup) 
    
    testset_surprise: testing data, setup from Surprise (not the same as Pandas dataframe setup) 
    
    userIds: list of all the userIds; to recommend each user in userIds 
    
    ---
    Returns 
    
    A list with lists of recommendations 
    
    ---
    
    '''

    #Fit SVD recommender
    algo.fit(trainset_surprise)
    algo_pred = algo.test(testset_surprise)
    
    #Dataframe of test set including predicted ratings
    algo_pred = pd.DataFrame(algo_pred).drop('details', axis=1)
    algo_pred.columns = ['userId','movieId','rating','pred_rating']
    algo_pred = algo_pred.sort_values(['userId','pred_rating'], ascending=[True,False])
    
    #Recommendations for all users in the test data
    algo_recs = []

    for userId in userIds:
        algo_recs_ = algo_pred.set_index('userId').loc[userId]['movieId'].tolist()
        algo_recs.append(algo_recs_)
    
    return algo_recs

## 2. Load data

In [9]:
#Load data and define dataframes
movies = load_file("movies.csv")
ratings = load_file("ratings.csv")

## 3. Clean data

### 3.1 'movies' dataframe

**3.1.1 Preview**

In [10]:
#movies dataframe preview
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


**3.1.2 Check nulls and duplicate rows**

In [11]:
print('Number of nulls in "movies" dataframe: \n' , movies.isnull().sum())
print('\n Number of duplicate rows in "movies" dataframe :', movies.duplicated().sum())

Number of nulls in "movies" dataframe: 
 movieId    0
title      0
genres     0
dtype: int64

 Number of duplicate rows in "movies" dataframe : 0


**3.1.3 Check duplicate movie titles**

Only 'title' in 'movies' is prone to duplicates.

In [12]:
#Number of movie title duplicates
print('Number of duplicates: ', movies[['title']].duplicated().sum())

Number of duplicates:  5


**3.1.4: Clean duplicate movie titles (genres)**

*3.1.4.1 Investigate*

In [13]:
#Dataframe of duplicates
duplicates = movies[movies['title'].duplicated()]
display(duplicates)

Unnamed: 0,movieId,title,genres
5601,26958,Emma (1996),Romance
6932,64997,War of the Worlds (2005),Action|Sci-Fi
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
9135,147002,Eros (2004),Drama|Romance
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller


In [14]:
#Store movieIds and titles of duplicates
duplicate_id = list(duplicates['movieId'])
duplicate_title = list(duplicates['title'])

In [15]:
#Dataframe of movie titles with duplicates
duplicate_df = movies[movies['title'].isin(duplicate_title)]
duplicate_df

Unnamed: 0,movieId,title,genres
650,838,Emma (1996),Comedy|Drama|Romance
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
5601,26958,Emma (1996),Romance
5854,32600,Eros (2004),Drama
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
6932,64997,War of the Worlds (2005),Action|Sci-Fi
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
9135,147002,Eros (2004),Drama|Romance
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller


In [16]:
#Dataframe of original movies with duplicates
original_df = duplicate_df[~duplicate_df['movieId'].isin(duplicate_id)]
display(original_df)

#Original movieIds
original_id = list(original_df['movieId'])

Unnamed: 0,movieId,title,genres
650,838,Emma (1996),Comedy|Drama|Romance
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
5854,32600,Eros (2004),Drama
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller


*3.1.4.2: Combine 'genres' by movie title*

* Duplicates show different 'movieId' and 'genres' under the same 'title
* For each movie title, one of the 'genres' are a subset of another 

Actions:
1. Combine genres: unique genre values extracted later using split, list and set
2. Merge combined genres to 'movies': 
    * Fillna with non-duplicates in new genre col 
    * Drop original 'genres' from "movies" 

In [17]:
#1. Combine genres
merge_genres = duplicate_df.groupby(['title'])['genres'].apply('|'.join).reset_index()
merge_genres

Unnamed: 0,title,genres
0,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller|Comedy|Crime|Drama...
1,Emma (1996),Comedy|Drama|Romance|Romance
2,Eros (2004),Drama|Drama|Romance
3,Saturn 3 (1980),Adventure|Sci-Fi|Thriller|Sci-Fi|Thriller
4,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller|Action|Sci-Fi


In [18]:
#'movies' length
len(movies)

9742

In [19]:
#Remove duplicate titles in movies
movies = movies[~movies['title'].duplicated()]

In [20]:
#Check if duplicates were removed from 'movies'
len(movies)

9737

In [21]:
#2. Merge combined genres to 'movies' 
movies = pd.merge(movies, merge_genres, on='title', how='left')

In [22]:
#Check merging of combined genres  
movies['genres_y'].notnull().sum()

5

In [23]:
#Fill NaNs with original genres - originals were not duplicates
movies['genres_y'] = movies['genres_y'].fillna(movies['genres_x'])

In [24]:
#Check number of filling non-duplicates
sum(movies['genres_y'] == movies['genres_x'])

9732

In [25]:
#Drop original genres column, 'genres_x'
movies.drop('genres_x',axis=1, inplace=True)

In [26]:
#Rename new genres, which includes combined genres of duplicates
movies.rename(columns={'genres_y':'genres'}, inplace=True)

In [27]:
#Convert strings of genres into lists
movies['genres'] = movies['genres'].apply(lambda x: str(x).split('|'))

In [28]:
#Keep list of uniques from each list of genres
movies['genres'] = movies['genres'].apply(lambda x: list(set(x)))

In [29]:
#Explode 'genres'
#Create a row for each list element from a column containing lists
movies = movies.explode('genres')
len(movies)

22074

In [30]:
#Preview
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Fantasy
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Animation


**3.1.5: Extract years from 'title' into new col 'year'**

In [31]:
#Separate year from 'title' column and placed into a new column 'year'

#Create 'year' column from extracting title 
movies['year'] = movies['title'].str.extract('(\(\d\d\d\d\))',expand=False)
#Remove parentheses in 'year' column
movies['year'] = movies['year'].str.extract('(\d\d\d\d)',expand=False)
#Removing '(year)' in 'title' column
movies['title'] = movies['title'].str.replace('(\(\d\d\d\d\))', '')
#Apply the strip function to get rid of any ending whitespace characters that may have appeared
movies['title'] = movies['title'].apply(lambda x: x.strip())

  movies['title'] = movies['title'].str.replace('(\(\d\d\d\d\))', '')


In [32]:
#Preview
movies.head(10)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Fantasy,1995
0,1,Toy Story,Comedy,1995
0,1,Toy Story,Adventure,1995
0,1,Toy Story,Children,1995
0,1,Toy Story,Animation,1995
1,2,Jumanji,Fantasy,1995
1,2,Jumanji,Children,1995
1,2,Jumanji,Adventure,1995
2,3,Grumpier Old Men,Comedy,1995
2,3,Grumpier Old Men,Romance,1995


In [33]:
#Convert years from string to integers
movies['year'] = pd.to_numeric(movies['year'])
movies['year'] = movies['year'].astype('Int64')

In [34]:
#Number of missing 'year' values 
movies['year'].isnull().sum()

16

In [35]:
#View missing 'year' movie titles
movies[movies['year'].isnull()]

Unnamed: 0,movieId,title,genres,year
6058,40697,Babylon 5,Sci-Fi,
9029,140956,Ready Player One,Thriller,
9029,140956,Ready Player One,Action,
9029,140956,Ready Player One,Sci-Fi,
9089,143410,Hyena Road,(no genres listed),
9134,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),
9175,149334,Nocturnal Animals,Thriller,
9175,149334,Nocturnal Animals,Drama,
9255,156605,Paterson,(no genres listed),
9363,162414,Moonlight,Drama,


**Findings**
* These titles are likely involving a range of years
* These nulls are left untouched - not many relative to the size of the dataframe. 

In [36]:
#Preview
movies.head(10)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Fantasy,1995
0,1,Toy Story,Comedy,1995
0,1,Toy Story,Adventure,1995
0,1,Toy Story,Children,1995
0,1,Toy Story,Animation,1995
1,2,Jumanji,Fantasy,1995
1,2,Jumanji,Children,1995
1,2,Jumanji,Adventure,1995
2,3,Grumpier Old Men,Comedy,1995
2,3,Grumpier Old Men,Romance,1995


### 3.2 'ratings' dataframe

**3.2.1 Preview**

In [37]:
#ratings dataframe preview
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


**3.2.2 Check nulls and duplicate rows**

In [38]:
print('Number of nulls in "ratings" dataframe: \n', ratings.isnull().sum())
print('\n Number of duplicates rows in "ratings" dataframe :', ratings.duplicated().sum())

Number of nulls in "ratings" dataframe: 
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

 Number of duplicates rows in "ratings" dataframe : 0


**3.2.3 Drop 'timestamp'**
* 'timestamp' is not relevant for our purposes

In [39]:
ratings.drop('timestamp', axis=1, inplace=True)

**3.2.4 Update duplicate movieIds to original movieIds**

*3.2.4.1 Find duplicates in 'ratings'*

In [40]:
#Find movieIds that require updating 
#movieIds reference duplicates
duplicate_ratings = ratings[ratings['movieId'].isin(duplicate_id)]
duplicate_ratings

Unnamed: 0,userId,movieId,rating
4747,28,64997,3.5
11451,68,64997,2.5
17819,111,144606,4.0
49832,318,147002,4.0
80596,509,26958,3.5
81458,514,168358,2.5


*3.2.4.2 Match duplicates with their original 'movieId'*

In [41]:
#Update movieIds - replace duplicate movieIds with original movieIds
print('List of original movieIds: ', original_id)
print('List of duplicate movieIds: ', duplicate_id)

#Dataframe of correpsonding original and duplicate movieIds
update_id = pd.DataFrame([original_id, duplicate_id]).transpose()
update_id.columns = ['original_id','duplicate_id']
display(update_id)

List of original movieIds:  [838, 2851, 6003, 32600, 34048]
List of duplicate movieIds:  [26958, 64997, 144606, 147002, 168358]


Unnamed: 0,original_id,duplicate_id
0,838,26958
1,2851,64997
2,6003,144606
3,32600,147002
4,34048,168358


*3.2.4.3 Merge to update 'movieId' accordingly*

In [42]:
#Merge 
ratings = pd.merge(ratings, update_id, left_on='movieId', right_on='duplicate_id', how='left')

#Preview
ratings.head()

Unnamed: 0,userId,movieId,rating,original_id,duplicate_id
0,1,1,4.0,,
1,1,3,4.0,,
2,1,6,4.0,,
3,1,47,5.0,,
4,1,50,5.0,,


In [43]:
#Number of updated movieIds
ratings['original_id'].notnull().sum()

6

*3.2.4.4 Update movieIds by replacing duplicates*

In [44]:
#Fill nulls with original movieIds (no updates needed for these)
ratings['original_id'] = ratings['original_id'].fillna(ratings['movieId'])

#Drop irrelevant columns
ratings.drop(['movieId','duplicate_id'], axis=1, inplace=True)

#Rename back to original 'movieId'
ratings.rename(columns={'original_id':'movieId'}, inplace=True)

#Preview
ratings.head()

Unnamed: 0,userId,rating,movieId
0,1,4.0,1.0
1,1,4.0,3.0
2,1,4.0,6.0
3,1,5.0,47.0
4,1,5.0,50.0


In [45]:
#Convert movieIds to integers
ratings['movieId'] = ratings['movieId'].astype(int)

In [46]:
#Rearrange columns back to original order
ratings = ratings[['userId','movieId','rating']]

*3.2.4.4 Update movieIds by replacing duplicates*

In [47]:
#Check for duplicates after updates
pd.merge(ratings, ratings[ratings.duplicated()], how='inner')

Unnamed: 0,userId,movieId,rating
0,111,6003,4.0
1,111,6003,4.0
2,509,838,3.5
3,509,838,3.5


In [48]:
#Check length
len(ratings)

100836

In [49]:
#Drop duplicates
ratings.drop_duplicates(inplace=True)

In [50]:
#Check length
len(ratings)

100834

**3.2.4 Counts**

In [51]:
#Number of users
len(set(ratings['userId']))

610

In [52]:
#Range - number of ratings
ratings.groupby('userId')['rating'].count().sort_values()

userId
442      20
406      20
147      20
194      20
569      20
       ... 
274    1346
448    1864
474    2108
599    2478
414    2698
Name: rating, Length: 610, dtype: int64

**Findings**
* More movies (items) than users; implying that item-item collaborative filtering would provide better results than user-user collaborative filtering
* Each user rated at least 20 movies (at most 2,698 movies).

In [53]:
#Preview
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [54]:
print('Number of movies in "ratings": ', len(set(ratings['movieId'])))
print('Number of movies in "movies": ', len(set(movies['movieId'])))

Number of movies in "ratings":  9719
Number of movies in "movies":  9737


In [55]:
#Check what movies are missing between the two 
movies_movielist = pd.Series(list(set(movies['movieId'])))
ratings_movielist = pd.Series(list(set(ratings['movieId'])))

In [56]:
#Movies in 'movies' and not in 'ratings'
movies_movielist[movies_movielist.isin(ratings_movielist)==False]

925      1076
1458    34482
2489     2939
2818     3338
2914     3456
3530     4194
4665     5721
5259     6668
5372     6849
5497     7020
5900     7792
6297     8765
7818    85565
8504    25855
8563    26085
9437    30892
9626    32160
9669    32371
dtype: int64

In [57]:
#Movies in 'ratings' and not in 'movies'
ratings_movielist[ratings_movielist.isin(movies_movielist)==False]

Series([], dtype: int64)

In [58]:
#Store list of missing movieIds
missing_movies = list(movies_movielist[movies_movielist.isin(ratings_movielist)==False])

**Findings**
* Will note that not all movies have been seen by any of the users.