# Movie Recommender System
# Wittlieff, Alexa

#### Load Libraries & Import Data

In [1]:
# Load Libraries
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Import ratings and movie titles datasets.
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# it still points at the original local checkout, so the loaded data is unchanged.
DATA_DIR = Path(r'C:\Users\Alexa\Documents\GitHub\DSC 630 Predictive Analytics')

ratings_df = pd.read_csv(DATA_DIR / 'ratings.csv')
titles_df = pd.read_csv(DATA_DIR / 'movies.csv')

In [3]:
# Verify import success: preview the first 10 rows of the ratings data
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [4]:
# Verify import success: preview the first 10 rows of the movie titles data
titles_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


#### Explore Data's Current State

In [5]:
# View ratings data frame dimensions as (rows, columns)
ratings_df.shape

(100836, 4)

In [6]:
# View the data type of each column in the ratings data frame
ratings_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [7]:
# View titles data frame dimensions as (rows, columns)
titles_df.shape

(9742, 3)

In [8]:
# View the data type of each column in the titles data frame
titles_df.dtypes

movieId     int64
title      object
genres     object
dtype: object

#### Transform Data

In [9]:
# Attach title and genres to every rating row.
# Left join on movieId: every rating is kept, even if its movie has no entry in titles_df.
ratings_df = pd.merge(ratings_df, titles_df, how='left', on='movieId')

In [10]:
# Verify merge results: the first rows should now include the title and genres columns
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [11]:
# Keep only what the recommender uses (userId, rating, title);
# the id, timestamp and genre columns are not needed from here on.
ratings_df = ratings_df.drop(columns=['movieId', 'timestamp', 'genres'])

In [12]:
# Verify results: ratings_df should now hold only userId, rating and title
ratings_df

Unnamed: 0,userId,rating,title
0,1,4.0,Toy Story (1995)
1,1,4.0,Grumpier Old Men (1995)
2,1,4.0,Heat (1995)
3,1,5.0,Seven (a.k.a. Se7en) (1995)
4,1,5.0,"Usual Suspects, The (1995)"
...,...,...,...
100831,610,4.0,Split (2017)
100832,610,5.0,John Wick: Chapter Two (2017)
100833,610,5.0,Get Out (2017)
100834,610,5.0,Logan (2017)


#### Feature Engineering - Total Number of Ratings

In [13]:
# Count the ratings each title received, then turn the title index
# back into a regular column so the result can be merged on 'title'.
total_ratings = (
    ratings_df.groupby('title')['rating']
    .count()
    .reset_index()
)

In [14]:
# Give the aggregated column a name that says it holds a count, not a rating
column_labels = {'rating': 'count of ratings'}
total_ratings = total_ratings.rename(columns=column_labels)

In [15]:
# Verify results: one row per title with its 'count of ratings'
total_ratings

Unnamed: 0,title,count of ratings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [18]:
# Add each title's rating count to every one of its rating rows (left join on title)
ratings_df = pd.merge(ratings_df, total_ratings, how='left', on='title')

In [19]:
# Trim dataset to frequently rated titles: keep only rows whose title has
# strictly more than RATING_COUNT_CUTOFF ratings. A title with exactly that
# many ratings is removed as well (the comparison is '>', not '>=').
RATING_COUNT_CUTOFF = 100
ratings_df = ratings_df[ratings_df['count of ratings'] > RATING_COUNT_CUTOFF]

In [20]:
# View results: only rows for titles with more than 100 ratings remain
ratings_df

Unnamed: 0,userId,rating,title,count of ratings
0,1,4.0,Toy Story (1995),215
2,1,4.0,Heat (1995),102
3,1,5.0,Seven (a.k.a. Se7en) (1995),203
4,1,5.0,"Usual Suspects, The (1995)",204
7,1,4.0,Braveheart (1995),237
...,...,...,...,...
100217,610,5.0,"Departed, The (2006)",107
100310,610,4.5,"Dark Knight, The (2008)",149
100326,610,4.5,WALL·E (2008),104
100380,610,3.5,Up (2009),105


#### Pivot Ratings Table

In [21]:
# Build the user x title matrix: one row per userId, one column per title,
# each cell holding that user's rating for the title (mean if there are several;
# NaN where the user has not rated the title).
user_df = pd.pivot_table(
    ratings_df,
    values='rating',
    index='userId',
    columns='title',
    aggfunc='mean',
)

#### Movie Suggestions

In [24]:
# Set movie title to base the suggestions on.
# It is used to select a column of user_df, so it must match one of that
# table's title columns exactly (including the release year).
movie = 'Pretty Woman (1990)'

In [25]:
# Calculate correlations between the chosen movie's rating column and every
# other title's rating column, highest first.
N_SUGGESTIONS = 10
correlations = user_df.corrwith(user_df[movie]).sort_values(ascending=False)

# Build the suggestion list:
# - drop the chosen movie by label instead of assuming it sits at position 0
#   (another title can tie with it at the top of the ranking),
# - drop titles whose correlation could not be computed (NaN),
# - take at most N_SUGGESTIONS entries, so a short list cannot raise IndexError.
suggestions = (
    correlations
    .drop(labels=movie, errors='ignore')
    .dropna()
    .head(N_SUGGESTIONS)
)

# Print movie suggestions
print('Your movie suggestions are:\n')
for suggested_title in suggestions.index:
    print(suggested_title)

Your movie suggestions are:

Harry Potter and the Chamber of Secrets (2002)
Clueless (1995)
Finding Nemo (2003)
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Waterworld (1995)
Cliffhanger (1993)
Ghost (1990)
Trainspotting (1996)
Dark Knight, The (2008)
Monsters, Inc. (2001)


#### Reference

Nair, A. (2019, September 25). How To Build Your First Recommender System Using Python & MovieLens Dataset. Analytics India Magazine. https://analyticsindiamag.com/how-to-build-your-first-recommender-system-using-python-movielens-dataset/