In [1]:
import numpy as np
import pandas as pd

In [2]:
# importing ratings.csv file
ratings_data = pd.read_csv("F:/movie-recommendation-system/dataset/ratings.csv")  
# viewing some of its uppermost rows
ratings_data.head() 
# the file has columns of userID, movieID, rating of the movie and also the timestamp for the respective movie

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# importing movies.csv file
movies_data = pd.read_csv("F:/movie-recommendation-system/dataset/movies.csv")  
# viewing some of its uppermost rows
movies_data.head() 
# the file has columns of movieID, titile and genres of the respective movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# from the above two dataset files we can see that movieID is common column for both of them and hence is the link between them
# therefore we are gonna merge these two files on the basis of movieID

final_data = pd.merge(ratings_data, movies_data, on='movieId')
# viewing some of its uppermost rows
final_data.head()
# now the merged content has columns of userID, movieId, rating, timestamp, title and the genres of the movies

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
# this is grouping the data by title and getting the mean of ratings for the corresponding movie
final_data.groupby('title')['rating'].mean().sort_values(ascending=False).head()
# the drawback of this grouping is that we dont know how many users rated a movie because it is not possible that 200s-300s of peope rate a movie 5-star

title
Karlson Returns (1970)                           5.0
Winter in Prostokvashino (1984)                  5.0
My Love (2006)                                   5.0
Sorority House Massacre II (1990)                5.0
Winnie the Pooh and the Day of Concern (1972)    5.0
Name: rating, dtype: float64

In [6]:
# this is grouping the data by title and the total no. of ratings for a particular movie
final_data.groupby('title')['rating'].count().sort_values(ascending=False).head()
# the drawback of this grouping is that we dont know that what is the average rating of the movie

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: rating, dtype: int64

In [7]:
ratings = pd.DataFrame(final_data.groupby('title')['rating'].mean())
ratings['num of ratings'] = pd.DataFrame(final_data.groupby('title')['rating'].count())
ratings.head()
# now the problem is solved we have the average rating and correspondingly we have the no. of users who have rated the movie

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [8]:
# A matrix is created that has the user ids on one axis and the movie title on another axis. 
# Each cell will then consist of the rating the user gave to that movie. 
# Note there will be a lot of NaN values, because most people have not seen most of the movies.
final_matrix = final_data.pivot_table(index='userId',columns='title',values='rating')
final_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [9]:
ratings.sort_values('num of ratings',ascending=False).head(10)
# sorting the newly made ratings dataset on the basis of no. of ratings

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.164134,329
"Shawshank Redemption, The (1994)",4.429022,317
Pulp Fiction (1994),4.197068,307
"Silence of the Lambs, The (1991)",4.16129,279
"Matrix, The (1999)",4.192446,278
Star Wars: Episode IV - A New Hope (1977),4.231076,251
Jurassic Park (1993),3.75,238
Braveheart (1995),4.031646,237
Terminator 2: Judgment Day (1991),3.970982,224
Schindler's List (1993),4.225,220


In [10]:
# for testing the movie recommendation system lets choose 2 movies- Jurassic Park and Braveheart.
# now we will find out all the user ratings for these two movies from the matrix we made.
# jurassic park user ratings
jurassic_park_user_ratings = final_matrix['Jurassic Park (1993)']
jurassic_park_user_ratings.head()

userId
1    4.0
2    NaN
3    NaN
4    NaN
5    NaN
Name: Jurassic Park (1993), dtype: float64

In [11]:
# braveheart user ratings
braveheart_user_ratings = final_matrix['Braveheart (1995)']
braveheart_user_ratings.head()

userId
1    4.0
2    NaN
3    NaN
4    NaN
5    4.0
Name: Braveheart (1995), dtype: float64

In [12]:
# now we will use corrwith() [correlation with] method in order to find the correlation between the 2 panda series.
# corrwith() method for jurassic park
similar_to_jurassic_park = final_matrix.corrwith(jurassic_park_user_ratings)
# corrwith() method for braveheart
similar_to_braveheart = final_matrix.corrwith(braveheart_user_ratings)

  c = cov(x, y, rowvar)
  c *= 1. / np.float64(fact)


In [13]:
# when we were making the ratings dataset for the two movies we saw that some users havent rated the movie therefor the value is NaN in that case
# therefore we need to clean are dataset ans get rid of NaN
# this could be done if we use dataframe instead of series
#jurassic park correlation data
corr_jurassic_park = pd.DataFrame(similar_to_jurassic_park,columns=['Correlation'])
corr_jurassic_park.dropna(inplace=True)
corr_jurassic_park.head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
"'burbs, The (1989)",0.104077
(500) Days of Summer (2009),-0.125237
*batteries not included (1987),0.290929
...And Justice for All (1979),-0.981981
10 Cent Pistol (2015),-1.0
10 Cloverfield Lane (2016),0.700649
10 Things I Hate About You (1999),-0.127968
"10,000 BC (2008)",0.486159
101 Dalmatians (1996),0.127571
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.04919


In [14]:
# braveheart correlation data
corr_braveheart = pd.DataFrame(similar_to_braveheart,columns=['Correlation'])
corr_braveheart.dropna(inplace=True)
corr_braveheart.head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
"'burbs, The (1989)",0.332504
(500) Days of Summer (2009),0.021388
*batteries not included (1987),-1.0
...And Justice for All (1979),0.327327
10 Cloverfield Lane (2016),0.534522
10 Items or Less (2006),-1.0
10 Things I Hate About You (1999),0.034926
"10,000 BC (2008)",-0.141456
101 Dalmatians (1996),-0.139975
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.048223


In [15]:
# now from the data we can see that the value of the correlation is varying a lot.
# we want to recommend the movie which has the highest correlation value 
# therefore we will sort the data so that the movie with the highest correlation value comes first.

# jurrasic park
corr_jurassic_park.sort_values('Correlation',ascending=False).head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
In the Valley of Elah (2007),1.0
The Overnight (2015),1.0
Monkey Business (1952),1.0
"Cup, The (Phörpa) (1999)",1.0
Fear (1996),1.0
Feast (2005),1.0
Police Story 2 (Ging chaat goo si juk jaap) (1988),1.0
Insidious (2010),1.0
"Whistleblower, The (2010)",1.0
When the Cat's Away (Chacun cherche son chat) (1996),1.0


In [16]:
# braveheart
corr_braveheart.sort_values('Correlation',ascending=False).head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
Sisters (2015),1.0
"Class, The (Klass) (2007)",1.0
Ulee's Gold (1997),1.0
Say It Isn't So (2001),1.0
Savannah Smiles (1982),1.0
Chasers (1994),1.0
Children of Dune (2003),1.0
Living Out Loud (1998),1.0
Underworld (1996),1.0
Furious 7 (2015),1.0


In [17]:
# many movies that have the high correlation value can be rated by only a few of the users 

# jurassic park
corr_jurassic_park = corr_jurassic_park.join(ratings['num of ratings'])
corr_jurassic_park.head()

Unnamed: 0_level_0,Correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",0.104077,17
(500) Days of Summer (2009),-0.125237,42
*batteries not included (1987),0.290929,7
...And Justice for All (1979),-0.981981,3
10 Cent Pistol (2015),-1.0,2


In [18]:
# braveheart
corr_braveheart = corr_braveheart.join(ratings['num of ratings'])
corr_braveheart.head()

Unnamed: 0_level_0,Correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",0.332504,17
(500) Days of Summer (2009),0.021388,42
*batteries not included (1987),-1.0,7
...And Justice for All (1979),0.327327,3
10 Cloverfield Lane (2016),0.534522,14


In [19]:
# so we should mark that their should be a minimum no. of users involvement while calculatin the correlation value
# lets say minimum no. of users is 50

# jurassic park
corr_jurassic_park[corr_jurassic_park['num of ratings']>50].sort_values('Correlation',ascending=False).head(10)

# so we can recommend 'Field of Dreams', 'Kung Fu Panda' and so on to the users who saw jurassic park.

Unnamed: 0_level_0,Correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Jurassic Park (1993),1.0,238
Field of Dreams (1989),0.643783,56
Kung Fu Panda (2008),0.634736,54
Lethal Weapon (1987),0.608894,75
The Hunger Games (2012),0.591129,54
"Bug's Life, A (1998)",0.540132,92
Outbreak (1995),0.53378,101
"Sting, The (1973)",0.524691,64
Ghostbusters (a.k.a. Ghost Busters) (1984),0.522286,120
Top Gun (1986),0.515869,83


In [20]:
# braveheart
corr_braveheart[corr_braveheart['num of ratings']>50].sort_values('Correlation',ascending=False).head(10)

#so we can recommend 'Field of Dreams', 'Mystic River' and so on to the users who saw braveheart.

Unnamed: 0_level_0,Correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Braveheart (1995),1.0,237
Field of Dreams (1989),0.682274,56
Mystic River (2003),0.644782,52
Grumpier Old Men (1995),0.636963,52
Kung Fu Panda (2008),0.625232,54
Batman Begins (2005),0.61055,116
Guardians of the Galaxy (2014),0.604996,59
Top Gun (1986),0.600929,83
Rocky (1976),0.599043,64
"Dark Knight Rises, The (2012)",0.598218,76
