In [1]:
import pandas as pd
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 500)

In [2]:
movie = pd.read_csv('../input/movielens-20m-dataset/movie.csv')
rating = pd.read_csv('../input/movielens-20m-dataset/rating.csv')
df = movie.merge(rating, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


# Create user_movie matrix with users in rows and movies in columns.

In [3]:
df.shape

(20000797, 6)

# total number of comments is 20000797
# number of unique movies is 27262

In [4]:
df["title"].nunique()

27262

# number of comments per movie

In [5]:
rating_counts = pd.DataFrame(df["title"].value_counts())
rating_counts.head()

Unnamed: 0,title
Pulp Fiction (1994),67310
Forrest Gump (1994),66172
"Shawshank Redemption, The (1994)",63366
"Silence of the Lambs, The (1991)",63299
Jurassic Park (1993),59715


# Narrow the scope to movies with 1000 or more comments, the total number of comments is 17766015 and the total number of movies is 3159

In [6]:
rare_movies = rating_counts[rating_counts["title"] <= 1000].index

common_movies = df[~df["title"].isin(rare_movies)]

common_movies.shape

(17766015, 6)

In [7]:
common_movies["title"].nunique()

3159

# Create user_movie matrix with users in rows and movies in columns

In [8]:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

user_movie_df.shape

(138493, 3159)

In [9]:
user_movie_df.head(10)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
6.0,,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,2.0
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,,,,,,,,,,,...,,,,,,,,,,


#  Item-Based Movie Suggestions
Now that we have the user-movie matrix, we can calculate the correlations. In user_movie_df the columns were the movie name, then if we fetch this column the user id-movie scores will come. This will be assigned to a variable named movie name.

In [10]:
movie_name = "Finding Nemo (2003)"
movie_name = user_movie_df[movie_name]

In [11]:
user_movie_df.corrwith(movie_name).sort_values(ascending=False).head(10)

title
Finding Nemo (2003)        1.000000
Monsters, Inc. (2001)      0.563173
Bug's Life, A (1998)       0.522080
Toy Story (1995)           0.504607
Toy Story 2 (1999)         0.489461
Incredibles, The (2004)    0.470720
Cars (2006)                0.464074
Lion King, The (1994)      0.453159
Toy Story 3 (2010)         0.445990
Ratatouille (2007)         0.443615
dtype: float64

In [12]:
user_movie_df.corrwith(movie_name).sort_values(ascending=False)[1:6]

title
Monsters, Inc. (2001)      0.563173
Bug's Life, A (1998)       0.522080
Toy Story (1995)           0.504607
Toy Story 2 (1999)         0.489461
Incredibles, The (2004)    0.470720
dtype: float64

# Importing the necessary libraries and load data

In [13]:
import numpy as np
import pandas as pd
df1 = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.data',sep='\t',names=['user_id','item_id','rating','timestamp'])
df2 = pd.read_csv("../input/movielens-100k-dataset/ml-100k/u.item", sep="|", encoding="iso-8859-1",names=["item_id","item_name","date","unknown1"
"website","rat1","rat2","rat3","rat4","rat5","rat6","rat7","rat8","rat9","rat10","rat11","rat12","rat13",
"rat14","rat15","rat16","rat17","rat18","rat19","rat20"])
print(df1.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


# df1 contains the user id , the movie id and the corresponding ratings
# df2 contains the movie name and it’s corresponding item_id

In [14]:
df2 = df2.iloc[:,0:2]
df2.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


# Merge dataframes

In [15]:
data = df1.merge(df2,on="item_id")
data.drop(['timestamp'],inplace=True,axis=1)
data.head()

Unnamed: 0,user_id,item_id,rating,item_name
0,196,242,3,Kolya (1996)
1,63,242,3,Kolya (1996)
2,226,242,5,Kolya (1996)
3,154,242,3,Kolya (1996)
4,306,242,5,Kolya (1996)


# Pivot table
 create a table with each movie representing a column and each user representing a row

In [16]:
data_table = pd.pivot_table(data,values='rating',columns='item_name',index='user_id')
data_table.head()

item_name,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


# Recommending

In [17]:
print("here are a list of 20 movies to recommend to a user who has liked 'Jurassic Park (1993)'")
print(data_table.corr()['Jurassic Park (1993)'].sort_values(ascending=False).iloc[:20])

here are a list of 20 movies to recommend to a user who has liked 'Jurassic Park (1993)'
item_name
Killer (Bulletproof Heart) (1994)                           1.0
Jurassic Park (1993)                                        1.0
Safe Passage (1994)                                         1.0
Roseanna's Grave (For Roseanna) (1997)                      1.0
Albino Alligator (1996)                                     1.0
Outlaw, The (1943)                                          1.0
Nico Icon (1995)                                            1.0
Mr. Jones (1993)                                            1.0
Midnight Dancers (Sibak) (1994)                             1.0
Metisse (Café au Lait) (1993)                               1.0
Love Serenade (1996)                                        1.0
King of the Hill (1993)                                     1.0
Jack and Sarah (1995)                                       1.0
Second Jungle Book: Mowgli & Baloo, The (1997)              1.0
Hurri

In [18]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD


columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.data', sep='\t', names=columns)


columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movie_names = movies[['item_id', 'movie title']]

combined_movies_data = pd.merge(df, movie_names, on='item_id')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


# create the user-item table by pivoting the data

In [19]:
rating_crosstab = combined_movies_data.pivot_table(values='rating', index='user_id', columns='movie title', fill_value=0)
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [20]:
X = rating_crosstab.T

# Matrix of 1664 rows (as many as the unique movies) and 12 columns which are the latent variables

In [21]:
SVD = TruncatedSVD(n_components=12, random_state=5)

resultant_matrix = SVD.fit_transform(X)

resultant_matrix.shape

(1664, 12)

# Correlation Pearson

In [22]:
### correlation matrix
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

(1664, 1664)

In [23]:
col_idx = rating_crosstab.columns.get_loc("Aladdin (1992)")
corr_specific = corr_mat[col_idx]
pd.DataFrame({'corr_specific':corr_specific, 'Movies': rating_crosstab.columns})\
.sort_values('corr_specific', ascending=False)\
.head(10)

Unnamed: 0,corr_specific,Movies
36,1.0,Aladdin (1992)
142,0.978227,Beauty and the Beast (1991)
867,0.964129,"Lion King, The (1994)"
1445,0.959699,"Sword in the Stone, The (1963)"
338,0.937533,Cool Runnings (1993)
88,0.935516,Apollo 13 (1995)
1365,0.933516,"Sound of Music, The (1965)"
797,0.932167,Jurassic Park (1993)
1249,0.930703,Robin Hood: Prince of Thieves (1991)
300,0.929626,Cinderella (1950)


In [24]:
col_idx = rating_crosstab.columns.get_loc("Godfather, The (1972)")
corr_specific = corr_mat[col_idx]
pd.DataFrame({'corr_specific':corr_specific, 'Movies': rating_crosstab.columns})\
.sort_values('corr_specific', ascending=False)\
.head(10)

Unnamed: 0,corr_specific,Movies
612,1.0,"Godfather, The (1972)"
613,0.921444,"Godfather: Part II, The (1974)"
498,0.92142,Fargo (1996)
623,0.900758,GoodFellas (1990)
237,0.865385,"Bronx Tale, A (1993)"
1398,0.865148,Star Wars (1977)
209,0.864269,"Boot, Das (1981)"
389,0.857308,Dead Man Walking (1995)
622,0.845558,"Good, The Bad and The Ugly, The (1966)"
1190,0.842705,Pulp Fiction (1994)


In [25]:
col_idx = rating_crosstab.columns.get_loc("Pulp Fiction (1994)")
corr_specific = corr_mat[col_idx]
pd.DataFrame({'corr_specific':corr_specific, 'Movies': rating_crosstab.columns})\
.sort_values('corr_specific', ascending=False)\
.head(10)

Unnamed: 0,corr_specific,Movies
1190,1.0,Pulp Fiction (1994)
1572,0.974919,"Usual Suspects, The (1995)"
571,0.971153,Full Metal Jacket (1987)
1329,0.969588,"Silence of the Lambs, The (1991)"
623,0.96783,GoodFellas (1990)
1534,0.960617,True Romance (1993)
1183,0.959133,"Professional, The (1994)"
1231,0.95357,Reservoir Dogs (1992)
1301,0.951028,Seven (Se7en) (1995)
1440,0.943573,Swimming with Sharks (1995)
