# MovieLens Popularity Based Recommendation Model


In [26]:
#Import all libraries to be utilised by running the codes below.
import pandas as pd
import numpy as np

The codes below are written to pass in the column names and thereafter reads in the data set from the movielens folder downloaded to your desktop. The column names were extracted from the readme file on the movielens web page.

In [42]:
#passing in the column names for each of the data files. Then read them using pandas. 


users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code'] #Passing in the column names as gotten from the readme data file

users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols,
 encoding='latin-1') #read in the user data using pandas.


ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp'] #column names for ratings data file

ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols,
 encoding='latin-1') #reading in the rating data file.


movies_cols = ['movie_id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] #column names for the items data file.

movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_cols,
 encoding='latin-1') #reading in the item data file using pandas.



However, there is a need to examine the data by observing the shape which prints out the number of rows and columns in the data set. Furthermore, examine the content of the data by calling the head() function on the data set.

In [28]:
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [29]:
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [30]:
print(movies.shape)
movies.head()

(1682, 24)


Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Let's now go ahead to check out the basic details of the data files.

In [31]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user_id       943 non-null int64
age           943 non-null int64
sex           943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB


In [32]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
movie_id              1682 non-null int64
movie title           1682 non-null object
release date          1681 non-null object
video release date    0 non-null float64
IMDb URL              1679 non-null object
unknown               1682 non-null int64
Action                1682 non-null int64
Adventure             1682 non-null int64
Animation             1682 non-null int64
Children's            1682 non-null int64
Comedy                1682 non-null int64
Crime                 1682 non-null int64
Documentary           1682 non-null int64
Drama                 1682 non-null int64
Fantasy               1682 non-null int64
Film-Noir             1682 non-null int64
Horror                1682 non-null int64
Musical               1682 non-null int64
Mystery               1682 non-null int64
Romance               1682 non-null int64
Sci-Fi                1682 non-null int64
Thriller 

In [33]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user_id      100000 non-null int64
movie_id     100000 non-null int64
rating       100000 non-null int64
timestamp    100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB


Now we are set to create the popularity model for the movielens data. First we need to merge the dataframes to form one dataframe.

In [43]:
#First merge movies and ratings dataframes together.
data = pd.merge(movies, ratings, how='outer', sort=True, on=['movie_id'])
data.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,user_id,rating,timestamp
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,308,4,887736532
1,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,287,5,875334088
2,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,148,4,877019411
3,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,280,4,891700426
4,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,66,3,883601324


In [44]:
#Merge the data (movies and ratings merged dataframe) with the user dataframes
dataset = pd.merge(data, users, how ='outer', sort=True, on=['user_id'])
dataset.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Thriller,War,Western,user_id,rating,timestamp,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,1,5,874965758,24,M,technician,85711
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,1,0,0,1,3,876893171,24,M,technician,85711
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,1,0,0,1,4,878542960,24,M,technician,85711
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,1,3,876893119,24,M,technician,85711
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,1,0,0,1,3,889751712,24,M,technician,85711


The next step is to group the movies by their titles in order to view the total number of rating entries for each movie title.

In [45]:
#group the movies by their titles and call size function to view the total number of ratings for each movie.

total_ratings = dataset.groupby('movie title').size()
total_ratings.head()


movie title
'Til There Was You (1997)      9
1-900 (1994)                   5
101 Dalmatians (1996)        109
12 Angry Men (1957)          125
187 (1997)                    41
dtype: int64

Next step is to take the mean ratings of each movie, by first grouping by the movie titles and selecting the movie title and ratings heading only, then finally calling the mean function.

In [46]:
mean_ratings = (dataset.groupby('movie title'))['movie title','rating'].mean()
mean_ratings.head()

Unnamed: 0_level_0,rating
movie title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [47]:
#Convert the total ratings result into a data frame.

total_ratings = pd.DataFrame({'movie title':total_ratings.index, 'total ratings':total_ratings.values})

mean_ratings['movie title'] = mean_ratings.index

Now, we merge the two summary tables together containing the total ratings and the mean ratings. However, we sort the data by the total ratings in a descending order.

In [48]:
movielens = pd.merge(mean_ratings, total_ratings, on=['movie title']).sort_values(by = 'total ratings', ascending = False)
movielens.head()


Unnamed: 0,rating,movie title,total ratings
1398,4.358491,Star Wars (1977),583
333,3.803536,Contact (1997),509
498,4.155512,Fargo (1996),508
1234,4.00789,Return of the Jedi (1983),507
860,3.156701,Liar Liar (1997),485


Below, we take a look at the summary statistics of the data set in order to determine the accurate cut off point for the total ratings. This is due to the fact that a movie with high mean rating which has been rated by only 3 people can not be recommended.

In [49]:
movielens.describe()

Unnamed: 0,rating,total ratings
count,1664.0,1664.0
mean,3.077018,60.096154
std,0.780418,80.956484
min,1.0,1.0
25%,2.665094,7.0
50%,3.162132,27.0
75%,3.651808,80.25
max,5.0,583.0


From the results above, we see that the 75th percentile has a total rating approximately at 80. We can take a random guess of the percentile of an approximate total rating of 100. Thereafter cut off the remaining data and sort the dataframe by their mean ratings.

In [52]:
movielens_results =  movielens[:350].sort_values(by = 'total ratings', ascending = False)
movielens_results.head()

Unnamed: 0,rating,movie title,total ratings
1398,4.358491,Star Wars (1977),583
333,3.803536,Contact (1997),509
498,4.155512,Fargo (1996),508
1234,4.00789,Return of the Jedi (1983),507
860,3.156701,Liar Liar (1997),485


Here is the popularity based recommendation movie lens model.