In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
from collections import deque
import plotly
from plotly.offline import init_notebook_mode, plot
import plotly.graph_objs as go
init_notebook_mode(connected = True)

In [2]:
## Run this only once
with zipfile.ZipFile("netflix-prize-data.zip","r") as zip_ref:
     zip_ref.extractall("netflixdata")

In [66]:
## Read movie_titles file
movie_Title = pd.read_csv('netflixdata/movie_titles.csv',
                        encoding = 'ISO-8859-1',
                        header = None,
                        names = ['Id', 'Year', 'Name']).set_index('Id')
movie_Title

Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW
6,1997.0,Sick
7,1992.0,8 Man
8,2004.0,What the #$*! Do We Know!?
9,1991.0,Class of Nuke 'Em High 2
10,2001.0,Fighter


In [67]:
## Read combined data 1 file
data_1 = pd.read_csv('netflixdata/combined_data_1.txt', header = None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])
data_1

Unnamed: 0,User,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
5,823519,3.0,2004-05-03
6,893988,3.0,2005-11-17
7,124105,4.0,2004-08-05
8,1248029,3.0,2004-04-22
9,1842128,4.0,2004-05-09


In [68]:
## Read combined data 2 file
data_2 = pd.read_csv('netflixdata/combined_data_2.txt', header = None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])
data_2

Unnamed: 0,User,Rating,Date
0,4500:,,
1,2532865,4.0,2005-07-26
2,573364,3.0,2005-06-20
3,1696725,3.0,2004-02-27
4,1253431,3.0,2004-03-31
5,1265574,2.0,2003-09-01
6,1049643,1.0,2003-11-15
7,1601348,4.0,2005-04-05
8,1495289,5.0,2005-07-09
9,1254903,3.0,2003-09-02


In [69]:
## Read combined data 3 file
data_3 = pd.read_csv('netflixdata/combined_data_3.txt', header = None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])
data_3

Unnamed: 0,User,Rating,Date
0,9211:,,
1,1277134,1.0,2003-12-02
2,2435457,2.0,2005-06-01
3,2338545,3.0,2001-02-17
4,2218269,1.0,2002-12-27
5,441153,4.0,2002-10-11
6,1921624,2.0,2005-08-31
7,2096652,3.0,2004-05-31
8,818736,2.0,2004-02-17
9,284560,3.0,2003-07-27


In [70]:
## Read combined data 4 file
data_4 = pd.read_csv('netflixdata/combined_data_4.txt', header = None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])
data_4

Unnamed: 0,User,Rating,Date
0,13368:,,
1,2385003,4.0,2004-07-08
2,659432,3.0,2005-03-16
3,751812,2.0,2002-12-16
4,2625420,2.0,2004-05-25
5,1650301,1.0,2005-08-30
6,2269227,4.0,2005-10-27
7,2220672,4.0,2002-08-19
8,2500511,4.0,2003-08-11
9,1452058,2.0,2005-01-29


In [71]:
## Combine all the data files into one
data_all = data_1
data_all = pd.concat([data_all, data_2], ignore_index = True)
data_all = pd.concat([data_all, data_3], ignore_index = True)
data_all = pd.concat([data_all, data_4], ignore_index = True)
data_all

Unnamed: 0,User,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
5,823519,3.0,2004-05-03
6,893988,3.0,2005-11-17
7,124105,4.0,2004-08-05
8,1248029,3.0,2004-04-22
9,1842128,4.0,2004-05-09


In [72]:
print("The dataframe of all the combined files has", data_all.shape[0], "rows and", data_all.shape[1], "columns.")

The dataframe of all the combined files has 100498277 rows and 3 columns.


In [73]:
# Find empty rows to slice dataframe for each movie
tmp_Movies = data_all[data_all['Rating'].isna()]['User'].reset_index()
tmp_Movies

Unnamed: 0,index,User
0,0,1:
1,548,2:
2,694,3:
3,2707,4:
4,2850,5:
5,3991,6:
6,5011,7:
7,5105,8:
8,20016,9:
9,20112,10:


In [74]:
movie_Indices = [[index, int(movie[:-1])] for index, movie in tmp_Movies.values]
len(movie_Indices)

17770

In [75]:
# Shift the movie_Indices by one to get start and endpoints of all movies
shifted_movieIndices = deque(movie_Indices)
shifted_movieIndices.rotate(-1)
len(shifted_movieIndices)

17770

In [76]:
# Gather all dataframes
user_data = []

# Iterate over all movies
for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_Indices, shifted_movieIndices):

    # Check if it is the last movie in the file
    if df_id_1<df_id_2:
        tmp_df = data_all.iloc[df_id_1+1:df_id_2-1].copy()
    else:
        tmp_df = data_all.iloc[df_id_1+1:].copy()
        
    # Create movie_id column
    tmp_df['Movie'] = movie_id
    
    # Append dataframe to list
    user_data.append(tmp_df)

# Combine all dataframes
final_df = pd.concat(user_data)
del user_data, data_all, tmp_Movies, tmp_df, shifted_movieIndices, movie_Indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape User-Ratings:\t{}'.format(final_df.shape))
final_df.sample(5)

Shape User-Ratings:	(100462738, 4)


Unnamed: 0,User,Rating,Date,Movie
94887199,146765,4.0,2004-02-29,16864
54393936,930931,4.0,2005-06-15,9909
10073316,1332725,3.0,2005-05-31,1962
7312785,2283439,4.0,2003-12-03,1467
95196636,2506564,3.0,2003-12-12,16882


In [77]:
# Get data
data = final_df['Rating'].value_counts().sort_index(ascending=False)

In [78]:
# Create trace
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / final_df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               marker = dict(color = '#db0000'))

In [79]:
# Create layout
layout = dict(title = 'Distribution Of {} Netflix-Ratings'.format(final_df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))

In [80]:
# Create plot
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig)

'temp-plot.html'

Filter Sparse Movies and Users

In [39]:
# Filter sparse movies
min_movie_ratings = 40000
filter_Movies = (final_df['Movie'].value_counts() > min_movie_ratings)
filter_Movies = filter_Movies[filter_Movies].index.tolist()

In [40]:
# Filter sparse users
min_user_ratings = 400
filter_Users = (final_df['User'].value_counts() > min_user_ratings)
filter_Users = filter_Users[filter_Users].index.tolist()

In [41]:
# Actual filtering
filterd_df = final_df[(final_df['Movie'].isin(filter_Movies)) & (final_df['User'].isin(filter_Users))]
del filter_Movies, filter_Users, min_movie_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(final_df.shape))
print('Shape User-Ratings filtered:\t{}'.format(filterd_df.shape))

Shape User-Ratings unfiltered:	(100462738, 4)
Shape User-Ratings filtered:	(23842843, 4)


In [42]:
# Shuffle DataFrame
filterd_df = filterd_df.drop('Date', axis=1).sample(frac=1).reset_index(drop=True)

# Testingsize
n = 100000

# Split train- & testset
train = filterd_df[:-n]
test = filterd_df[-n:]

In [43]:
print("Train data shape", train.shape)
print("Test data shape", test.shape)

Train data shape (23742843, 3)
Test data shape (100000, 3)


In [81]:
matrix_df = train.pivot_table(index='User', columns='Movie', values='Rating')
print('Shape of the User-Movie-Matrix:\t{}'.format(matrix_df.shape))
matrix_df.sample(5)

Shape of the User-Movie-Matrix:	(74258, 646)


Movie,30,175,191,197,241,290,299,312,313,329,...,17431,17441,17479,17482,17508,17560,17622,17627,17709,17764
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2310051,,,3.0,,,,2.0,,4.0,,...,,5.0,2.0,5.0,,,2.0,,,
2626475,,5.0,4.0,3.0,,,5.0,3.0,,5.0,...,,,,,,4.0,,4.0,,5.0
585922,2.0,2.0,5.0,,,5.0,3.0,,1.0,5.0,...,,,,,,3.0,1.0,,,5.0
1507168,,3.0,,,4.0,,3.0,,,,...,5.0,3.0,,3.0,,,,3.0,,1.0
349724,5.0,,,,4.0,,,,,,...,,,,,,,,5.0,4.0,


Cosine User-User Similarity

In [45]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# User index for recommendation
user_index = 0

# Number of similar users for recommendation
no_of_recomm = 100

# Plot top n recommendations
top20_recomm = 20

In [46]:
# Fill in missing values
imputedMatrix_df = matrix_df.T.fillna(matrix_df.mean(axis=1)).T
imputedMatrix_df.head()

Movie,30,175,191,197,241,290,299,312,313,329,...,17431,17441,17479,17482,17508,17560,17622,17627,17709,17764
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000033,4.0,3.376033,4.0,3.376033,3.376033,3.376033,3.376033,3.376033,3.376033,3.376033,...,3.376033,2.0,3.376033,3.0,3.376033,3.0,3.376033,3.0,3.376033,4.0
1000062,3.0,3.306233,3.306233,3.0,5.0,3.306233,3.0,3.306233,4.0,3.306233,...,3.306233,4.0,3.0,3.306233,3.306233,3.0,3.306233,4.0,3.0,3.0
1000079,4.0,3.026764,3.026764,3.0,3.026764,3.026764,3.0,3.026764,2.0,3.026764,...,3.026764,3.0,3.0,3.026764,3.026764,3.026764,1.0,2.0,3.026764,3.0
1000084,3.708185,3.708185,3.708185,4.0,3.708185,3.708185,3.708185,4.0,4.0,3.708185,...,3.708185,3.708185,4.0,3.708185,3.708185,4.0,3.708185,3.708185,3.708185,3.708185
1000095,3.772152,4.0,3.772152,3.772152,3.772152,3.772152,3.772152,3.772152,3.0,5.0,...,3.772152,3.772152,4.0,3.772152,5.0,3.772152,4.0,3.0,3.772152,3.772152


In [47]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity between all users
similarity = cosine_similarity(imputedMatrix_df.values)
similarity

array([[1.        , 0.97396189, 0.96201277, ..., 0.97784986, 0.97761995,
        0.97343282],
       [0.97396189, 1.        , 0.95510403, ..., 0.97490613, 0.97207822,
        0.97053425],
       [0.96201277, 0.95510403, 1.        , ..., 0.95344525, 0.95609527,
        0.95473123],
       ...,
       [0.97784986, 0.97490613, 0.95344525, ..., 1.        , 0.97596711,
        0.97044239],
       [0.97761995, 0.97207822, 0.95609527, ..., 0.97596711, 1.        ,
        0.9715344 ],
       [0.97343282, 0.97053425, 0.95473123, ..., 0.97044239, 0.9715344 ,
        1.        ]])

In [48]:
# Remove self-similarity from similarity-matrix
similarity = similarity - np.eye(similarity.shape[0])
similarity

array([[-2.22044605e-15,  9.73961895e-01,  9.62012772e-01, ...,
         9.77849863e-01,  9.77619947e-01,  9.73432818e-01],
       [ 9.73961895e-01,  4.44089210e-16,  9.55104029e-01, ...,
         9.74906125e-01,  9.72078216e-01,  9.70534246e-01],
       [ 9.62012772e-01,  9.55104029e-01, -1.11022302e-15, ...,
         9.53445250e-01,  9.56095273e-01,  9.54731227e-01],
       ...,
       [ 9.77849863e-01,  9.74906125e-01,  9.53445250e-01, ...,
         1.11022302e-15,  9.75967112e-01,  9.70442395e-01],
       [ 9.77619947e-01,  9.72078216e-01,  9.56095273e-01, ...,
         9.75967112e-01,  1.77635684e-15,  9.71534399e-01],
       [ 9.73432818e-01,  9.70534246e-01,  9.54731227e-01, ...,
         9.70442395e-01,  9.71534399e-01,  2.66453526e-15]])

In [49]:
# Sort similar users by index
similar_user_index = np.argsort(similarity[user_index])[::-1]
similarity

array([[-2.22044605e-15,  9.73961895e-01,  9.62012772e-01, ...,
         9.77849863e-01,  9.77619947e-01,  9.73432818e-01],
       [ 9.73961895e-01,  4.44089210e-16,  9.55104029e-01, ...,
         9.74906125e-01,  9.72078216e-01,  9.70534246e-01],
       [ 9.62012772e-01,  9.55104029e-01, -1.11022302e-15, ...,
         9.53445250e-01,  9.56095273e-01,  9.54731227e-01],
       ...,
       [ 9.77849863e-01,  9.74906125e-01,  9.53445250e-01, ...,
         1.11022302e-15,  9.75967112e-01,  9.70442395e-01],
       [ 9.77619947e-01,  9.72078216e-01,  9.56095273e-01, ...,
         9.75967112e-01,  1.77635684e-15,  9.71534399e-01],
       [ 9.73432818e-01,  9.70534246e-01,  9.54731227e-01, ...,
         9.70442395e-01,  9.71534399e-01,  2.66453526e-15]])

In [50]:
# Sort similar users by score
similar_user_score = np.sort(similarity[user_index])[::-1]

In [51]:
# Get unrated movies
unrated_movies = matrix_df.iloc[user_index][matrix_df.iloc[user_index].isna()].index
unrated_movies

Int64Index([  175,   197,   241,   290,   299,   312,   313,   329,   331,
              357,
            ...
            17328, 17330, 17339, 17387, 17405, 17431, 17479, 17508, 17622,
            17709],
           dtype='int64', name='Movie', length=404)

In [53]:
# Weight ratings of the top n most similar users with their rating and compute the mean for each movie
mean_movie_recommendations = (imputedMatrix_df.iloc[similar_user_index[:no_of_recomm]].T * similar_user_score[:no_of_recomm]).T.mean(axis=0)
mean_movie_recommendations

Movie
30       4.583693
175      4.600706
191      4.635365
197      4.603173
241      4.604229
290      4.600437
299      4.591258
312      4.576748
313      4.598377
329      4.601594
331      4.582925
357      4.564805
406      4.602933
457      4.614590
468      4.571336
482      4.603011
483      4.595428
571      4.617851
607      4.601719
658      4.599034
708      4.571391
758      4.598378
788      4.598802
798      4.593127
886      4.603521
985      4.601174
1046     4.587148
1073     4.608212
1102     4.601406
1110     4.600102
           ...   
16969    4.604131
17031    4.599701
17088    4.607301
17129    4.601873
17149    4.603493
17154    4.591073
17157    4.609388
17169    4.642296
17174    4.588948
17189    4.587183
17215    4.603795
17295    4.578986
17308    4.557260
17324    4.609148
17328    4.604558
17330    4.584835
17339    4.582052
17355    4.609960
17387    4.562371
17405    4.606445
17431    4.588155
17441    4.597498
17479    4.584361
17482    4.599886
1750

In [54]:
# Filter for unrated movies and sort results
best_movie_recommendations = mean_movie_recommendations[unrated_movies].sort_values(ascending=False).to_frame().join(movie_Title)
best_movie_recommendations

Unnamed: 0_level_0,0,Year,Name
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12293,4.632647,1972.0,The Godfather
7057,4.629748,2002.0,Lord of the Rings: The Two Towers: Extended Ed...
13081,4.626568,2004.0,Troy
7230,4.625842,2001.0,The Lord of the Rings: The Fellowship of the R...
5154,4.625030,2005.0,Constantine
3624,4.624302,2003.0,The Last Samurai
3079,4.623961,1994.0,The Lion King: Special Edition
9442,4.623095,2004.0,The Chronicles of Riddick
3864,4.622302,2005.0,Batman Begins
14961,4.621482,2003.0,Lord of the Rings: The Return of the King: Ext...


In [55]:
# Create user-id mapping
user_id_mapping = {id:i for i, id in enumerate(imputedMatrix_df.index)}

In [56]:
prediction = []
# Iterate over all testset items
for user_id in test['User'].unique():
    
    # Sort similar users by index
    similar_user_index = np.argsort(similarity[user_id_mapping[user_id]])[::-1]
    # Sort similar users by score
    similar_user_score = np.sort(similarity[user_id_mapping[user_id]])[::-1]
    
    for movie_id in test[test['User']==user_id]['Movie'].values:
        # Compute predicted score
        score = (imputedMatrix_df.iloc[similar_user_index[:no_of_recomm]][movie_id] * similar_user_score[:no_of_recomm]).values.sum() / similar_user_score[:no_of_recomm].sum()
        prediction.append([user_id, movie_id, score])

In [57]:
# Create prediction DataFrame
df_pred = pd.DataFrame(prediction, columns=['User', 'Movie', 'Prediction']).set_index(['User', 'Movie'])
df_pred = test.set_index(['User', 'Movie']).join(df_pred)
df_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating,Prediction
User,Movie,Unnamed: 2_level_1,Unnamed: 3_level_1
2597758,2391,4.0,4.611707
2113596,7624,5.0,4.380168
2618083,14644,2.0,4.369909
1156513,7233,4.0,4.624447
853918,13050,3.0,4.306955
1976943,10225,5.0,4.568632
476098,6450,5.0,4.446022
1259176,6974,4.0,4.655027
356546,1962,3.0,4.610594
939176,12785,5.0,4.682411


In [58]:
# Get labels and predictions
y_true = df_pred['Rating'].values
y_pred = df_pred['Prediction'].values

In [59]:
# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
rmse

1.3020319411168924

In [64]:
# Create trace
trace = go.Bar(x = best_movie_recommendations.iloc[:top20_recomm, 0],
               text = best_movie_recommendations['Name'],
               textposition = 'outside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, top20_recomm + 1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Recommended Movies For A User Based On Similarity'.format(top20_recomm),
              xaxis = dict(title = 'Recommendation-Rating',
                           range = (4.1, 4.8)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
plot(fig)

'temp-plot.html'