In [1]:
import pandas as pd
# import SVD from surprise
from surprise import SVD

# # import dataset from surprise
from surprise import Dataset
from surprise import Reader


# import accuracy from surprise
from surprise import accuracy

# import train_test_split from surprise.model_selection
from surprise.model_selection import train_test_split
# import GridSearchCV from surprise.model_selection
from surprise.model_selection import GridSearchCV
# import cross_validate from surprise.model_selection
from surprise.model_selection import cross_validate

In [2]:
import numpy as np
from scipy.sparse import csr_matrix

In [3]:
df = pd.read_csv('netflix_data/combined_data_1.txt', header=None, names=['cID', 'rating'], usecols=[0,1])

In [4]:
df.dtypes

cID        object
rating    float64
dtype: object

In [5]:
df.head()

Unnamed: 0,cID,rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [6]:
print(df.iloc[::5000000, :])

              cID  rating
0              1:     NaN
5000000   2560324     4.0
10000000  2271935     2.0
15000000  1921803     2.0
20000000  1933327     3.0


In [7]:
df.iloc[694]

cID        3:
rating    NaN
Name: 694, dtype: object

In [8]:
## adding movie ID - create an umpy array with correct length... 
df_nan = pd.DataFrame(pd.isnull(df.rating)) # where it is missing - is the movieID
df_nan = df_nan[df_nan['rating'] == True] # place holder of true... 
df_nan = df_nan.reset_index()

In [9]:
df_nan.head() # index columns are spots where rating was missing - indicating movie ID

Unnamed: 0,index,rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True


In [10]:
# turns it into a numpy array - otherwise will run out of memory
movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]): # creates a tuple from the index 
    # numpy approach
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1

# Account for last record and corresponding length
# numpy approach
last_record = np.full((1,len(df) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

print('Movie numpy: {}'.format(movie_np)) #stored movie ID as a numpy array
print('Length: {}'.format(len(movie_np)))

Movie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]
Length: 24053764


In [11]:
# remove the movie ID rows now from the original DF
df = df[pd.notnull(df['rating'])]

df['Movie_Id'] = movie_np.astype(int) # from numpy to column
df['cID'] = df['cID'].astype(int)
print('-Dataset examples-')
print(df.iloc[::5000000, :])

-Dataset examples-
              cID  rating  Movie_Id
1         1488844     3.0         1
5000996    501954     2.0       996
10001962   404654     5.0      1962
15002876   886608     2.0      2876
20003825  1193835     2.0      3825


In [12]:
# loading the movie mapping to get movie titles
df_title = pd.read_csv('netflix_data/movie_titles.csv', encoding = "ISO-8859-1", header = None, names = ['Movie_Id', 'Year', 'Name'], usecols=[0,1,2])
df_title.set_index('Movie_Id', inplace = True)
print(df_title.head(10))

            Year                          Name
Movie_Id                                      
1         2003.0               Dinosaur Planet
2         2004.0    Isle of Man TT 2004 Review
3         1997.0                     Character
4         1994.0  Paula Abdul's Get Up & Dance
5         2004.0      The Rise and Fall of ECW
6         1997.0                          Sick
7         1992.0                         8 Man
8         2004.0    What the #$*! Do We Know!?
9         1991.0      Class of Nuke 'Em High 2
10        2001.0                       Fighter


In [13]:
# take only movies and customers with a lot of ratings and reviews
f = ['count','mean']

df_movie_summary = df.groupby('Movie_Id')['rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

print('Movie minimum times of review: {}'.format(movie_benchmark))

df_cust_summary = df.groupby('cID')['rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

print('Customer minimum times of review: {}'.format(cust_benchmark))

Movie minimum times of review: 1799.0
Customer minimum times of review: 52.0


In [14]:
# trimming the data 
print('Original Shape: {}'.format(df.shape))
df = df[~df['Movie_Id'].isin(drop_movie_list)]
df = df[~df['cID'].isin(drop_cust_list)]
print('After Trim Shape: {}'.format(df.shape))
print('-Data Examples-')
print(df.iloc[::5000000, :])

Original Shape: (24053764, 3)
After Trim Shape: (17337458, 3)
-Data Examples-
              cID  rating  Movie_Id
696        712664     5.0         3
6932490   1299309     5.0      1384
13860273   400155     3.0      2660
20766530   466962     4.0      3923


In [15]:
# convert to matrix
df_p = pd.pivot_table(df,values='rating',index='cID',columns='Movie_Id')

print(df_p.shape)

(143458, 1350)


In [22]:
df2 = df.sample(100000)

In [23]:
reader = Reader()
data = Dataset.load_from_df(df2[['cID','Movie_Id','rating']][:], reader)

In [24]:
# trainset, testset = train_test_split(data, test_size=.15)

In [25]:
base_svd = SVD()

In [26]:
cross_validate(base_svd, data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.99345483, 0.99795282, 1.00368603, 0.99744051, 0.99668949]),
 'test_mae': array([0.7998508 , 0.80265014, 0.80678863, 0.80226874, 0.8003296 ]),
 'fit_time': (2.8854610919952393,
  2.8722550868988037,
  2.997709274291992,
  2.9148950576782227,
  2.9121739864349365),
 'test_time': (0.05809187889099121,
  2.378498077392578,
  0.05705380439758301,
  0.05543780326843262,
  0.05572199821472168)}

In [27]:
df2.head()

Unnamed: 0,cID,rating,Movie_Id
21487110,493976,4.0,4043
2538600,211815,4.0,468
1022333,1504131,4.0,241
14109140,1310374,4.0,2724
22307403,262595,4.0,4227


In [38]:
df2.sort_values(by='rating', ascending=False)

Unnamed: 0,cID,rating,Movie_Id
7742574,255278,5.0,1553
3305802,1132838,5.0,607
14013028,521738,5.0,2690
9839406,950817,5.0,1905
12361833,1731609,5.0,2376
...,...,...,...
19820251,374220,1.0,3782
4688563,685718,1.0,919
20488749,316155,1.0,3890
18081930,896745,1.0,3434


In [46]:
df_user = df2[(df2['cID'] == 1664010) & (df2['rating'] == 5)]
df_user = df_user.set_index('Movie_Id')
df_user = df_user.join(df_title)['Name']

In [47]:
df_user

Movie_Id
4405             In Living Color: Season 3
4149      Scooby-Doo 2: Monsters Unleashed
3113                          Dante's Peak
2181                  A Better Tomorrow II
3148                   The English Patient
4067                         Silver Streak
2122                  Being John Malkovich
3181    Batman Beyond: Return of the Joker
1043                    Outrageous Fortune
1035                            Disclosure
255         The Hunchback of Notre Dame II
3728          So I Married an Axe Murderer
Name: Name, dtype: object

In [49]:
# to recommend .. 
user_1664010 = df_title.copy()
user_1664010 = user_1664010.reset_index()
user_1664010 = user_1664010[~user_1664010['Movie_Id'].isin(drop_movie_list)]

# get the full dataset 
data = Dataset.load_from_df(df2[['cID', 'Movie_Id','rating']], reader)

trainset = data.build_full_trainset()

base_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x5068f2d30>

In [52]:
user_1664010['estimate_score'] = user_1664010['Movie_Id'].apply(lambda x: base_svd.predict(1664010, x).est)

In [54]:
user_1664010 = user_1664010.sort_values('estimate_score', ascending=False)
user_1664010.head(10)

Unnamed: 0,Movie_Id,Year,Name,estimate_score
751,752,1993.0,Star Trek: The Next Generation: Season 7,5.0
831,832,2003.0,Tupac: Resurrection,5.0
3149,3150,2000.0,Monty Python: The Life of Python,5.0
3078,3079,1994.0,The Lion King: Special Edition,5.0
3077,3078,1994.0,The Best of Friends: Season 2,5.0
721,722,2003.0,The Wire: Season 1,5.0
3045,3046,1990.0,The Simpsons: Treehouse of Horror,5.0
1624,1625,1986.0,Aliens: Collector's Edition,5.0
2943,2944,1994.0,Hoop Dreams,5.0
2941,2942,1999.0,Friends: Season 6,5.0
