In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV, cross_validate
from surprise import Reader, Dataset
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
import pickle


In [179]:
a = pd.read_csv('data_big/genome-scores.csv')
b = pd.read_csv('data_big/genome-tags.csv')
c = pd.read_csv('data_big/links.csv')
d = pd.read_csv('data_big/movies.csv')
e = pd.read_csv('data_big/ratings.csv')
f = pd.read_csv('data_big/tags.csv')

In [180]:
a.head(1), b.head(1), c.head(1), d.head(1), e.head(1), f.head(1)

(   movieId  tagId  relevance
 0        1      1      0.029,
    tagId  tag
 0      1  007,
    movieId  imdbId  tmdbId
 0        1  114709   862.0,
    movieId             title                                       genres
 0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy,
    userId  movieId  rating   timestamp
 0       1      307     3.5  1256677221,
    userId  movieId   tag   timestamp
 0      14      110  epic  1443148538)

In [62]:
ratings = pd.read_csv('data_movie/ratings.csv')
movies = pd.read_csv('data_movie/movies.csv')


In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### SVD

In [14]:
a = csc_matrix([[1,0,0],[5,0,2],[0,1,0],[0,0,3],[4,0,9]], dtype=float)
u, s, vt = svds(a, k=2)
a.toarray(), u, s, vt

(array([[1., 0., 0.],
        [5., 0., 2.],
        [0., 1., 0.],
        [0., 0., 3.],
        [4., 0., 9.]]),
 array([[-2.21829477e-01,  4.58445949e-02],
        [-8.50288016e-01,  3.86369035e-01],
        [-2.14780401e-19, -2.00071339e-20],
        [ 3.88289052e-01,  2.35719092e-01],
        [ 2.77549248e-01,  8.90535654e-01]]),
 array([ 3.89366418, 10.99269663]),
 array([[-8.63729488e-01, -8.36282754e-19,  5.03955724e-01],
        [ 5.03955724e-01, -2.19932353e-19,  8.63729488e-01]]))

In [15]:
u@((np.diag(s))@vt)

array([[ 1.00000000e+00,  6.11485906e-19, -5.96967461e-17],
       [ 5.00000000e+00,  1.83460645e-18,  2.00000000e+00],
       [ 6.11485906e-19,  7.47739084e-37, -6.11411539e-19],
       [-4.63836131e-17, -1.83423462e-18,  3.00000000e+00],
       [ 4.00000000e+00, -3.05676023e-18,  9.00000000e+00]])

In [16]:
print('Rounded Approximation of Ratings Matrix')
np.round(u@((np.diag(s))@vt))

Rounded Approximation of Ratings Matrix


array([[ 1.,  0., -0.],
       [ 5.,  0.,  2.],
       [ 0.,  0., -0.],
       [-0., -0.,  3.],
       [ 4., -0.,  9.]])

# Surprise

In [4]:
jokes = Dataset.load_builtin(name='jester')

In [18]:
train, test = train_test_split(jokes, test_size=.2)

In [19]:
type(jokes), type(train), type(test)

(surprise.dataset.DatasetAutoFolds, surprise.trainset.Trainset, list)

In [20]:
len(test), test[0]

(352288, ('36491', '8', 4.625))

## Memory Based

In [21]:
print('Number of users: ', train.n_users, '\n')
print('Number of items: ', train.n_items, '\n')

Number of users:  58769 

Number of items:  140 



In [22]:
sim_cos = {'name':'cosine', 'user_based':False} # false because a lot more items than users

In [23]:
basic = knns.KNNBasic(sim_options=sim_cos)
basic.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fd202f27430>

In [24]:
basic.sim

array([[1.        , 0.19269271, 0.16906683, ..., 0.11536429, 0.04969122,
        0.14033756],
       [0.19269271, 1.        , 0.73101644, ..., 0.48633554, 0.39095748,
        0.27832863],
       [0.16906683, 0.73101644, 1.        , ..., 0.42496115, 0.55837548,
        0.36203939],
       ...,
       [0.11536429, 0.48633554, 0.42496115, ..., 1.        , 0.37252916,
        0.3770817 ],
       [0.04969122, 0.39095748, 0.55837548, ..., 0.37252916, 1.        ,
        0.46486107],
       [0.14033756, 0.27832863, 0.36203939, ..., 0.3770817 , 0.46486107,
        1.        ]])

In [25]:
predictions = basic.test(test)

In [26]:
print(accuracy.rmse(predictions))

RMSE: 4.2212
4.221218206839122


In [27]:
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(train)
predictions = basic_pearson.test(test)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 4.2806
4.280580865951402


In [28]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(train)
predictions = knn_means.test(test)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 4.1445
4.144453238255045


In [29]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(train)
predictions = knn_baseline.test(test)
print(accuracy.rmse(predictions))

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 4.1402
4.1401779372662375


## Model Based

In [30]:
'''param_grid = {'n_factors':[20, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(jokes)'''

"param_grid = {'n_factors':[20, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],\n              'reg_all': [0.4, 0.6]}\ngs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)\ngs_model.fit(jokes)"

In [31]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(train)
predictions = svd.test(test)
print(accuracy.rmse(predictions))

RMSE: 4.3009
4.300923277017237


# Prediction

In [32]:
user_34_prediction = svd.predict('34', '25')
user_34_prediction

Prediction(uid='34', iid='25', r_ui=None, est=2.913630589764392, details={'was_impossible': False})

In [33]:
# get the prediction from tuple
user_34_prediction[3]

2.913630589764392

# --------

In [34]:
ratings.drop('timestamp', axis=1, inplace=True)
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [35]:
reader = Reader()
ratings_surprise = Dataset.load_from_df(ratings, reader)
ratings_surprise

<surprise.dataset.DatasetAutoFolds at 0x7fd202f27220>

In [36]:
dataset = ratings_surprise.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


In [37]:
#SVD
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(ratings_surprise)

In [38]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.8686126846318561, 'mae': 0.6677102738484544}
{'rmse': {'n_factors': 50, 'reg_all': 0.05}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [39]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, ratings_surprise, n_jobs=-1)

In [40]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([0.97549083, 0.97784204, 0.98068767, 0.96629701, 0.97447788]))
('test_mae', array([0.7527836 , 0.7552543 , 0.75716101, 0.7466849 , 0.75139268]))
('fit_time', (0.5259459018707275, 0.5449931621551514, 0.55328369140625, 0.5318360328674316, 0.5032486915588379))
('test_time', (1.229274034500122, 1.1912047863006592, 1.21421480178833, 1.177954912185669, 1.1448559761047363))
-----------------------
0.9749590863421554


In [41]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,ratings_surprise)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [42]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([0.877887  , 0.88163464, 0.87221742, 0.87345024, 0.87857156]))
('test_mae', array([0.67070626, 0.674071  , 0.66546405, 0.66893945, 0.66886303]))
('fit_time', (0.6157629489898682, 0.5989048480987549, 0.5886871814727783, 0.5864698886871338, 0.6548950672149658))
('test_time', (1.4477810859680176, 1.8756911754608154, 1.4557979106903076, 1.5077109336853027, 1.4154348373413086))


0.8767521745326015

### RECOMMENDATIONS

In [43]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [47]:
svd = SVD(n_factors= 50, reg_all=0.05)
svd.fit(dataset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd1b837abe0>

In [48]:
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=3.076510920472636, details={'was_impossible': False})

In [49]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list

In [54]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [58]:
r_55245 = input('How would you rate movie 55245')
r_2491 = input('How would you rate movie 2491')
r_4718 = input('How would you rate movie 4718')
r_5990 = input('How would you rate movie 5990')
ratings = {'r_55245':r_55245, 'r_2491':r_2491, 'r_4718':r_4718, 'r_5990':r_5990}
new_user_ratings = []
for i in ratings.keys():
    new_user_rating = {}
    user = 1000 #a new id
    movie = int(i[2:])
    rate = ratings[i]
    new_user_rating['userId']=user
    new_user_rating['movieId']=movie
    new_user_rating['rating']=rate
    new_user_ratings.append(new_user_rating)


In [59]:
new_user_ratings

[{'userId': 1000, 'movieId': 55245, 'rating': '2'},
 {'userId': 1000, 'movieId': 2491, 'rating': '3'},
 {'userId': 1000, 'movieId': 4718, 'rating': '4'},
 {'userId': 1000, 'movieId': 5990, 'rating': '1'}]

In [65]:
ratings.rating.describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [None]:
def recommended_movies(new_user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break
            
recommended_movies(ranked_movies,df_movies,5)

# Pyspark

In [116]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [71]:
spark = SparkSession\
        .builder\
        .appName('ALSExample').config('spark.driver.host', 'localhost')\
        .getOrCreate()

In [72]:
movie_ratings = spark.read.csv('./data_movie/ratings.csv', header='true', inferSchema='true')

In [76]:
movie_ratings

DataFrame[userId: int, movieId: int, rating: double, timestamp: int]

In [77]:
movie_ratings.dtypes


[('userId', 'int'),
 ('movieId', 'int'),
 ('rating', 'double'),
 ('timestamp', 'int')]

In [79]:
movie_ratings = movie_ratings.drop('timestamp')


In [81]:
(training, test) = movie_ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5,rank=4, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating',
          coldStartStrategy='drop')

# fit the ALS model to the training set
model = als.fit(training)

In [82]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))

Root-mean-square error = 0.9949730930260284


In [84]:
# initialize the ALS model
als_model = ALS(userCol='userId', itemCol='movieId', 
                ratingCol='rating', coldStartStrategy='drop')

# create the parameter grid                 
params = ParamGridBuilder()\
          .addGrid(als_model.regParam, [0.01, 0.001, 0.1])\
          .addGrid(als_model.rank, [4, 10, 50]).build()


# instantiating crossvalidator estimator
cv = CrossValidator(estimator=als_model, estimatorParamMaps=params,\
                    evaluator=evaluator,parallelism=4)
                    
best_model = cv.fit(movie_ratings)    

# We see the best model has a rank of 50, so we will use that in our future models with this dataset
best_model.bestModel.rank

2022-03-22 17:00:49.091 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:00:49.388 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:00:50.906 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:00:57.632 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:00:58.962 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:01:01.455 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:01:01.809 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:01:04.969 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:01:07.580 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:01:17.439 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:01:17.680 INFO    py4j.clientserver: Closing down clientserver connection
2022-03-22 17:01:17.835 INFO    

50

In [99]:
predictions = best_model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))

Root-mean-square error = 0.4926263276001733


In [87]:
movie_titles = spark.read.csv('./data_movie/movies.csv',header='true',inferSchema='true')

movie_titles.head(5)

[Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(movieId=2, title='Jumanji (1995)', genres='Adventure|Children|Fantasy'),
 Row(movieId=3, title='Grumpier Old Men (1995)', genres='Comedy|Romance'),
 Row(movieId=4, title='Waiting to Exhale (1995)', genres='Comedy|Drama|Romance'),
 Row(movieId=5, title='Father of the Bride Part II (1995)', genres='Comedy')]

In [88]:
def name_retriever(movie_id, movie_title_df):
    return movie_title_df.where(movie_title_df.movieId == movie_id).take(1)[0]['title']

In [89]:
print(name_retriever(1023, movie_titles))


Winnie the Pooh and the Blustery Day (1968)


In [90]:
users = movie_ratings.select(als.getUserCol()).distinct().limit(1)
userSubsetRecs = model.recommendForUserSubset(users, 10)
recs = userSubsetRecs.take(1)



In [91]:
# use indexing to obtain the movie id of top predicted rated item
first_recommendation = recs[0]['recommendations'][0][0]

# use the name retriever function to get the values
name_retriever(first_recommendation,movie_titles)

'Easy Money (1983)'

In [93]:
recommendations = model.recommendForAllUsers(5)
recommendations.where(recommendations.userId == 3).collect()

[Row(userId=3, recommendations=[Row(movieId=158783, rating=8.596968650817871), Row(movieId=7164, rating=8.580899238586426), Row(movieId=89118, rating=8.460037231445312), Row(movieId=72171, rating=7.737977027893066), Row(movieId=5666, rating=7.695328712463379)])]

In [173]:
def new_user_recs(user_id, new_ratings, rating_df, movie_title_df, num_recs):
    # turn the new_recommendations list into a spark DataFrame
    new_user_ratings = spark.createDataFrame(new_ratings,rating_df.columns)
    
    # combine the new ratings df with the rating_df
    movie_ratings_combined = rating_df.union(new_user_ratings)
    
    # split the dataframe into a train and test set
#     (training, test) = movie_ratings_combined.randomSplit([0.8, 0.2],seed=0)
    
    # create an ALS model and fit it
    als = ALS(maxIter=5,rank=50, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
    model = als.fit(movie_ratings_combined)
    
    # make recommendations for all users using the recommendForAllUsers method
    recommendations = model.recommendForAllUsers(num_recs)
    
    # get recommendations specifically for the new user that has been added to the DataFrame
    recs_for_user = recommendations.where(recommendations.userId == user_id).take(1)
    
    user_recommendations = []
    for ranking, (movie_id, rating) in enumerate(recs_for_user[0]['recommendations']):
        movie_string = name_retriever(movie_id,movie_title_df)
        user_recommendations.append(movie_string)
        print('Recommendation {}: {}  | predicted score :{}'.format(ranking+1,movie_string,rating))
    return user_recommendations

In [122]:
user_id = 100000
user_ratings_1 = [(user_id,3253,5),
                  (user_id,2459,5),
                  (user_id,2513,4),
                  (user_id,6502,5),
                  (user_id,441,4)]
new_user_recs(user_id,
             new_ratings=user_ratings_1,
             rating_df=movie_ratings,
             movie_title_df=movie_titles,
             num_recs = 10)

Recommendation 1: Shawshank Redemption, The (1994)  | predicted score :5.785210609436035
Recommendation 2: Princess Bride, The (1987)  | predicted score :5.694006443023682
Recommendation 3: Silence of the Lambs, The (1991)  | predicted score :5.684373378753662
Recommendation 4: Star Wars: Episode V - The Empire Strikes Back (1980)  | predicted score :5.634759902954102
Recommendation 5: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)  | predicted score :5.618373870849609
Recommendation 6: Office Space (1999)  | predicted score :5.606531620025635
Recommendation 7: Shrek (2001)  | predicted score :5.59533166885376
Recommendation 8: Monty Python and the Holy Grail (1975)  | predicted score :5.593297004699707
Recommendation 9: Lord of the Rings: The Fellowship of the Ring, The (2001)  | predicted score :5.574115753173828
Recommendation 10: Lord of the Rings: The Two Towers, The (2002)  | predicted score :5.570826053619385


In [174]:
pickle_out = open('pickle/new_user_recs.pickle','wb')
pickle.dump(new_user_recs, pickle_out)
pickle_out.close()

print_in = open('pickle/new_user_recs.pickle','rb')
new_user_recs2 = pickle.load(print_in)
print_in.close()

In [123]:
movie_ratings.write.format('parquet').mode("overwrite").save("movie_ratings")

movie_ratings2=spark.read.format('parquet').load('movie_ratings')

movie_titles.write.format('parquet').mode("overwrite").save("movie_titles")

movie_titles2=spark.read.format('parquet').load('movie_titles')


In [175]:
new_user_recs2(user_id,
             new_ratings=user_ratings_1,
             rating_df=movie_ratings2,
             movie_title_df=movie_titles2,
             num_recs = 20)



Recommendation 1: Shawshank Redemption, The (1994)  | predicted score :5.785210609436035
Recommendation 2: Princess Bride, The (1987)  | predicted score :5.694006443023682
Recommendation 3: Silence of the Lambs, The (1991)  | predicted score :5.684373378753662
Recommendation 4: Star Wars: Episode V - The Empire Strikes Back (1980)  | predicted score :5.634759902954102
Recommendation 5: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)  | predicted score :5.618373870849609
Recommendation 6: Office Space (1999)  | predicted score :5.606531620025635
Recommendation 7: Shrek (2001)  | predicted score :5.59533166885376
Recommendation 8: Monty Python and the Holy Grail (1975)  | predicted score :5.593297004699707
Recommendation 9: Lord of the Rings: The Fellowship of the Ring, The (2001)  | predicted score :5.574115753173828
Recommendation 10: Lord of the Rings: The Two Towers, The (2002)  | predicted score :5.570826053619385
Recommendation 11: Wallace & Gromit: A Cl

['Shawshank Redemption, The (1994)',
 'Princess Bride, The (1987)',
 'Silence of the Lambs, The (1991)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
 'Office Space (1999)',
 'Shrek (2001)',
 'Monty Python and the Holy Grail (1975)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Wallace & Gromit: A Close Shave (1995)',
 'Matrix, The (1999)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Back to the Future (1985)',
 'Memento (2000)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
 'Finding Nemo (2003)',
 'Roman Holiday (1953)',
 'Forrest Gump (1994)']

In [149]:
most_reviewed_movies = ratings.groupby('movieId')['timestamp'].count().sort_values(ascending=False)[:50].index.tolist()

In [162]:
most_reviewed_movies_dict = {}
for i in range(len(most_reviewed_movies)):
    most_reviewed_movies_dict[most_reviewed_movies[i]] = name_retriever(most_reviewed_movies[i],movie_titles)
most_reviewed_movies_dict

{356: 'Forrest Gump (1994)',
 318: 'Shawshank Redemption, The (1994)',
 296: 'Pulp Fiction (1994)',
 593: 'Silence of the Lambs, The (1991)',
 2571: 'Matrix, The (1999)',
 260: 'Star Wars: Episode IV - A New Hope (1977)',
 480: 'Jurassic Park (1993)',
 110: 'Braveheart (1995)',
 589: 'Terminator 2: Judgment Day (1991)',
 527: "Schindler's List (1993)",
 2959: 'Fight Club (1999)',
 1: 'Toy Story (1995)',
 1196: 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 50: 'Usual Suspects, The (1995)',
 2858: 'American Beauty (1999)',
 47: 'Seven (a.k.a. Se7en) (1995)',
 780: 'Independence Day (a.k.a. ID4) (1996)',
 150: 'Apollo 13 (1995)',
 1198: 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 4993: 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 1210: 'Star Wars: Episode VI - Return of the Jedi (1983)',
 858: 'Godfather, The (1972)',
 457: 'Fugitive, The (1993)',
 592: 'Batman (1989)',
 5952: 'Lord of the Rings: The Two Towers, The (2002

In [165]:
pickle_out = open('pickle/most_reviewed_movies_dict.pickle','wb')
pickle.dump(most_reviewed_movies_dict, pickle_out)
pickle_out.close()

In [188]:
movie_ratings.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [1]:
test

NameError: name 'test' is not defined