In [2]:
from google.cloud import storage
import os
from io import BytesIO
import pandas as pd

In [3]:
#Connect to GCP bucket 
bucket_name = "jupyter_dataset" 
storage_client = storage.Client()

bucket = storage_client.get_bucket(bucket_name)

In [4]:
blob_rating = storage.blob.Blob("ratings_small.csv",bucket)
blob_meta = storage.blob.Blob("movies_metadata_small.csv",bucket)

In [5]:
# Convert to a pandas dataframe
content_rating = blob_rating.download_as_string()
content_meta = blob_meta.download_as_string()
ratingDF = pd.read_csv(BytesIO(content_rating))
metaDF = pd.read_csv(BytesIO(content_meta))

In [15]:
#slice meta dataset and rename the column
meta_slice = metaDF[['id','popularity','title']].rename(columns={'id':'movieId'})

In [17]:
train = pd.merge(ratingDF, meta_slice, how='left', on='movieId')
train['popularity'].fillna(0, inplace = True)
train['title'].fillna('NA', inplace = True)

In [21]:
train

Unnamed: 0,userId,movieId,rating,timestamp,popularity,title
0,1,1,4.0,964982703,0.000000,
1,1,3,4.0,964981247,2.292110,Shadows in Paradise
2,1,6,4.0,964982224,5.538671,Judgment Night
3,1,47,5.0,964983815,0.000000,
4,1,50,5.0,964982931,0.000000,
...,...,...,...,...,...,...
100836,610,166534,4.0,1493848402,0.000000,
100837,610,168248,5.0,1493850091,0.000000,
100838,610,168250,5.0,1494273047,0.000000,
100839,610,168252,5.0,1493846352,0.000000,


# ALS model

In [22]:
# impport everything we need for the rec sys
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [24]:
# initialize spark session
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()
# create a spark dataframe from a pandas dataframe
df = spark.createDataFrame(train)
df.show()

+------+-------+------+---------+------------------+--------------------+
|userId|movieId|rating|timestamp|        popularity|               title|
+------+-------+------+---------+------------------+--------------------+
|     1|      1|   4.0|964982703|               0.0|                  NA|
|     1|      3|   4.0|964981247|2.2921099999999996| Shadows in Paradise|
|     1|      6|   4.0|964982224|          5.538671|      Judgment Night|
|     1|     47|   5.0|964983815|               0.0|                  NA|
|     1|     50|   5.0|964982931|               0.0|                  NA|
|     1|     70|   3.0|964982400|14.471373000000002| Million Dollar Baby|
|     1|    101|   5.0|964980868|         20.477329|Leon: The Profess...|
|     1|    110|   4.0|964982176| 7.832755000000001|   Three Colors: Red|
|     1|    151|   5.0|964984041|               0.0|                  NA|
|     1|    157|   5.0|964984100|          6.197298|Star Trek III: Th...|
|     1|    163|   5.0|964983650|     

In [25]:
# create testing and training data
(training, test) = df.randomSplit([0.7, 0.3])

In [26]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")
model = als.fit(training)

In [27]:
# generate predictions and evaluations
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.1229522556717215


In [28]:
#recommendations
userRecs = model.recommendForAllUsers(10) # top 10 movie recommendations for each user
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[319, 6.876989],...|
|   463|[[6380, 7.602112]...|
|   496|[[2693, 8.752673]...|
|   148|[[89904, 6.49785]...|
|   540|[[6380, 8.334433]...|
|   392|[[86345, 8.281724...|
|   243|[[179819, 10.7873...|
|    31|[[1866, 12.906917...|
|   516|[[4255, 9.738839]...|
|   580|[[6385, 6.4166355...|
|   251|[[3477, 9.391308]...|
|   451|[[4102, 8.861765]...|
|    85|[[6993, 9.207008]...|
|   137|[[306, 6.0016575]...|
|    65|[[1096, 5.969762]...|
|   458|[[4233, 9.959537]...|
|   481|[[1866, 8.409735]...|
|    53|[[222, 8.25079], ...|
|   255|[[102481, 9.00279...|
|   588|[[3155, 6.839693]...|
+------+--------------------+
only showing top 20 rows



In [35]:
#split the 10 recommendations to single records
import pyspark.sql.functions as functions
userRecs_long = userRecs.withColumn('recommendation', functions.explode('recommendations')).select(['userId', 'recommendation'])
userRecs_long.show()

+------+-------------------+
|userId|     recommendation|
+------+-------------------+
|   471|    [319, 6.876989]|
|   471| [94677, 6.7653666]|
|   471|  [6380, 6.6441865]|
|   471| [86320, 6.3061438]|
|   471|  [86347, 6.252352]|
|   471|   [2693, 6.222385]|
|   471|[148881, 6.1944075]|
|   471|    [2072, 6.13944]|
|   471|  [2990, 6.1089253]|
|   471|  [4291, 6.0106416]|
|   463|   [6380, 7.602112]|
|   463| [69951, 6.4275064]|
|   463| [177593, 6.423801]|
|   463|  [86320, 6.382269]|
|   463|    [954, 6.050317]|
|   463|   [27822, 5.96999]|
|   463| [59900, 5.9162755]|
|   463|   [918, 5.8790655]|
|   463|  [94677, 5.866765]|
|   463|  [1126, 5.8517847]|
+------+-------------------+
only showing top 20 rows



In [38]:
#remain only IDs
def get_index(record):
    return record.userId, record.recommendation[0]
userRecs_id = userRecs_long.rdd.map(get_index).toDF(['userId', 'movieId'])
userRecs_id.show()

+------+-------+
|userId|movieId|
+------+-------+
|   471|    319|
|   471|  94677|
|   471|   6380|
|   471|  86320|
|   471|  86347|
|   471|   2693|
|   471| 148881|
|   471|   2072|
|   471|   2990|
|   471|   4291|
|   463|   6380|
|   463|  69951|
|   463| 177593|
|   463|  86320|
|   463|    954|
|   463|  27822|
|   463|  59900|
|   463|    918|
|   463|  94677|
|   463|   1126|
+------+-------+
only showing top 20 rows



In [39]:
#convert to dataframe and merge the title column
userRecs_result = userRecs_id.toPandas()
movieList = userRecs_result['movieId'].tolist() #convert movie id to list
titleList = [] #create a new list to sotrage title
for i in movieList:    #connecting the movie id and title
    try:
        a = pd.Index(meta_slice['movieId']).get_loc(i)
        titleList.append(meta_slice.iloc[a,2])
    except:
        titleList.append('NA')

userRecs_result['title'] = titleList
userRecs_result

Unnamed: 0,userId,movieId,title
0,471,319,True Romance
1,471,94677,
2,471,6380,
3,471,86320,Twenty Years Later
4,471,86347,
...,...,...,...
6095,89,136341,
6096,89,33090,
6097,89,121781,
6098,89,149350,


In [41]:
#write to csv file
userRecs_result.to_csv('userRecs_result.csv')

# Popularity based Algorithm

In [58]:
#calculate the average popularity of each movie
popRec = meta_slice.sort_values(by="popularity" , ascending=False)[:10]
popRec

Unnamed: 0,movieId,popularity,title
53,680,140.950236,Pulp Fiction
1811,155,123.167259,The Dark Knight
101,78,96.272374,Blade Runner
521,550,63.869599,Fight Club
58,278,51.645403,The Shawshank Redemption
65,13,48.307194,Forrest Gump
990,22,47.326665,Pirates of the Caribbean: The Curse of the Bla...
44,11,42.149697,Star Wars
97,424,41.725123,Schindler's List
129,238,41.109264,The Godfather


# Overlapping

In [52]:
#output the popular based algrism result as a list to calculate the overlapping rate
popList = popRec['movieId'].tolist()
popList

[680, 155, 78, 550, 278, 13, 22, 11, 424, 238]

In [44]:
#output all user ID 
userID = []
for i in train['userId']:
    if i not in userID:
        userID.append(i)

In [50]:
movieUserList

[1254, 2693, 89904, 898, 1260, 6380, 2935, 2936, 25825, 25771]

In [46]:
overlappingRate = []
for i in userID:
    indexList = userRecs_result[userRecs_result.userId==i].index.tolist()  #find the index of one specific userId
    movieUserList = [] 
    for index in indexList:
        movieUserList.append(userRecs_result.iloc[index,1]) #list the recommandation movie ID of one user
    overlappingMovie = set(movieUserList) & set(popList) #find the overlapping elements
    ol_rate = int(len(overlappingMovie)) * 0.1
    overlappingRate.append(ol_rate)
       
overlappingRate

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [48]:
overlappingDF = pd.DataFrame({'userID': userID,'overlapping_rate': overlappingRate})
overlappingDF

Unnamed: 0,userID,overlapping_rate
0,1,0.0
1,2,0.0
2,3,0.0
3,4,0.0
4,5,0.0
...,...,...
605,606,0.0
606,607,0.0
607,608,0.0
608,609,0.0


In [59]:
sum(overlappingRate)

0.0