## GR5291 ALS
### cy2475 Chenghao Yu

In [1]:
# import ALS and Linear Regression models
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [2]:
from pyspark import SparkContext
from pyspark import SQLContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
import pandas as pd
import time

In [4]:
df_train = pd.read_csv("training.csv")
df_test = pd.read_csv("testing.csv")

In [5]:
df_train['set'] = [1]*df_train.shape[0]
df_test['set'] = [0]*df_test.shape[0]

In [6]:
df = pd.concat([df_train, df_test])

In [7]:
df[0:5]

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,artist_name,genre_ids,language,song_length,bd,city,gender,registered_via,registration_init_time,expiration_date,set
0,qXWBbfsZ6923Dq7OvwMP2rtVE3phQqlEKcLp2I5wofw=,JTMsJNPcL2ambkp1Z/MeBEnFEzZEOKetdLPrJxHf7v8=,discover,Local playlist more,local-library,0,BIGBANG,465,31.0,220212,0,1,,4,20161231,20170103,1
1,Qja9LnwXWgTjFG5G1iMm/1J5Be6jhD/LUikBU9e9bWg=,zuyzUP11fbgoVp6IMfWK378GDToe59ZlK5gkf+7aKdg=,my library,My library,song,0,家家 (JiaJia),458,3.0,249835,42,6,male,9,20060324,20180103,1
2,6fGAUf1BJeTefYr5PUdmJVQ80DWsKr/3JRIf2w9MGAk=,TuknjyN6Uqqrm49GZYBfTY4I2fjJwr11nVb18xlgmEE=,my library,Local playlist more,local-playlist,1,周杰倫 (Jay Chou),465,3.0,231444,23,13,female,9,20090815,20180818,1
3,dY4MN8EfoumZIs/0+ugiv/qGkNWIWdnWLIrFpjFYNTw=,reXuGcEWDDCnL0K3Th//3DFG4S1ACSpJMzA+CFipo1g=,my library,Local playlist more,local-playlist,1,周湯豪 (NICKTHEREAL),458,3.0,219324,0,1,,7,20160620,20170919,1
4,kktfViwY41gERgWGXWDnVPl/oNF5fLNigX11zfhVOl0=,wcw0/6Vl8mOPpS0WUysfaKaOv8eAXim4/LAQfyFhgZU=,my library,Local playlist more,local-playlist,1,南拳媽媽,465,3.0,253074,22,9,female,3,20150818,20171217,1


In [8]:
df = df.drop(['source_system_tab','source_screen_name','source_type','artist_name',
              'genre_ids','language','song_length','bd','city','gender','registered_via',
              'registration_init_time','expiration_date'], axis=1)

In [9]:
df[0:5]

Unnamed: 0,msno,song_id,target,set
0,qXWBbfsZ6923Dq7OvwMP2rtVE3phQqlEKcLp2I5wofw=,JTMsJNPcL2ambkp1Z/MeBEnFEzZEOKetdLPrJxHf7v8=,0,1
1,Qja9LnwXWgTjFG5G1iMm/1J5Be6jhD/LUikBU9e9bWg=,zuyzUP11fbgoVp6IMfWK378GDToe59ZlK5gkf+7aKdg=,0,1
2,6fGAUf1BJeTefYr5PUdmJVQ80DWsKr/3JRIf2w9MGAk=,TuknjyN6Uqqrm49GZYBfTY4I2fjJwr11nVb18xlgmEE=,1,1
3,dY4MN8EfoumZIs/0+ugiv/qGkNWIWdnWLIrFpjFYNTw=,reXuGcEWDDCnL0K3Th//3DFG4S1ACSpJMzA+CFipo1g=,1,1
4,kktfViwY41gERgWGXWDnVPl/oNF5fLNigX11zfhVOl0=,wcw0/6Vl8mOPpS0WUysfaKaOv8eAXim4/LAQfyFhgZU=,1,1


In [10]:
df.msno = pd.Categorical(df.msno)
df['msno_int'] = df.msno.cat.codes
df.song_id = pd.Categorical(df.song_id)
df['song_id_int'] = df.song_id.cat.codes

In [11]:
df[0:5]

Unnamed: 0,msno,song_id,target,set,msno_int,song_id_int
0,qXWBbfsZ6923Dq7OvwMP2rtVE3phQqlEKcLp2I5wofw=,JTMsJNPcL2ambkp1Z/MeBEnFEzZEOKetdLPrJxHf7v8=,0,1,26260,118863
1,Qja9LnwXWgTjFG5G1iMm/1J5Be6jhD/LUikBU9e9bWg=,zuyzUP11fbgoVp6IMfWK378GDToe59ZlK5gkf+7aKdg=,0,1,13695,352221
2,6fGAUf1BJeTefYr5PUdmJVQ80DWsKr/3JRIf2w9MGAk=,TuknjyN6Uqqrm49GZYBfTY4I2fjJwr11nVb18xlgmEE=,1,1,4058,175975
3,dY4MN8EfoumZIs/0+ugiv/qGkNWIWdnWLIrFpjFYNTw=,reXuGcEWDDCnL0K3Th//3DFG4S1ACSpJMzA+CFipo1g=,1,1,20009,306854
4,kktfViwY41gERgWGXWDnVPl/oNF5fLNigX11zfhVOl0=,wcw0/6Vl8mOPpS0WUysfaKaOv8eAXim4/LAQfyFhgZU=,1,1,23431,334292


In [12]:
df_id_pair = df.drop(['target','set'], axis=1)

In [13]:
df_id_pair.to_csv('strid_intid.csv', index=False)

In [14]:
df_test_dic = df.loc[df['set']==0]
df_test_dic.to_csv("testing_dic.csv", index=False)

In [15]:
df = df.drop(['msno','song_id'], axis=1)

In [16]:
df[0:5]

Unnamed: 0,target,set,msno_int,song_id_int
0,0,1,26260,118863
1,0,1,13695,352221
2,1,1,4058,175975
3,1,1,20009,306854
4,1,1,23431,334292


In [17]:
df.to_csv('ALS.csv', header=False, index=False)

In [18]:
df_train = df.loc[df['set']==1]
df_test = df.loc[df['set']==0]

In [19]:
df_train = df_train.drop(['set'],axis=1)
df_test = df_test.drop(['set'],axis=1)

In [20]:
df_train.to_csv('ALStrain.csv', header = False)
df_test.to_csv('ALStest.csv', header = False)

In [21]:
# Build a SparkSession; SparkSession provides a single point of entry to interact with underlying Spark functionality
spark = SparkSession\
    .builder\
    .appName("ALSExample")\
    .getOrCreate()

In [22]:
# Load data as RDD, then transform it to DataFrame format
lines = spark.read.text("ALStrain.csv").rdd
parts = lines.map(lambda row: row.value.split(","))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[2]), songId=int(p[3]),
                                     rating=float(p[1])))
training = spark.createDataFrame(ratingsRDD)
# Split data to training part and test part
#(training, test) = ratings.randomSplit([0.8, 0.2])

In [23]:
lines = spark.read.text("ALStest.csv").rdd
parts = lines.map(lambda row: row.value.split(","))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[2]), songId=int(p[3]),
                                     rating=float(p[1])))
test = spark.createDataFrame(ratingsRDD)

In [24]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="songId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [25]:
# Make predictions using the model we just built; Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.48113769113975074


In [26]:
# Generate top 5 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[[307609, 2.41548...|
|   463|[[129164, 1.92623...|
|   471|[[232751, 3.01539...|
|   496|[[217014, 3.86646...|
|   833|[[36054, 2.373713...|
|  1088|[[35059, 2.376929...|
|  1238|[[232751, 3.17090...|
|  1580|[[261633, 1.28104...|
|  1591|[[220835, 2.32352...|
|  1645|[[166735, 1.39530...|
|  1829|[[296436, 1.99897...|
|  1959|[[0, 0.0], [10, 0...|
|  2122|[[262770, 1.02939...|
|  2142|[[266513, 3.44459...|
|  2366|[[91479, 1.260464...|
|  2659|[[296436, 2.77124...|
|  2866|[[210452, 0.69766...|
|  3175|[[338000, 1.98941...|
|  3749|[[275000, 2.97553...|
|  3794|[[46443, 2.314936...|
+------+--------------------+
only showing top 20 rows



In [27]:
rlt = userRecs.toPandas()

In [28]:
rlt.to_csv("result.csv")

In [29]:
RLT = pd.DataFrame(index=range(rlt.shape[0]),columns=range(11))
RLT.columns = ['ID','R1','R2','R3','R4','R5','R6','R7','R8','R9','R10']

In [30]:
RLT.ID = rlt.userId
RLT

Unnamed: 0,ID,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10
0,148,,,,,,,,,,
1,463,,,,,,,,,,
2,471,,,,,,,,,,
3,496,,,,,,,,,,
4,833,,,,,,,,,,
5,1088,,,,,,,,,,
6,1238,,,,,,,,,,
7,1580,,,,,,,,,,
8,1591,,,,,,,,,,
9,1645,,,,,,,,,,


In [36]:
for u in range(rlt.shape[0]):
    for r in range(10):
        RLT.iloc[u,r+1] = rlt.recommendations[u][r][0]
    if u%5000 == 0:
        print(u)
        print('-----')

0
-----
5000
-----
10000
-----
15000
-----
20000
-----
25000
-----
30000
-----


In [37]:
RLT[0:5]

Unnamed: 0,ID,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10
0,148,325552,267525,210615,104988,177195,317184,121058,177310,204064,31291
1,463,182618,252179,271615,307661,203582,66325,7234,249962,179953,346459
2,471,292900,16097,268536,149367,62198,65864,313669,121058,23984,284220
3,496,33506,64728,155720,95672,207528,89433,277054,168440,273325,196620
4,833,34786,142402,313669,220080,267525,95672,16097,24404,123008,122083


In [38]:
RLT.to_csv("result_intID.csv", index = False)

In [39]:
user_id_dic = df_id_pair.drop(['song_id','song_id_int'], axis=1)

In [40]:
song_id_dic = df_id_pair.drop(['msno','msno_int'], axis=1)

In [41]:
song_id_dic[0:10]

Unnamed: 0,song_id,song_id_int
0,JTMsJNPcL2ambkp1Z/MeBEnFEzZEOKetdLPrJxHf7v8=,118863
1,zuyzUP11fbgoVp6IMfWK378GDToe59ZlK5gkf+7aKdg=,352221
2,TuknjyN6Uqqrm49GZYBfTY4I2fjJwr11nVb18xlgmEE=,175975
3,reXuGcEWDDCnL0K3Th//3DFG4S1ACSpJMzA+CFipo1g=,306854
4,wcw0/6Vl8mOPpS0WUysfaKaOv8eAXim4/LAQfyFhgZU=,334292
5,i/9mpmgGKK+l3dF0Nczmz5XAzWQplXVVXZKUypdG96g=,254132
6,rGQta84AYQvw7EfOxt2eioFMBtTJwH5566r1upAMfU8=,304774
7,BAgK0kbiNXGoX0QZzMQSWKizd60+5pjv7SJwRUVRLGk=,73040
8,0rKJxDhI03y0+iAVzugYyrpbRHEGjYIDno8T15FM2fo=,15915
9,RxxelttQ4TmwwuKb0Cksss9z+9PtyBhG3AK+UoQ7r1c=,165328


In [42]:
song_id_dic2 = song_id_dic.drop_duplicates()

In [43]:
user_id_dic2 = user_id_dic.drop_duplicates()

In [44]:
RLT2 = RLT.copy()

In [45]:
for u in range(RLT.shape[0]):
    RLT2.iloc[u,0] =  user_id_dic2.loc[user_id_dic2.msno_int==RLT.iloc[u,0]].iloc[0,0]
    if u%5000 == 0:
        print(u)

0
5000
10000
15000
20000
25000
30000


In [46]:
for u in range(RLT.shape[0]):
    for r in range(10):
        RLT2.iloc[u,r+1] =  song_id_dic2.loc[song_id_dic2.song_id_int==RLT.iloc[u,r+1]].iloc[0,0]
    if u%1000 == 0:
        print(u)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000


In [47]:
RLT2.to_csv("result_oriID.csv", index = False)

In [49]:
rlt = pd.read_csv('result_oriID.csv')
dic_rlt = rlt.set_index('ID')
dic_rlt[1:5]

Unnamed: 0_level_0,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
/1/GPKvOx2lWLskxHQM+mDKxBFv7XEMPP3TOHaZzHWI=,V4N/g5U0zScEMDOG3s5TPNQJakxjvWYEy6cdz3ZJBL4=,hdr1w+xO2EmEyYaS/Fw7fysAN3fFegNGWbw6OqNKJ0c=,lE/JwBjz0xQEWh3Z3VVIs62PqU2aKrptkIZnyFi4BL4=,rnX8FljjbCJaWynXAo+VyyycE67imloDdZMeylgB50k=,YwJQzIskIUiDSi5OKygFZH/lCvbFIjjtANycZRqfD/U=,9wnwXv+niMjMtmZyK9ayUdXT1AkuYxojjnf409NyAZw=,/GmBpmhc/8IGjwByeq+O2Kwv11OFfi1PGOs0ZvUJlZ0=,hE6aKyhjTdMRw1xIXlDLnZ+EOH0I2YkBcsE4V+wcmAA=,Ucj+8u6eHIYFrKnGIIqDCNfGdEVXPeOsUO671MHNZX4=,yrEoWmOyTbHBRz8PXPbg/r8XQEldmiPRcMGDFrwtcyI=
/27S3WYkO3xYembOe/CpR4LJh6GUdoPbfbvMrhHZfn4=,p621zOKRZYOkKMkQSYtqpmHP5pquwYXznsoCXULZwsU=,0tRDQkhnFxvCvQ3X5sqZzBUZ/LsmZP0gEEUCwqnNZhM=,kfIo/ICrUHy16+6OSioD7s/tcMNUeAitoIoHs2PECDI=,P1g8PLKzJWacSmRTr2yFF3Fvyab35r2NzPW/0TVe+lw=,9Auxo08gyygZVQpSknPKIwcNUmewTdZsMTDfLFgQU+k=,9rM1nc3EnClzYjVTOVaX5E9CfhjCS8ZWj6JRIsFQyPk=,stIfIRARkGfkMLplfC+QCc7YWOqau4T+A7pdmRsd4gQ=,Jta/oPni6IVILd9492koJ4/9USeCJQeqAISoF4Jf1Gc=,2JEnFftsK5BP5+e1tm+0y1QQzZ5pxUa18SoHwDU63TA=,nYZ4+/1e/QeJzI0an0UL3Tfum+f4jRY1prvqB6anb2E=
/5Xu7yJZF20khZSpwsqs03K9zmiYqlB9FCtK2QtLdrY=,4++g8du/wzUMEgh1VDI7cO9/UBY3Lb0bkssKJEoET8M=,9eK08t6s420mylV0ZdAOOCNwWS3fPJKRHIAskF3/kRI=,QCt0GMS98rF3az6+2tB0YkZdHlr25BH6M/RQAfEvDv4=,FI/3scaISrLiKqHi99Ug6+iReqQCq4unWgp35gzjtxU=,ZeGdxhwPvGs/2C+Z1LG5bBaIfxy0RYhSLNegGNOoWQA=,E931ZzopKXiqz2L2iV+rZ7HVHlwKA3ibPv1Yxbph6Mc=,mEtCsRukkbHJEd/8pPcWiBmd9E98hpvnQouAven7UWc=,SVsb0PnY5j7ufX5Dow7ODrbESsAmnPEJtdwq88Hv1wY=,lYJzKGHrOBy2En3yIv99/IEvB5dbt7BEe8uAg6fSYQ0=,XeIZEHaojvTt7J0/nQrL5843tFL7Gb5Arur+IOzsbBQ=
/qAla7c1s0x8MzaHjGDKCza0rHSqa7Y+/zcaWQMBpC4=,4CPsbLnLGbx9AqD/zgLjW1TVkoLMWeLaEeD0JFsqSDI=,NmVv+idRPSgEbcRX/Xa74wzyt9yQqTBeWKCCqVauTvk=,stIfIRARkGfkMLplfC+QCc7YWOqau4T+A7pdmRsd4gQ=,bubGTMT3XeT7iWr7iSDnsQqBLpTfvw4R0MT6S8J+9JI=,kSmKBbsz9aGcYQHBdo/Y1VUwgwrSF/BydcRwTSFksoA=,FI/3scaISrLiKqHi99Ug6+iReqQCq4unWgp35gzjtxU=,0tRDQkhnFxvCvQ3X5sqZzBUZ/LsmZP0gEEUCwqnNZhM=,2OHybmTM1UyiDDE1NlIzavupurXNV7ERV3fKARnGgYU=,KDvJlHjpb7SdNZL8z80bGaYA4GVdfY8pMHfgZI6gokY=,K3L1EpapSxiBAwDLMIV5jV+mH9dffNVUoWXNXwgF3O0=
