## Construire un moteur de recommandation avec SPARK

Utiliser les notes globales des clients attribuées aux restaurants pour générez des prédictions pour d'autres restaurants et recommander ainsi des restaurtants à des clients.

- Importation des librairies utiles

In [4]:
from pyspark.sql.types import *
from pyspark.sql import functions as F
import pandas as pd
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IndexToString

#### Importation des données depuis le compte de stokage

In [6]:
#spark.conf.set(
# storage aaccount
# password
#               )

In [7]:
datasets = {
  dataset: spark.read.load( 
    "wasbs://default@storagestudent.blob.core.windows.net/datasets/S8-5/Exo/restaurant-data-with-consumer-ratings/{0}.csv".format(dataset), 
    format="csv",
    header="true"
                           )
  for dataset in ["chefmozaccepts", "chefmozcuisine", "chefmozhours4", "chefmozparking", "geoplaces2", "rating_final", "usercuisine", "userpayment", "userprofile"]
            }

In [8]:
print(datasets.keys())

#### Contenu des datasets

In [10]:
userprofile = datasets['userprofile']
display(userprofile.head(5))

userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height
U1001,22.139997,-100.978803,False,abstemious,informal,family,on foot,single,independent,1989,variety,thrifty-protector,none,student,black,69,medium,1.77
U1002,22.150087,-100.983325,False,abstemious,informal,family,public,single,independent,1990,technology,hunter-ostentatious,Catholic,student,red,40,low,1.87
U1003,22.119847,-100.946527,False,social drinker,formal,family,public,single,independent,1989,none,hard-worker,Catholic,student,blue,60,low,1.69
U1004,18.867,-99.183,False,abstemious,informal,family,public,single,independent,1940,variety,hard-worker,none,professional,green,44,medium,1.53
U1005,22.183477,-100.959891,False,abstemious,no preference,family,public,single,independent,1992,none,thrifty-protector,Catholic,student,black,65,medium,1.69


In [11]:
userpayment = datasets['userpayment']
display(userpayment.head(5))

userID,Upayment
U1001,cash
U1002,cash
U1003,cash
U1004,cash
U1004,bank_debit_cards


In [12]:
usercuisine = datasets['usercuisine']
display(usercuisine.head(5))

userID,Rcuisine
U1001,American
U1002,Mexican
U1003,Mexican
U1004,Bakery
U1004,Breakfast-Brunch


In [13]:
rating = datasets['rating_final']
display(rating.head(5))

userID,placeID,rating,food_rating,service_rating
U1077,135085,2,2,2
U1077,135038,2,2,1
U1077,132825,2,2,2
U1077,135060,1,2,2
U1068,135104,1,1,2


In [14]:
location = datasets['geoplaces2']
display(location.head(5))

placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services
134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC464A41,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,?,No_Alcohol_Served,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
132825,22.1473922,-100.983092,0101000020957F00001AD016568C4858C1243261274BA54B41,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,78280,No_Alcohol_Served,none,informal,completely,low,?,familiar,f,open,none
135106,22.1497088,-100.9760928,0101000020957F0000649D6F21634858C119AE9BF528A34B41,El Rinc�n de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,78000,Wine-Beer,only at bar,informal,partially,medium,?,familiar,f,open,none
132667,23.7526973,-99.1633594,0101000020957F00005D67BCDDED8157C1222A2DC8D84D4941,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,?,No_Alcohol_Served,none,informal,completely,low,?,familiar,t,closed,none
132613,23.7529035,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E4941,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,?,No_Alcohol_Served,permitted,informal,completely,medium,?,familiar,t,closed,none


In [15]:
accepted_payment = datasets['chefmozaccepts']
display(accepted_payment.head(5))

placeID,Rpayment
135110,cash
135110,VISA
135110,MasterCard-Eurocard
135110,American_Express
135110,bank_debit_cards


In [16]:
hour = datasets['chefmozhours4']
display(hour.head(5))

placeID,hours,days
135111,00:00-23:30;,Mon;Tue;Wed;Thu;Fri;
135111,00:00-23:30;,Sat;
135111,00:00-23:30;,Sun;
135110,08:00-19:00;,Mon;Tue;Wed;Thu;Fri;
135110,00:00-00:00;,Sat;


In [17]:
chefcuisine = datasets['chefmozcuisine']
display(chefcuisine.head(5))

placeID,Rcuisine
135110,Spanish
135109,Italian
135107,Latin_American
135106,Mexican
135105,Fast_Food


In [18]:
parking = datasets['chefmozparking']
display(parking.head(5))

placeID,parking_lot
135111,public
135110,none
135109,none
135108,none
135107,none


### Prospects

- Nombre de userID dans 'usercuisine' par ordre décroissant

In [21]:
display(usercuisine.head(5))

userID,Rcuisine
U1001,American
U1002,Mexican
U1003,Mexican
U1004,Bakery
U1004,Breakfast-Brunch


In [22]:
#display(usercuisine.groupBy("userID").count().orderBy(F.desc("count")))
display(usercuisine.groupBy('userID').agg(F.count('userID').alias('count')).orderBy(-F.col('count')).head(5))

userID,count
U1135,103
U1108,18
U1101,15
U1016,14
U1060,13


- Nombre de placeID dans le dataset 'chefmozcuisine' par ordre décroissant

In [24]:
display(chefcuisine.head(5))

placeID,Rcuisine
135110,Spanish
135109,Italian
135107,Latin_American
135106,Mexican
135105,Fast_Food


In [25]:
#display(chefcuisine.groupBy("placeID").count().orderBy(F.desc("count")))
display(chefcuisine.groupBy('placeID').agg(F.count('placeID').alias('count')).orderBy(-F.col('count')).head(5))

placeID,count
132774,9
135099,6
135097,6
135103,4
135098,4


- Créer un dataset contenant pour chaque placeID, la liste des userID dans une colonne et le nombre de userID dans une autre colonne
- Ajouter la colonne RCuisine pour connaître le type de cuisine de chaque placeID
- Ordonner le dataset par userID de façon decroissante

In [27]:
prospects = (chefcuisine.join(usercuisine, 'Rcuisine', 'inner')
            .groupBy('placeID', 'Rcuisine')
            .agg(F.collect_set('userID').alias('users'))
            .withColumn('count_users', F.size('users'))
            .orderBy(F.desc('count_users'))                          
            )
display(prospects.head(5))

placeID,Rcuisine,users,count_users
132777,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132732,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132919,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132868,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132761,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97


- On s'intéresse maintenant au jeu de données dans dataset 'rating_final'

In [29]:
rating = datasets["rating_final"]
display(rating.head(5))

userID,placeID,rating,food_rating,service_rating
U1077,135085,2,2,2
U1077,135038,2,2,1
U1077,132825,2,2,2
U1077,135060,1,2,2
U1068,135104,1,1,2


- Calculer un score NPS pour chaque placeID:
  - Il faut les notes exactes données à chaque placeID en colonne
  - Compter le nombre de chacune des notes attribuées pour chacune des placeID

In [31]:
display(rating.groupBy("placeID").pivot("rating").count().head(5))

placeID,0,1,2
132834,7,11,7
132626,1,1,2
135042,4,7,9
135058,4,8,6
132767,2,1,3


- Calcul du NPS:
  - NPS = (somme_de_note_max / somme_totale_des_notes) - (somme_de_note_min / somme_totale_des_notes)
  - Joindre ensuite au dataset 'chefcuisine'
  - Utiliser la window function F.rank() pour affichez un ranking global pour chaque placeID et un ranking pour chaque RCuisine

In [33]:
display(
  rating.groupBy("placeID")
        .pivot("rating")
        .count()
        .fillna(0)
        .withColumn("Total", F.col("0") + F.col("1") + F.col("2"))
        .withColumn("NPS", (F.col("2") / F.col("Total") - (F.col("0") / F.col("Total"))))
        .join(chefcuisine, "placeID", "left")
        .withColumn("Global Rank", F.rank().over(Window.orderBy(F.desc("NPS"))))
        .withColumn("Rcuisine Rank", F.rank().over(Window.partitionBy("Rcuisine").orderBy(F.desc("NPS"))))
        .orderBy("Global Rank")
        .head(5)
        #.select("placeID", "Rcuisine", "NPS", "Global Rank", "Rank")
        )

placeID,0,1,2,Total,NPS,Rcuisine,Global Rank,Rcuisine Rank
134986,0,0,8,8,1.0,International,1,1
135034,0,0,5,5,1.0,Japanese,1,1
132955,0,0,5,5,1.0,Bar_Pub_Brewery,1,1
132922,0,1,5,6,0.8333333333333334,Cafeteria,4,1
132755,0,1,4,5,0.8,Mexican,5,1


- Ce ranking est-il représentatif ? Nombre d'utilisateurs qui ont donné leur review
- En utilisant le résultat d'au dessus, joindre le nombre d'utilisateurs pour chaque placeID.

In [35]:
from pyspark.sql.window import Window

nps = (rating.groupBy("placeID")
             .pivot("rating")
             .count()
             .fillna(0)
             .withColumn("Total", F.col("0") + F.col("1") + F.col("2"))
             .withColumn("NPS", (F.col("2") / F.col("Total") - (F.col("0") / F.col("Total"))))
             .join(chefcuisine, "placeID", "left")
             .withColumn("Global Rank", F.rank().over(Window.orderBy(F.desc("NPS"))))
             .withColumn("Rcuisine Rank", F.rank().over(Window.partitionBy("Rcuisine").orderBy(F.desc("NPS"))))
             .orderBy("Global Rank")
             .join(prospects, "placeID", "left"))

display(nps.head(5))

placeID,0,1,2,Total,NPS,Rcuisine,Global Rank,Rcuisine Rank,Rcuisine.1,users,count_users
132955,0,0,5,5,1.0,Bar_Pub_Brewery,1,1,Bar_Pub_Brewery,List(U1135),1
134986,0,0,8,8,1.0,International,1,1,International,List(U1135),1
135034,0,0,5,5,1.0,Japanese,1,1,Japanese,"List(U1108, U1093, U1016, U1135, U1004, U1013, U1014)",7
132922,0,1,5,6,0.8333333333333334,Cafeteria,4,1,Cafeteria,"List(U1108, U1128, U1008, U1135, U1105, U1101, U1009, U1004, U1060)",9
132755,0,1,4,5,0.8,Mexican,5,1,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97


#### Formatage

In [37]:
hours = datasets["chefmozhours4"]
display(hours.head(5))

placeID,hours,days
135111,00:00-23:30;,Mon;Tue;Wed;Thu;Fri;
135111,00:00-23:30;,Sat;
135111,00:00-23:30;,Sun;
135110,08:00-19:00;,Mon;Tue;Wed;Thu;Fri;
135110,00:00-00:00;,Sat;


- Reformater le dataset pour le rendre plus lisible

In [39]:
hours_1 = (hours.withColumn("days_splitted", F.split("days", ";"))
                                    .withColumn("day", F.explode("days_splitted"))
                                    .filter(F.col("day") != "")
                                    .groupBy("placeID")
                                    .pivot("day").agg(F.first("hours"))
           )

display(hours_1.head(5))

placeID,Fri,Mon,Sat,Sun,Thu,Tue,Wed
132023,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;
132012,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;
132026,12:00-14:30;,12:00-14:30;,,,12:00-14:30;,12:00-14:30;,12:00-14:30;
132030,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;
132024,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;


### Construire un modèle de recommandation à partir du dataset rating

In [41]:
rating = datasets['rating_final']
display(rating.head(5))

userID,placeID,rating,food_rating,service_rating
U1077,135085,2,2,2
U1077,135038,2,2,1
U1077,132825,2,2,2
U1077,135060,1,2,2
U1068,135104,1,1,2


#### Preprocessing

In [43]:
userIdIndexer = StringIndexer(inputCol="userID", outputCol="userIdIndex").fit(rating)
rating = userIdIndexer.transform(rating)
display(rating.head(5))

userID,placeID,rating,food_rating,service_rating,userIdIndex
U1077,135085,2,2,2,112.0
U1077,135038,2,2,1,112.0
U1077,132825,2,2,2,112.0
U1077,135060,1,2,2,112.0
U1068,135104,1,1,2,81.0


In [44]:
rating.printSchema()

In [45]:
rating = rating.select(
  F.col('userIdIndex'),
  F.col('placeID').cast(IntegerType()),
  F.col('rating').cast(IntegerType()),
                       )
rating.printSchema()

In [46]:
display(rating.head(5))

userIdIndex,placeID,rating
112.0,135085,2
112.0,135038,2
112.0,132825,2
112.0,135060,1
81.0,135104,1


#### Gridsearch et cross validation pour déterminer les meilleurs paramètres du modèle ALS

In [48]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

train_set, test_set = rating.randomSplit([0.8, 0.2], seed=100)

als = ALS(userCol="userIdIndex", itemCol="placeID", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

paramGrid = ParamGridBuilder() \
                    .addGrid(als.rank, [1, 5]) \
                    .addGrid(als.alpha, [2.0, 3.0])\
                    .build()

In [49]:
cv = CrossValidator(
                    estimator=als, 
                    evaluator=evaluator, 
                    estimatorParamMaps=paramGrid, 
                    numFolds=5
                    )

model = cv.fit(train_set)

## Evaluation

In [51]:
predictions = model.transform(test_set)

In [52]:
predictions = IndexToString(inputCol="userIdIndex", outputCol="userID").transform(predictions)
predictions = predictions.withColumn("prediction", F.abs(F.round(predictions["prediction"],0)))
display(predictions.head(5))

userIdIndex,placeID,rating,prediction,userID
26.0,135027,0,1.0,U1116
48.0,135027,2,1.0,U1132
23.0,135027,0,0.0,U1081
12.0,135066,2,2.0,U1016
81.0,132663,1,1.0,U1068


- Utiliser les datasets userpayment et chefmozaccepts pour filtrez les prédictions et ne garder que les recommandations concernant des users ayant des moyens de paiment compatibles avec les restaurants

In [54]:
userpayment = datasets['userpayment']
display(userpayment.head(5))

userID,Upayment
U1001,cash
U1002,cash
U1003,cash
U1004,cash
U1004,bank_debit_cards


In [55]:
accepted_payment = datasets['chefmozaccepts']
display(accepted_payment.head(5))

placeID,Rpayment
135110,cash
135110,VISA
135110,MasterCard-Eurocard
135110,American_Express
135110,bank_debit_cards


In [56]:
payment_matching = userpayment.join(accepted_payment, userpayment['Upayment'] == accepted_payment['Rpayment'], how='inner')
payment_matching = payment_matching.select("userID", "placeID").dropDuplicates()
display(payment_matching.head(5))

userID,placeID
U1081,135110
U1006,135107
U1042,135107
U1029,135105
U1070,135104


In [57]:
payment_matching.count(), predictions.count()

In [58]:
predictions_filtered = predictions.join(payment_matching, ["userID", "placeID"], how='inner').drop("userIdIndex")
predictions_filtered.count()

In [59]:
display(predictions_filtered.head(5))

userID,placeID,rating,prediction
U1116,135027,0,1.0
U1132,135027,2,1.0
U1081,135027,0,0.0
U1016,135066,2,2.0
U1124,135071,1,1.0


In [60]:
rmse = evaluator.evaluate(predictions_filtered)
print("Root-mean-square error = " + str(rmse))

- Les 10 meilleurs restaurants recommandés pour chacun des clients

In [62]:
userRecommendations = model.bestModel.recommendForAllUsers(10)
display(userRecommendations)

userIdIndex,recommendations
31,"List(List(134986, 1.1379204), List(135109, 1.0855305), List(132922, 1.048903), List(132875, 1.0379986), List(132768, 1.036059), List(135073, 1.0314724), List(132584, 0.99903345), List(135034, 0.9964369), List(135055, 0.9938734), List(132937, 0.9907132))"
85,"List(List(134986, 1.3660666), List(135109, 1.3031728), List(132922, 1.2592016), List(132875, 1.246111), List(132768, 1.2437826), List(135073, 1.2382765), List(132584, 1.1993337), List(135034, 1.1962165), List(135055, 1.1931391), List(132937, 1.1893451))"
137,"List(List(134986, 1.9794658), List(135109, 1.888331), List(132922, 1.8246157), List(132875, 1.805647), List(132768, 1.8022733), List(135073, 1.7942947), List(132584, 1.7378654), List(135034, 1.7333486), List(135055, 1.7288893), List(132937, 1.7233919))"
65,"List(List(134986, 1.9940985), List(135109, 1.9022901), List(132922, 1.8381038), List(132875, 1.8189949), List(132768, 1.8155961), List(135073, 1.8075585), List(132584, 1.7507123), List(135034, 1.7461619), List(135055, 1.7416698), List(132937, 1.7361317))"
53,"List(List(134986, 1.6756427), List(135109, 1.5984961), List(132922, 1.5445603), List(132875, 1.5285031), List(132768, 1.525647), List(135073, 1.5188931), List(132584, 1.4711251), List(135034, 1.4673015), List(135055, 1.4635267), List(132937, 1.458873))"
133,"List(List(134986, 1.2302933), List(135109, 1.1736505), List(132922, 1.1340497), List(132875, 1.1222601), List(132768, 1.1201632), List(135073, 1.1152042), List(132584, 1.0801319), List(135034, 1.0773245), List(135055, 1.074553), List(132937, 1.0711362))"
78,"List(List(134986, 1.4288181), List(135109, 1.3630352), List(132922, 1.3170443), List(132875, 1.3033522), List(132768, 1.300917), List(135073, 1.2951579), List(132584, 1.2544261), List(135034, 1.2511657), List(135055, 1.247947), List(132937, 1.2439789))"
108,"List(List(134986, 2.2492387), List(135109, 2.1456838), List(132922, 2.0732849), List(132875, 2.051731), List(132768, 2.0478973), List(135073, 2.0388315), List(132584, 1.9747118), List(135034, 1.9695793), List(135055, 1.9645123), List(132937, 1.9582657))"
34,"List(List(132560, 0.0), List(132630, 0.0), List(132660, 0.0), List(132740, 0.0), List(132830, 0.0), List(132870, 0.0), List(135000, 0.0), List(135030, 0.0), List(135040, 0.0), List(135050, 0.0))"
101,"List(List(134986, 2.1958115), List(135109, 2.094716), List(132922, 2.0240371), List(132875, 2.0029953), List(132768, 1.9992527), List(135073, 1.990402), List(132584, 1.9278054), List(135034, 1.9227948), List(135055, 1.9178482), List(132937, 1.91175))"


In [63]:
userRecommendations = IndexToString(inputCol="userIdIndex", outputCol="userID", 
                                    labels=userIdIndexer.labels
                                   ).transform(userRecommendations).drop("userIdIndex")
display(userRecommendations)

recommendations,userID
"List(List(134986, 1.1379204), List(135109, 1.0855305), List(132922, 1.048903), List(132875, 1.0379986), List(132768, 1.036059), List(135073, 1.0314724), List(132584, 0.99903345), List(135034, 0.9964369), List(135055, 0.9938734), List(132937, 0.9907132))",U1057
"List(List(134986, 1.3660666), List(135109, 1.3031728), List(132922, 1.2592016), List(132875, 1.246111), List(132768, 1.2437826), List(135073, 1.2382765), List(132584, 1.1993337), List(135034, 1.1962165), List(135055, 1.1931391), List(132937, 1.1893451))",U1030
"List(List(134986, 1.9794658), List(135109, 1.888331), List(132922, 1.8246157), List(132875, 1.805647), List(132768, 1.8022733), List(135073, 1.7942947), List(132584, 1.7378654), List(135034, 1.7333486), List(135055, 1.7288893), List(132937, 1.7233919))",U1011
"List(List(134986, 1.9940985), List(135109, 1.9022901), List(132922, 1.8381038), List(132875, 1.8189949), List(132768, 1.8155961), List(135073, 1.8075585), List(132584, 1.7507123), List(135034, 1.7461619), List(135055, 1.7416698), List(132937, 1.7361317))",U1086
"List(List(134986, 1.6756427), List(135109, 1.5984961), List(132922, 1.5445603), List(132875, 1.5285031), List(132768, 1.525647), List(135073, 1.5188931), List(132584, 1.4711251), List(135034, 1.4673015), List(135055, 1.4635267), List(132937, 1.458873))",U1045
"List(List(134986, 1.2302933), List(135109, 1.1736505), List(132922, 1.1340497), List(132875, 1.1222601), List(132768, 1.1201632), List(135073, 1.1152042), List(132584, 1.0801319), List(135034, 1.0773245), List(135055, 1.074553), List(132937, 1.0711362))",U1017
"List(List(134986, 1.4288181), List(135109, 1.3630352), List(132922, 1.3170443), List(132875, 1.3033522), List(132768, 1.300917), List(135073, 1.2951579), List(132584, 1.2544261), List(135034, 1.2511657), List(135055, 1.247947), List(132937, 1.2439789))",U1103
"List(List(134986, 2.2492387), List(135109, 2.1456838), List(132922, 2.0732849), List(132875, 2.051731), List(132768, 2.0478973), List(135073, 2.0388315), List(132584, 1.9747118), List(135034, 1.9695793), List(135055, 1.9645123), List(132937, 1.9582657))",U1026
"List(List(132560, 0.0), List(132630, 0.0), List(132660, 0.0), List(132740, 0.0), List(132830, 0.0), List(132870, 0.0), List(135000, 0.0), List(135030, 0.0), List(135040, 0.0), List(135050, 0.0))",U1073
"List(List(134986, 2.1958115), List(135109, 2.094716), List(132922, 2.0240371), List(132875, 2.0029953), List(132768, 1.9992527), List(135073, 1.990402), List(132584, 1.9278054), List(135034, 1.9227948), List(135055, 1.9178482), List(132937, 1.91175))",U1012


In [64]:
userRecommendations = userRecommendations.withColumn("recommendations", F.explode(userRecommendations.recommendations)).toPandas()
df = pd.DataFrame(userRecommendations['recommendations'].tolist(), index=userRecommendations.index, columns=['Category', 'Rating'])
userRecommendations = pd.concat([userRecommendations, df], axis=1)
userRecommendations.drop(['recommendations'], axis=1, inplace=True)
userRecommendations.Rating = userRecommendations.Rating.round(0)
userRecommendations.head()

Unnamed: 0,userID,Category,Rating
0,U1057,134986,1.0
1,U1057,135109,1.0
2,U1057,132922,1.0
3,U1057,132875,1.0
4,U1057,132768,1.0


- Les 10 meilleurs clients recommandés pour chacun des restaurants

In [66]:
itemRecommendations = model.bestModel.recommendForAllItems(10)
display(itemRecommendations)

placeID,recommendations
135000,"List(List(131, 2.434928), List(136, 2.4186454), List(118, 2.119228), List(103, 2.0419743), List(39, 1.9167094), List(100, 1.9116743), List(18, 1.8585181), List(75, 1.8296623), List(7, 1.8269359), List(25, 1.8195282))"
135027,"List(List(131, 1.6194421), List(136, 1.6086128), List(118, 1.4094735), List(103, 1.3580931), List(39, 1.274781), List(100, 1.2714322), List(18, 1.2360786), List(75, 1.216887), List(7, 1.2150736), List(25, 1.2101469))"
135066,"List(List(131, 2.4782162), List(136, 2.4616442), List(118, 2.1569035), List(103, 2.0782764), List(39, 1.9507847), List(100, 1.9456601), List(18, 1.8915589), List(75, 1.8621901), List(7, 1.8594152), List(25, 1.8518758))"
132663,"List(List(131, 1.6706334), List(136, 1.6594617), List(118, 1.4540277), List(103, 1.4010231), List(39, 1.3150774), List(100, 1.3116227), List(18, 1.2751516), List(75, 1.2553533), List(7, 1.2534827), List(25, 1.2484002))"
135108,"List(List(131, 2.4132352), List(136, 2.3970976), List(118, 2.1003475), List(103, 2.0237823), List(39, 1.8996333), List(100, 1.8946431), List(18, 1.8419604), List(75, 1.8133618), List(7, 1.8106595), List(25, 1.8033179))"
135071,"List(List(131, 1.8727553), List(136, 1.860232), List(118, 1.6299435), List(103, 1.5705261), List(39, 1.4741822), List(100, 1.4703096), List(18, 1.4294261), List(75, 1.4072325), List(7, 1.4051355), List(25, 1.3994381))"
132723,"List(List(131, 2.3622367), List(136, 2.3464403), List(118, 2.0559614), List(103, 1.981014), List(39, 1.8594888), List(100, 1.854604), List(18, 1.8030348), List(75, 1.7750405), List(7, 1.7723954), List(25, 1.7652088))"
135062,"List(List(131, 2.3643842), List(136, 2.3485734), List(118, 2.0578303), List(103, 1.9828149), List(39, 1.8611792), List(100, 1.85629), List(18, 1.8046739), List(75, 1.7766541), List(7, 1.7740066), List(25, 1.7668135))"
132862,"List(List(131, 2.8491356), List(136, 2.8300831), List(118, 2.4797313), List(103, 2.389336), List(39, 2.2427623), List(100, 2.2368708), List(18, 2.1746721), List(75, 2.1409078), List(7, 2.1377172), List(25, 2.1290495))"
132773,"List(List(131, 2.562558), List(136, 2.5454218), List(118, 2.23031), List(103, 2.149007), List(39, 2.0171764), List(100, 2.0118773), List(18, 1.9559348), List(75, 1.9255666), List(7, 1.9226971), List(25, 1.9149011))"


In [67]:
itemRecommendations = itemRecommendations.withColumn("recommendations", F.explode(itemRecommendations.recommendations)).toPandas()
df = pd.DataFrame(itemRecommendations['recommendations'].tolist(), index=itemRecommendations.index, columns=['userIdIndex', 'Rating'])
itemRecommendations = pd.concat([itemRecommendations, df], axis=1)
itemRecommendations.drop(['recommendations'], axis=1, inplace=True)
itemRecommendations.Rating = itemRecommendations.Rating.round(0)
itemRecommendations.head()

Unnamed: 0,placeID,userIdIndex,Rating
0,135000,131,2.0
1,135000,136,2.0
2,135000,118,2.0
3,135000,103,2.0
4,135000,39,2.0


In [68]:
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
itemRecommendations = sqlCtx.createDataFrame(itemRecommendations)
display(itemRecommendations.head(5))

placeID,userIdIndex,Rating
135000,131,2.0
135000,136,2.0
135000,118,2.0
135000,103,2.0
135000,39,2.0


In [69]:
itemRecommendations = IndexToString(inputCol="userIdIndex", outputCol="userID", 
                                    labels=userIdIndexer.labels
                                   ).transform(itemRecommendations).drop("userIdIndex")
display(itemRecommendations.head(5))

placeID,Rating,userID
135000,2.0,U1021
135000,2.0,U1074
135000,2.0,U1127
135000,2.0,U1102
135000,2.0,U1055


In [70]:
itemRecommendations = itemRecommendations.toPandas()
itemRecommendations.head()

Unnamed: 0,placeID,Rating,userID
0,135000,2.0,U1021
1,135000,2.0,U1074
2,135000,2.0,U1127
3,135000,2.0,U1102
4,135000,2.0,U1055
