# Final Project: Restaurant Recommander System

## Imports

In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import concat
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [3]:
config_file = open("config.yml", "r")
config = yaml.load(config_file, Loader=yaml.FullLoader)
DATA_DIRECTORY = config["DATA_DIRECTORY"]

## Analyse de données

Pour pouvoir déterminer à quel client nous allons recommander un restaurant. Nous allons d'abbord prédire les notes que ces derniers peuvent attribuer. Nous allons donc nous tourner vers l'algorithme ALS pour factoriser des matrices. 

### Manipulation avec Spark

In [3]:
spark = SparkSession.builder.appName("ALSMatrixFactorisation").getOrCreate()

your 131072x1 screen size is bogus. expect trouble
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/17 20:16:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


**Chargement des données**

Nous allons ici récupérer le CSV qui contient l'historique des commandes des clients mais aussi les notes potentielles qu'ils peuvent attribuer aux restaurants.

In [4]:
spark_train_order_df = spark.read.option("header", True).csv(
    os.path.join(DATA_DIRECTORY, "orders.csv")
)
spark_train_order_df.show()

23/05/17 20:16:36 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------+-----------+----------+-----------+------------+----------+----------------------+------------------------------+-----------+--------+-------------+-------------+----------------+---------------+-------------------+-------------------+--------------------+---------------------+--------------+--------------+-------------------+---------+-------------------+---------------+-------------+----------------------+
|akeed_order_id|customer_id|item_count|grand_total|payment_mode|promo_code|vendor_discount_amount|promo_code_discount_percentage|is_favorite|is_rated|vendor_rating|driver_rating|deliverydistance|preparationtime|      delivery_time|order_accepted_time|driver_accepted_time|ready_for_pickup_time|picked_up_time|delivered_time|      delivery_date|vendor_id|         created_at|LOCATION_NUMBER|LOCATION_TYPE|CID X LOC_NUM X VENDOR|
+--------------+-----------+----------+-----------+------------+----------+----------------------+------------------------------+-----------+---

On va ici utiliser `CID x LOC_NUM` en tant que clé des user et `vendor_id` en tant que clé des items

In [5]:
CUSTOMER_KEY = "CID x LOC_NUM"
df_ratings = spark_train_order_df.withColumn(
    "CID x LOC_NUM",
    concat(
        spark_train_order_df["customer_id"],
        lit(" X "),
        spark_train_order_df["LOCATION_NUMBER"],
    ),
)
df_ratings = df_ratings.select(
    "customer_id", "vendor_id", "vendor_rating", "LOCATION_NUMBER", "CID x LOC_NUM"
).where(col("is_rated") == "Yes")

df_ratings.show()

+-----------+---------+-------------+---------------+-------------+
|customer_id|vendor_id|vendor_rating|LOCATION_NUMBER|CID x LOC_NUM|
+-----------+---------+-------------+---------------+-------------+
|    OH64IO0|      310|          5.0|              0|  OH64IO0 X 0|
|    FCPLE31|      157|          5.0|              0|  FCPLE31 X 0|
|    WB681BO|      271|          4.0|              0|  WB681BO X 0|
|    FS229TW|      157|          4.0|              0|  FS229TW X 0|
|    3P9113W|       85|          5.0|              0|  3P9113W X 0|
|    WB681BO|       90|          5.0|              0|  WB681BO X 0|
|    AA31G37|       85|          5.0|              0|  AA31G37 X 0|
|    E4ZWBIY|       83|          5.0|              0|  E4ZWBIY X 0|
|    Y8OGZS1|      196|          2.0|              0|  Y8OGZS1 X 0|
|    I3DAUFL|      300|          1.0|              0|  I3DAUFL X 0|
|    8BZR1IV|       33|          5.0|              0|  8BZR1IV X 0|
|    5O4E3Z3|       92|          4.0|           

On applique une transformation pour garder les index de nos colonnes `CID x LOC_NUM` et `vendor_id`.

In [None]:
indexer = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in list(set(df_ratings.columns) - set(["vendor_rating"]))
]

pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df_ratings).transform(df_ratings)
transformed = transformed.withColumn(
    "vendor_rating", transformed["vendor_rating"].cast("float")
)
transformed = transformed.withColumn(
    "vendor_id_index", transformed["vendor_id_index"].cast("float")
)
transformed.show(5)

On applique notre entrainement et on crée des `training` et `validation` sets.

In [None]:
(training, validation) = transformed.randomSplit([0.8, 0.2], seed=16)
customer_index_key = f"{CUSTOMER_KEY}_index"
als = ALS(
    maxIter=15,
    regParam=0.09,
    rank=25,
    userCol=customer_index_key,
    itemCol="vendor_id_index",
    ratingCol="vendor_rating",
    coldStartStrategy="drop",
    nonnegative=True,
)

model = als.fit(training)

On applique une évaluation de notre modèle

In [10]:
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="vendor_rating", predictionCol="prediction"
)

predictions = model.transform(validation)
rmse = evaluator.evaluate(predictions)

print("RMSE=" + str(rmse))
predictions.orderBy(col("prediction").desc()).show()

                                                                                

RMSE=1.2295336813077355


                                                                                

+-----------+---------+-------------+---------------+-------------+---------------------+-----------------+-------------------+---------------+----------+
|customer_id|vendor_id|vendor_rating|LOCATION_NUMBER|CID x LOC_NUM|LOCATION_NUMBER_index|customer_id_index|CID x LOC_NUM_index|vendor_id_index|prediction|
+-----------+---------+-------------+---------------+-------------+---------------------+-----------------+-------------------+---------------+----------+
|    PO53F9X|      115|          5.0|              1|  PO53F9X X 1|                  1.0|            170.0|              123.0|           86.0|  5.548597|
|    XZBMB2C|       92|          5.0|              0|  XZBMB2C X 0|                  0.0|            728.0|              626.0|           26.0|  5.195858|
|    PO53F9X|      401|          5.0|              1|  PO53F9X X 1|                  1.0|            170.0|              123.0|           49.0| 5.1810985|
|    T1F5SV5|       92|          5.0|              3|  T1F5SV5 X 3|   

On cherche les 20 restaurants à recommander à chaque `user`

In [11]:
recs = model.recommendForAllUsers(20).toPandas()
recs

                                                                                

Unnamed: 0,CID x LOC_NUM_index,recommendations
0,26,"[(15, 4.453946113586426), (50, 3.2801349163055..."
1,27,"[(78, 4.840857028961182), (86, 4.7396020889282..."
2,28,"[(90, 5.592973709106445), (95, 5.3681344985961..."
3,31,"[(95, 4.926567077636719), (69, 4.8624315261840..."
4,34,"[(77, 5.177370548248291), (95, 5.1696462631225..."
...,...,...
8067,9244,"[(46, 1.9630740880966187), (90, 1.765200614929..."
8068,9250,"[(59, 4.904534339904785), (95, 4.5978565216064..."
8069,9261,"[(95, 5.113649368286133), (5, 4.89529514312744..."
8070,9266,"[(90, 4.332350730895996), (95, 4.1721606254577..."


On a donc un Datarame comme ceci

| Client x num_localisation  | liste de couple (restaurant x note prédite) |   
| -- | -- |

On souhaite avoir un Datarame comme celui-là:
| Client | num_localisation  | restaurant | note prédite  |   
| -- | -- | -- |  -- |

### Transformation vers Pandas

In [12]:
df_recs = (
    recs.recommendations.apply(pd.Series)
    .merge(recs, right_index=True, left_index=True)
    .drop(["recommendations"], axis=1)
    .melt(id_vars=[customer_index_key], value_name="recommendation")
    .drop("variable", axis=1)
    .dropna()
)
df_recs.head()

Unnamed: 0,CID x LOC_NUM_index,recommendation
0,26,"(15, 4.453946113586426)"
1,27,"(78, 4.840857028961182)"
2,28,"(90, 5.592973709106445)"
3,31,"(95, 4.926567077636719)"
4,34,"(77, 5.177370548248291)"


On transforme les valeurs de la colonne `recommendation` en série

In [13]:
df_recs = df_recs.sort_values(customer_index_key)
df_recs = pd.concat(
    [df_recs["recommendation"].apply(pd.Series), df_recs[customer_index_key]], axis=1
)

On convertit ensuite en DataFrame Pandas.

In [14]:
df_recs.columns = [
    "vendor_id_index",
    "rating",
    customer_index_key,
]
tmp = transformed.select(
    transformed[CUSTOMER_KEY],
    transformed[customer_index_key],
    transformed["vendor_id"],
    transformed["vendor_id_index"],
)
tmp = tmp.toPandas()

In [None]:
dict1 = dict(zip(tmp[customer_index_key], tmp[CUSTOMER_KEY]))
dict2 = dict(zip(tmp["vendor_id_index"], tmp["vendor_id"]))
df_recs_copy = df_recs.copy()
# On affecte les valeurs réelle des CUSTOMER_KEY et vendor_id à partir de leur index
df_recs_copy.loc[:, CUSTOMER_KEY] = df_recs[customer_index_key].map(dict1)
df_recs_copy.loc[:, "vendor_id"] = df_recs["vendor_id_index"].map(dict2)
df_recs_copy = df_recs_copy.sort_values(CUSTOMER_KEY)
df_recs_copy.reset_index(drop=True, inplace=True)
new = df_recs_copy[[CUSTOMER_KEY, "vendor_id", "rating"]]
new["recommendations"] = list(zip(new.vendor_id, new.rating))

res = new[[CUSTOMER_KEY, "recommendations"]]
recommandation_df = (
    res["recommendations"].groupby([res[CUSTOMER_KEY]]).apply(list).reset_index()
)
recommandation_df[CUSTOMER_KEY] = recommandation_df[CUSTOMER_KEY].astype(str)
recommandation_df = recommandation_df.explode("recommendations")

# On génère une dataframe sous forme de liste de couples vendeur/client avec la note calculée pour chacun
recommandation_df["recommanded_vendor"] = recommandation_df["recommendations"].apply(
    lambda x: x[0]
)
recommandation_df["rating"] = recommandation_df["recommendations"].apply(lambda x: x[1])
recommandation_df = recommandation_df.drop(["recommendations"], axis=1)

# On plafonne les notes calculées à 5
recommandation_df["rating"] = recommandation_df["rating"].apply(
    lambda x: 5 if x >= 5 else x
)

# On introduit le LOC NUM
recommandation_df.sort_values(by="rating", ascending=False).head()
recommandation_df.describe()
split_col = lambda s: s.split(" X ")

recommandation_df["CID x LOC_NUM"] = recommandation_df["CID x LOC_NUM"].apply(split_col)
recommandation_df["customer_id"] = recommandation_df["CID x LOC_NUM"].apply(
    lambda x: x[0]
)
recommandation_df["LOCATION_NUMBER"] = recommandation_df["CID x LOC_NUM"].apply(
    lambda x: x[1]
)


recommandation_df = recommandation_df.drop("CID x LOC_NUM", axis=1)
recommandation_df = recommandation_df.reindex(
    columns=["customer_id", "LOCATION_NUMBER", "recommanded_vendor", "rating"]
)
recommandation_df.head()
recommandation_df = recommandation_df.astype({"LOCATION_NUMBER": int})

On obtient ainsi le dataframe escompté

In [51]:
recommandation_df.to_csv(os.path.join(DATA_DIRECTORY, "recommandation.csv"))
recommandation_df

Unnamed: 0,customer_id,LOCATION_NUMBER,recommanded_vendor,rating
0,00HWUU3,1,265,3.998636
0,00HWUU3,1,23,3.868816
0,00HWUU3,1,44,3.911969
0,00HWUU3,1,115,4.119711
0,00HWUU3,1,582,3.965674
...,...,...,...,...
8071,ZZY3N0D,1,86,4.813933
8071,ZZY3N0D,1,115,4.972408
8071,ZZY3N0D,1,20,4.833045
8071,ZZY3N0D,1,55,4.847807


### KMeans Clustering
Les notes prédites sont récupérée, nous souhaitons maintenant proposer des restaurants au clients de la même zone géographique. Pour cela il faut déterminer les régions de clients et faire ainsi du Kmeans Clustering.

In [19]:
# Charger votre dataframe avec les colonnes latitude et longitude (supposons que le dataframe s'appelle 'df')
# df = ...

# Sélectionner les colonnes latitude et longitude
train_locations_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "train_locations.csv"))
train_locations_df = train_locations_df.dropna()
data = train_locations_df[["latitude", "longitude"]].copy()
train_locations_df = train_locations_df.rename(
    columns={"location_number": "LOCATION_NUMBER"}
)
# Spécifier le nombre de clusters que vous souhaitez obtenir
k = 6

# Créer une instance de l'algorithme K-Means
kmeans = KMeans(n_clusters=k)

# Appliquer l'algorithme aux données
kmeans.fit(data)

# Obtenir les étiquettes de cluster pour chaque point
labels = kmeans.labels_

# Ajouter les étiquettes de cluster à votre dataframe
train_locations_df["cluster_label"] = labels.copy()

train_locations_df.head()



Unnamed: 0,customer_id,LOCATION_NUMBER,location_type,latitude,longitude,cluster_label
3,RU43CXC,0,Home,0.100853,0.438165,0
8,PB2B28D,0,Home,2.575605,0.70827,0
9,U9YKW1T,0,Work,0.100017,0.004357,0
14,4YWW8U8,1,Home,-0.575556,0.236555,0
15,393VD80,0,Home,-0.245396,-78.561908,1


On joint en suite cela à la DataFrame de recommandation.

In [20]:
recommandation_with_cluster = pd.merge(recommandation_df, train_locations_df)

recommandation_with_cluster.head()

Unnamed: 0,customer_id,LOCATION_NUMBER,recommanded_vendor,rating,location_type,latitude,longitude,cluster_label
0,00HWUU3,1,265,3.998636,Home,-1.117296,0.103313,0
1,00HWUU3,1,23,3.868816,Home,-1.117296,0.103313,0
2,00HWUU3,1,44,3.911969,Home,-1.117296,0.103313,0
3,00HWUU3,1,115,4.119711,Home,-1.117296,0.103313,0
4,00HWUU3,1,582,3.965674,Home,-1.117296,0.103313,0


On a fixé nos classes de localisation, il faut donc ensuite être en capacité d'évaluer l'appartenance d'une localisation à une région, pour cela on applique la classification **KNN** avec le même `k` que nous nous sommes fixés pour le **KMeans Clustering**

In [21]:
X_train = recommandation_with_cluster[["latitude", "longitude"]]
y_train = recommandation_with_cluster["cluster_label"]

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

Affectation des label de cluster aux vendeurs

In [22]:
vendors_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "vendors.csv"))
vendors_df["cluster_label"] = knn.predict(vendors_df[["latitude", "longitude"]])

# Phase de génération de test

In [23]:
test_locations_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "test_locations.csv"))
test_locations_df = test_locations_df.dropna()
test_locations_df.head()

Unnamed: 0,customer_id,location_number,location_type,latitude,longitude
1,0JP29SK,0,Home,0.278709,-78.623847
2,0JP29SK,1,Home,0.124485,-78.605621
7,0JP29SK,6,Home,0.12371,0.457748
9,D1CUMM9,0,Home,-0.090105,-78.58026
10,D1CUMM9,1,Home,-0.460103,-78.536533


Prédiction de l'appartenance des nouvaux clients de tests aux clusters.

In [24]:
test_locations_df["cluster_label"] = knn.predict(
    test_locations_df[["latitude", "longitude"]]
)

In [25]:
test_locations_df.head()

Unnamed: 0,customer_id,location_number,location_type,latitude,longitude,cluster_label
1,0JP29SK,0,Home,0.278709,-78.623847,1
2,0JP29SK,1,Home,0.124485,-78.605621,1
7,0JP29SK,6,Home,0.12371,0.457748,0
9,D1CUMM9,0,Home,-0.090105,-78.58026,1
10,D1CUMM9,1,Home,-0.460103,-78.536533,1


Faire en sorte de recommander selon le cluster des `customer_id`:

In [54]:
from functools import partial


def get_restaurant_suggestion(
    test_locations_df, cluster_label, recommandation_with_cluster, min_rating=3.9
):
    """Fonction qui fait une liste de suggestion des meilleurs restaurants selon le cluster"""
    df = recommandation_with_cluster[
        (recommandation_with_cluster["cluster_label"] == cluster_label)
    ]
    df = df.groupby("recommanded_vendor")[["rating"]].mean()
    df = df[df["rating"] >= min_rating].sort_values("rating", ascending=False)
    df = df.reset_index()
    df["recommanded_vendor"] = df["recommanded_vendor"].astype(int)
    return df.values[:5]


suggestions_func = partial(
    get_restaurant_suggestion,
    test_locations_df=test_locations_df,
    recommandation_with_cluster=recommandation_with_cluster,
)

suggest_by_cluster = lambda row: suggestions_func(cluster_label=row["cluster_label"])
suggestions_func(cluster_label=1)

array([[398.        ,   4.08236933],
       [195.        ,   4.02809029],
       [ 78.        ,   4.0101285 ],
       [843.        ,   3.95645619],
       [575.        ,   3.9325707 ]])

In [None]:
test_locations_df["recommandations_list"] = test_locations_df.apply(
    suggest_by_cluster, axis=1
)
test_locations_df = test_locations_df.explode("recommandations_list")
test_locations_df["vendor_id"] = test_locations_df["recommandations_list"].apply(
    lambda x: x[0]
)
test_locations_df["vendor_id"] = test_locations_df["vendor_id"].astype(int)

On formate la dataframe pour la submission

In [32]:
test_locations_df["CID X LOC_NUM X VENDOR"] = (
    test_locations_df["customer_id"]
    + " X "
    + test_locations_df["location_number"].astype(str)
    + " X "
    + test_locations_df["vendor_id"].astype(int).astype(str)
)
test_locations_df["target"] = 1
submission_df = test_locations_df[["CID X LOC_NUM X VENDOR", "target"]]

On fait en sorte de pouvoir soumettre toutes les données à l'aide du fichier `SampleSubmission.csv` et on sauvegarde nos tests dans `Submission.csv` 

In [18]:
sample_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "SampleSubmission.csv"))
sample_df = sample_df.drop("target", axis=1)
df_merged = pd.merge(sample_df, submission_df, how="left").fillna(0)
df_merged["target"] = df_merged["target"].astype(int)
df_merged.to_csv(os.path.join(DATA_DIRECTORY, "Submission.csv"), index=False)