# Final Project: Restaurant Recommander System

## Imports

In [163]:
import pandas as pd
import numpy as np
import yaml
import os
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import concat
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [103]:
config_file = open("config.yml", "r")
config = yaml.load(config_file, Loader=yaml.FullLoader)
DATA_DIRECTORY = config["DATA_DIRECTORY"]

In [104]:
spark = SparkSession.builder.appName("ALSMatrixFactorisation").getOrCreate()

## Chargement des données

In [105]:
spark_train_order_df = spark.read.option("header", True).csv(
    os.path.join(DATA_DIRECTORY, "orders.csv")
)
spark_train_order_df.show()

+--------------+-----------+----------+-----------+------------+----------+----------------------+------------------------------+-----------+--------+-------------+-------------+----------------+---------------+-------------------+-------------------+--------------------+---------------------+--------------+--------------+-------------------+---------+-------------------+---------------+-------------+----------------------+
|akeed_order_id|customer_id|item_count|grand_total|payment_mode|promo_code|vendor_discount_amount|promo_code_discount_percentage|is_favorite|is_rated|vendor_rating|driver_rating|deliverydistance|preparationtime|      delivery_time|order_accepted_time|driver_accepted_time|ready_for_pickup_time|picked_up_time|delivered_time|      delivery_date|vendor_id|         created_at|LOCATION_NUMBER|LOCATION_TYPE|CID X LOC_NUM X VENDOR|
+--------------+-----------+----------+-----------+------------+----------+----------------------+------------------------------+-----------+---

In [106]:
CUSTOMER_KEY = "CID x LOC_NUM"
df_ratings = spark_train_order_df.withColumn(
    "CID x LOC_NUM",
    concat(
        spark_train_order_df["customer_id"],
        lit(" X "),
        spark_train_order_df["LOCATION_NUMBER"],
    ),
)
df_ratings = df_ratings.select(
    "customer_id", "vendor_id", "vendor_rating", "LOCATION_NUMBER", "CID x LOC_NUM"
).where(col("is_rated") == "Yes")

df_ratings.show()

+-----------+---------+-------------+---------------+-------------+
|customer_id|vendor_id|vendor_rating|LOCATION_NUMBER|CID x LOC_NUM|
+-----------+---------+-------------+---------------+-------------+
|    OH64IO0|      310|          5.0|              0|  OH64IO0 X 0|
|    FCPLE31|      157|          5.0|              0|  FCPLE31 X 0|
|    WB681BO|      271|          4.0|              0|  WB681BO X 0|
|    FS229TW|      157|          4.0|              0|  FS229TW X 0|
|    3P9113W|       85|          5.0|              0|  3P9113W X 0|
|    WB681BO|       90|          5.0|              0|  WB681BO X 0|
|    AA31G37|       85|          5.0|              0|  AA31G37 X 0|
|    E4ZWBIY|       83|          5.0|              0|  E4ZWBIY X 0|
|    Y8OGZS1|      196|          2.0|              0|  Y8OGZS1 X 0|
|    I3DAUFL|      300|          1.0|              0|  I3DAUFL X 0|
|    8BZR1IV|       33|          5.0|              0|  8BZR1IV X 0|
|    5O4E3Z3|       92|          4.0|           

In [107]:
df_ratings.count()

20109

In [108]:
indexer = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in list(set(df_ratings.columns) - set(["vendor_rating"]))
]

pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df_ratings).transform(df_ratings)
transformed = transformed.withColumn(
    "vendor_rating", transformed["vendor_rating"].cast("float")
)
transformed = transformed.withColumn(
    "vendor_id_index", transformed["vendor_id_index"].cast("float")
)
transformed.show(5)

+-----------+---------+-------------+---------------+-------------+---------------+-----------------+-------------------+---------------------+
|customer_id|vendor_id|vendor_rating|LOCATION_NUMBER|CID x LOC_NUM|vendor_id_index|customer_id_index|CID x LOC_NUM_index|LOCATION_NUMBER_index|
+-----------+---------+-------------+---------------+-------------+---------------+-----------------+-------------------+---------------------+
|    OH64IO0|      310|          5.0|              0|  OH64IO0 X 0|           30.0|           6380.0|             7444.0|                  0.0|
|    FCPLE31|      157|          5.0|              0|  FCPLE31 X 0|           31.0|           5302.0|             5957.0|                  0.0|
|    WB681BO|      271|          4.0|              0|  WB681BO X 0|           71.0|           3354.0|             3331.0|                  0.0|
|    FS229TW|      157|          4.0|              0|  FS229TW X 0|           31.0|           2695.0|             6019.0|               

In [109]:
(training, validation) = transformed.randomSplit([0.8, 0.2], seed=16)

In [110]:
customer_index_key = f"{CUSTOMER_KEY}_index"
als = ALS(
    maxIter=15,
    regParam=0.09,
    rank=25,
    userCol=customer_index_key,
    itemCol="vendor_id_index",
    ratingCol="vendor_rating",
    coldStartStrategy="drop",
    nonnegative=True,
)

model = als.fit(training)

In [111]:
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="vendor_rating", predictionCol="prediction"
)

predictions = model.transform(validation)
rmse = evaluator.evaluate(predictions)

print("RMSE=" + str(rmse))
predictions.orderBy(col("prediction").desc()).show()

                                                                                

RMSE=1.2277909803242675


                                                                                

+-----------+---------+-------------+---------------+-------------+---------------+-----------------+-------------------+---------------------+----------+
|customer_id|vendor_id|vendor_rating|LOCATION_NUMBER|CID x LOC_NUM|vendor_id_index|customer_id_index|CID x LOC_NUM_index|LOCATION_NUMBER_index|prediction|
+-----------+---------+-------------+---------------+-------------+---------------+-----------------+-------------------+---------------------+----------+
|    PO53F9X|      115|          5.0|              1|  PO53F9X X 1|           86.0|            170.0|              123.0|                  1.0| 5.4733167|
|    ICN6274|      216|          5.0|              4|  ICN6274 X 4|           32.0|            283.0|              567.0|                  4.0| 5.1940565|
|    E5GIQ6X|       90|          5.0|              0|  E5GIQ6X X 0|           27.0|            833.0|              711.0|                  0.0| 5.1843333|
|    3CS6P25|      159|          5.0|              2|  3CS6P25 X 2|   

In [112]:
recs = model.recommendForAllUsers(20).toPandas()
recs

                                                                                

Unnamed: 0,CID x LOC_NUM_index,recommendations
0,26,"[(15, 4.467200756072998), (6, 3.15623736381530..."
1,27,"[(78, 4.825118064880371), (77, 4.7193622589111..."
2,28,"[(95, 5.924435615539551), (77, 5.4620547294616..."
3,31,"[(68, 4.956910133361816), (96, 4.8654012680053..."
4,34,"[(95, 5.558813571929932), (77, 5.2261157035827..."
...,...,...
8067,9244,"[(46, 1.9638583660125732), (95, 1.708905458450..."
8068,9250,"[(59, 4.902221202850342), (90, 4.2612195014953..."
8069,9261,"[(90, 5.0758442878723145), (98, 5.028432846069..."
8070,9266,"[(95, 3.924290657043457), (0, 3.91345810890197..."


In [113]:
df_recs = (
    recs.recommendations.apply(pd.Series)
    .merge(recs, right_index=True, left_index=True)
    .drop(["recommendations"], axis=1)
    .melt(id_vars=[customer_index_key], value_name="recommendation")
    .drop("variable", axis=1)
    .dropna()
)
df_recs.head()

Unnamed: 0,CID x LOC_NUM_index,recommendation
0,26,"(15, 4.467200756072998)"
1,27,"(78, 4.825118064880371)"
2,28,"(95, 5.924435615539551)"
3,31,"(68, 4.956910133361816)"
4,34,"(95, 5.558813571929932)"


In [114]:
df_recs = df_recs.sort_values(customer_index_key)
df_recs = pd.concat(
    [df_recs["recommendation"].apply(pd.Series), df_recs[customer_index_key]], axis=1
)

In [115]:
df_recs.columns = [
    "vendor_id_index",
    "rating",
    customer_index_key,
]
tmp = transformed.select(
    transformed[CUSTOMER_KEY],
    transformed[customer_index_key],
    transformed["vendor_id"],
    transformed["vendor_id_index"],
)
tmp = tmp.toPandas()

In [116]:
dict1 = dict(zip(tmp[customer_index_key], tmp[CUSTOMER_KEY]))
dict2 = dict(zip(tmp["vendor_id_index"], tmp["vendor_id"]))
df_recs_copy = df_recs.copy()
df_recs_copy.loc[:, CUSTOMER_KEY] = df_recs[customer_index_key].map(dict1)
df_recs_copy.loc[:, "vendor_id"] = df_recs["vendor_id_index"].map(dict2)
df_recs_copy = df_recs_copy.sort_values(CUSTOMER_KEY)
df_recs_copy.reset_index(drop=True, inplace=True)

In [117]:
new = df_recs_copy[[CUSTOMER_KEY, "vendor_id", "rating"]]
new["recommendations"] = list(zip(new.vendor_id, new.rating))

res = new[[CUSTOMER_KEY, "recommendations"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new["recommendations"] = list(zip(new.vendor_id, new.rating))


     CID x LOC_NUM                                    recommendations
0      00HWUU3 X 1  [(401, 4.2503814697265625), (13, 4.16069316864...
1      00OT8JX X 3  [(250, 4.7334699630737305), (115, 4.8346729278...
2      00OT8JX X 4  [(176, 4.80129337310791), (104, 4.807033538818...
3      010DVV3 X 0  [(85, 3.5217103958129883), (271, 3.73131346702...
4      018GT0H X 2  [(23, 4.056676864624023), (295, 3.930625677108...
...            ...                                                ...
8067   ZZJSJYC X 0  [(271, 5.021329879760742), (85, 5.007611751556...
8068   ZZJX0AR X 0  [(265, 4.1821770668029785), (85, 4.12745428085...
8069   ZZP5BHU X 0  [(23, 3.4059560298919678), (310, 3.37825155258...
8070   ZZRJABJ X 2  [(82, 4.156928062438965), (115, 4.724563598632...
8071   ZZY3N0D X 1  [(294, 5.369583606719971), (13, 5.132056236267...

[8072 rows x 2 columns]


In [155]:
recommandation_df = (
    res["recommendations"].groupby([res[CUSTOMER_KEY]]).apply(list).reset_index()
)
recommandation_df[CUSTOMER_KEY] = recommandation_df[CUSTOMER_KEY].astype(str)
recommandation_df = recommandation_df.explode("recommendations")

# On génère une dataframe sous forme de liste de couples vendeur/client avec la note calculée pour chacun
recommandation_df["recommanded_vendor"] = recommandation_df["recommendations"].apply(
    lambda x: x[0]
)
recommandation_df["rating"] = recommandation_df["recommendations"].apply(lambda x: x[1])
recommandation_df = recommandation_df.drop(["recommendations"], axis=1)

# On plafonne les notes calculées à 5
recommandation_df["rating"] = recommandation_df["rating"].apply(
    lambda x: 5 if x >= 5 else x
)

# On introduit le LOC NUM
recommandation_df.sort_values(by="rating", ascending=False).head()
recommandation_df.describe()
split_col = lambda s: s.split(" X ")

recommandation_df["CID x LOC_NUM"] = recommandation_df["CID x LOC_NUM"].apply(split_col)
recommandation_df["customer_id"] = recommandation_df["CID x LOC_NUM"].apply(
    lambda x: x[0]
)
recommandation_df["LOCATION_NUMBER"] = recommandation_df["CID x LOC_NUM"].apply(
    lambda x: x[1]
)


recommandation_df = recommandation_df.drop("CID x LOC_NUM", axis=1)
recommandation_df = recommandation_df.reindex(
    columns=["customer_id", "LOCATION_NUMBER", "recommanded_vendor", "rating"]
)
recommandation_df.head()
recommandation_df = recommandation_df.astype({"LOCATION_NUMBER": int})

In [156]:
recommandation_df.to_csv(os.path.join(DATA_DIRECTORY, "recommandation.csv"))
recommandation_df.dtypes

customer_id            object
LOCATION_NUMBER         int64
recommanded_vendor     object
rating                float64
dtype: object

KMeans Clustering

In [157]:
# Charger votre dataframe avec les colonnes latitude et longitude (supposons que le dataframe s'appelle 'df')
# df = ...

# Sélectionner les colonnes latitude et longitude
train_locations_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "train_locations.csv"))
train_locations_df = train_locations_df.dropna()
data = train_locations_df[["latitude", "longitude"]].copy()
train_locations_df = train_locations_df.rename(
    columns={"location_number": "LOCATION_NUMBER"}
)
# Spécifier le nombre de clusters que vous souhaitez obtenir
k = 6

# Créer une instance de l'algorithme K-Means
kmeans = KMeans(n_clusters=k)

# Appliquer l'algorithme aux données
kmeans.fit(data)

# Obtenir les étiquettes de cluster pour chaque point
labels = kmeans.labels_

# Ajouter les étiquettes de cluster à votre dataframe
train_locations_df["cluster_label"] = labels.copy()

train_locations_df.head()



Unnamed: 0,customer_id,LOCATION_NUMBER,location_type,latitude,longitude,cluster_label
3,RU43CXC,0,Home,0.100853,0.438165,0
8,PB2B28D,0,Home,2.575605,0.70827,0
9,U9YKW1T,0,Work,0.100017,0.004357,0
14,4YWW8U8,1,Home,-0.575556,0.236555,0
15,393VD80,0,Home,-0.245396,-78.561908,1


In [159]:
recommandation_with_cluster = pd.merge(recommandation_df, train_locations_df)

recommandation_with_cluster.head()

Unnamed: 0,customer_id,LOCATION_NUMBER,recommanded_vendor,rating,location_type,latitude,longitude,cluster_label
0,00HWUU3,1,401,4.250381,Home,-1.117296,0.103313,0
1,00HWUU3,1,13,4.160693,Home,-1.117296,0.103313,0
2,00HWUU3,1,298,4.226124,Home,-1.117296,0.103313,0
3,00HWUU3,1,577,4.471765,Home,-1.117296,0.103313,0
4,00HWUU3,1,55,4.227818,Home,-1.117296,0.103313,0


In [176]:
X_train = recommandation_with_cluster[["latitude", "longitude"]]
y_train = recommandation_with_cluster["cluster_label"]

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

In [177]:
vendors_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "vendors.csv"))
test_locations_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "test_locations.csv"))
vendors_df["cluster_label"] = knn.predict(vendors_df[["latitude", "longitude"]])

In [311]:
test_locations_df = test_locations_df.dropna()
test_locations_df.head()

Unnamed: 0,customer_id,location_number,location_type,latitude,longitude
1,0JP29SK,0,Home,0.278709,-78.623847
2,0JP29SK,1,Home,0.124485,-78.605621
7,0JP29SK,6,Home,0.12371,0.457748
9,D1CUMM9,0,Home,-0.090105,-78.58026
10,D1CUMM9,1,Home,-0.460103,-78.536533


In [312]:
test_locations_df["cluster_label"] = knn.predict(
    test_locations_df[["latitude", "longitude"]]
)

In [313]:
test_locations_df.head()

Unnamed: 0,customer_id,location_number,location_type,latitude,longitude,cluster_label
1,0JP29SK,0,Home,0.278709,-78.623847,1
2,0JP29SK,1,Home,0.124485,-78.605621,1
7,0JP29SK,6,Home,0.12371,0.457748,0
9,D1CUMM9,0,Home,-0.090105,-78.58026,1
10,D1CUMM9,1,Home,-0.460103,-78.536533,1


Faire en sorte de recommander selon le cluster des `customer_id`:

In [315]:
from functools import partial


def get_restaurant_suggestion(
    test_locations_df, cluster_label, recommandation_with_cluster, min_rating=3.9
):
    df = recommandation_with_cluster[
        (recommandation_with_cluster["cluster_label"] == cluster_label)
    ]
    df = df.groupby("recommanded_vendor")[["rating"]].mean()
    df = df[df["rating"] >= min_rating]
    df = df.reset_index()
    df["recommanded_vendor"] = df["recommanded_vendor"].astype(int)
    return df.values[:5]


suggestions_func = partial(
    get_restaurant_suggestion,
    test_locations_df=test_locations_df,
    recommandation_with_cluster=recommandation_with_cluster,
)
suggestions_func(cluster_label=1)

array([[113.        ,   4.36900194],
       [195.        ,   4.60990481],
       [225.        ,   4.89424133],
       [299.        ,   3.9169116 ],
       [ 33.        ,   3.91754552]])

In [316]:
u = lambda row: suggestions_func(cluster_label=row["cluster_label"])

In [None]:
test_locations_df["recommandations_list"] = test_locations_df.apply(u, axis=1)

In [319]:
len(test_locations_df)

9070

In [296]:
test_locations_df = test_locations_df.explode("recommandations_list")

In [298]:
test_locations_df["vendor_id"] = test_locations_df["recommandations_list"].apply(
    lambda x: x[0]
)

In [None]:
test_locations_df["CID X LOC_NUM X VENDOR"] = (
    test_locations_df["customer_id"]
    + " X "
    + test_locations_df["location_number"].astype(str)
    + " X "
    + test_locations_df["vendor_id"].astype(int).astype(str)
)
test_locations_df["target"] = 1
submission_df = test_locations_df[["CID X LOC_NUM X VENDOR", "target"]]

In [321]:
submission_df.to_csv(os.path.join(DATA_DIRECTORY, "SampleSubmission.csv"))