# Final Project: Restaurant Recommender System

## Imports

In [318]:
import pandas as pd
import numpy as np
import yaml
import os
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split

In [319]:
# Load the project configuration. The context manager closes the file handle
# as soon as parsing is done (a bare open() call would leave it open).
# NOTE(review): FullLoader is kept because the loaded config holds Python type
# objects (str, int), which safe_load presumably cannot build -- only load
# trusted configuration files this way.
with open("config.yml", "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
# Directory that contains the CSV data files
DATA_DIRECTORY = config["DATA_DIRECTORY"]

{'DATA_DIRECTORY': 'data',
 'output_schema': {'CID X LOC_NUM X VENDOR': str, 'target': int}}

## Chargement des données

In [7]:
def _read_data_csv(file_name):
    """Read one CSV file stored in DATA_DIRECTORY into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIRECTORY, file_name))


# Orders placed by customers with vendors
order_df = _read_data_csv("orders.csv")

# Vendor data (location, identifier)
vendors_df = _read_data_csv("vendors.csv")

# Customer data (date of birth, ID, gender)
train_customer_df = _read_data_csv("train_customers.csv")

# Customer locations
train_locations_df = _read_data_csv("train_locations.csv")

# Test set of customer data
test_customer_df = _read_data_csv("test_customers.csv")

# Test set of customer locations
test_locations_df = _read_data_csv("test_locations.csv")

  order_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "orders.csv"))


## Définition d'une super-classe de modèle

In [156]:
class RecommanderModel:
    """Base class for the recommender models of this notebook.

    ``train``, ``predict`` and ``save_predictions`` are empty hooks here:
    they accept arbitrary keyword arguments, do nothing and return None.
    """

    # Label printed by ``print_model``
    model_type = "undefined"
    # Output schema of the model, taken from the loaded configuration
    output_schema = config["output_schema"]

    def __init__(self, model_name: str) -> None:
        """Store the name given to this model instance."""
        self.model_name = model_name

    def train(self, **kwargs) -> None:
        """Empty hook: does nothing and returns None."""
        return None

    def predict(self, **kwargs) -> None:
        """Empty hook: does nothing and returns None."""
        return None

    def save_predictions(self, **kwargs) -> None:
        """Empty hook: does nothing and returns None."""
        return None

    def print_model(self):
        """Print the model type and the model name, one per line."""
        description = (
            f"{self.model_type} recommander model\nmodel name: '{self.model_name}'"
        )
        print(description)

In [328]:
# Display the column labels of the orders table
order_df.columns

Index(['akeed_order_id', 'customer_id', 'item_count', 'grand_total',
       'payment_mode', 'promo_code', 'vendor_discount_amount',
       'promo_code_discount_percentage', 'is_favorite', 'is_rated',
       'vendor_rating', 'driver_rating', 'deliverydistance', 'preparationtime',
       'delivery_time', 'order_accepted_time', 'driver_accepted_time',
       'ready_for_pickup_time', 'picked_up_time', 'delivered_time',
       'delivery_date', 'vendor_id', 'created_at', 'LOCATION_NUMBER',
       'LOCATION_TYPE', 'CID X LOC_NUM X VENDOR'],
      dtype='object')

In [331]:
# Display the customer locations table
train_locations_df

Unnamed: 0,customer_id,location_number,location_type,latitude,longitude
0,02SFNJH,0,,1.682392,-78.789737
1,02SFNJH,1,,1.679137,0.766823
2,02SFNJH,2,,-0.498648,0.661241
3,RU43CXC,0,Home,0.100853,0.438165
4,BDFBPRD,0,,2.523125,0.733464
...,...,...,...,...,...
59498,9PP42SA,2,,-0.788515,-78.497721
59499,9PP42SA,3,Home,-1.445114,0.072558
59500,9PP42SA,4,,-0.001785,0.431695
59501,HWELAU8,0,,-0.066291,-78.583075


In [341]:
# Attach the location record (type, latitude, longitude) to every order.
# The orders table spells the key "LOCATION_NUMBER" while the locations table
# spells it "location_number", so the join keys are listed for each side.
order_columns = ["customer_id", "LOCATION_NUMBER", "vendor_id", "vendor_rating"]
order_with_loc_df = order_df[order_columns].merge(
    train_locations_df,
    left_on=["customer_id", "LOCATION_NUMBER"],
    right_on=["customer_id", "location_number"],
)
order_with_loc_df

Unnamed: 0,customer_id,LOCATION_NUMBER,vendor_id,vendor_rating,location_number,location_type,latitude,longitude
0,92PEE24,0,105,,0,,-0.132100,-78.575297
1,92PEE24,0,105,,0,,-0.132100,-78.575297
2,92PEE24,0,105,,0,,-0.132100,-78.575297
3,QS68UD8,0,294,,0,Work,-0.393396,-78.544417
4,MB7VY5F,0,83,,0,,-1.072823,-78.464121
...,...,...,...,...,...,...,...,...
135298,3S6VG6R,1,199,5.0,1,,2.284875,0.717124
135299,ND4PIJL,0,907,,0,,13.380083,-1.387421
135300,1NRK5HF,0,105,,0,,-0.772600,0.231851
135301,QDXLWM7,1,28,,1,,1.751487,0.375234


In [343]:
# Hold out 20% of the rows as the test set, then split 20% of the remaining
# rows off as the validation set. The fixed seed makes both splits repeatable.
train_val_order_df, test_order_df = train_test_split(
    order_with_loc_df, test_size=0.2, random_state=42
)
train_order_df, val_order_df = train_test_split(
    train_val_order_df, test_size=0.2, random_state=42
)

Unnamed: 0,customer_id,LOCATION_NUMBER,vendor_id,vendor_rating,location_number,location_type,latitude,longitude
121366,5ARTXD7,1,681,5.0,1,,-0.036682,-0.069342
63263,JTDL6KB,0,386,0.0,0,,-0.492942,0.179312
30695,H9REAWE,0,78,,0,Home,-0.025458,-78.587900
80592,7I7SY2D,1,4,,1,,0.415142,0.931376
132382,EIA3K80,3,92,,3,Other,0.107888,0.596460
...,...,...,...,...,...,...,...,...
25191,4QNVW0N,1,176,,1,Other,0.141303,-78.607609
35276,NBHJZZB,0,78,,0,,0.275762,-78.623499
124517,3I2NFF0,0,289,,0,,0.431060,-0.005375
51877,0Z52KAY,1,843,,1,,-0.795356,0.082256


Unnamed: 0,akeed_customer_id,gender,dob,status,verified,language,created_at,updated_at


In [290]:
# One row per distinct (vendor, customer, location) triple, ordered by vendor
vendor_cust_columns = ["vendor_id", "customer_id", "LOCATION_NUMBER"]
unique_triples = order_df[vendor_cust_columns].drop_duplicates()
vendor_cust_df = unique_triples.sort_values(by="vendor_id").copy()

In [291]:
def get_similarities(
    vendor_cust_df: pd.DataFrame, vendor_1: int, vendor_2: int
) -> "tuple[int, int, int]":
    """Count the distinct customers of two vendors, jointly and separately.

    Args:
        vendor_cust_df: Table with at least the columns "vendor_id" and
            "customer_id".
        vendor_1: Identifier of the first vendor.
        vendor_2: Identifier of the second vendor.

    Returns:
        A tuple ``(union_size, l1, l2)`` where ``union_size`` is the number of
        distinct customers of either vendor, ``l1`` the number of distinct
        customers of ``vendor_1`` and ``l2`` the number of distinct customers
        of ``vendor_2``. The number of shared customers is therefore
        ``l1 + l2 - union_size``.
    """
    vendor_ids = vendor_cust_df["vendor_id"]
    customer_ids = vendor_cust_df["customer_id"]

    # Build each row mask once and reuse it for the union.
    is_vendor_1 = vendor_ids == vendor_1
    is_vendor_2 = vendor_ids == vendor_2

    l1 = len(customer_ids[is_vendor_1].unique())
    l2 = len(customer_ids[is_vendor_2].unique())
    union_size = len(customer_ids[is_vendor_1 | is_vendor_2].unique())
    return union_size, l1, l2


def generate_similarities_df(vendor_cust_df, threshold=0.15):
    """Build the table of vendor pairs whose customer bases overlap enough.

    For every unordered pair of vendors, the similarity index is the number of
    customers the two vendors share divided by the number of distinct
    customers of either vendor. Pairs whose index is strictly greater than
    ``threshold`` are kept.

    Args:
        vendor_cust_df: Table with at least the columns "vendor_id" and
            "customer_id".
        threshold: Minimum similarity index (exclusive) for a pair to be kept.

    Returns:
        A DataFrame with the columns "suggested_vendor", "target_vendor" and
        "similarity_index". In each row the suggested vendor is the one with
        the larger number of distinct customers; on a tie it is the vendor
        that appears first in ``vendor_cust_df``.
    """
    vendor_arr = vendor_cust_df["vendor_id"].unique()

    # Encode customers as integer codes. Missing ids all receive the same
    # code, so they count as one customer, exactly as Series.unique() does.
    customer_codes, _ = pd.factorize(vendor_cust_df["customer_id"])

    # Collect the customers of every vendor once, instead of filtering the
    # whole table again for each pair of vendors.
    customers_by_vendor = {}
    for vendor, code in zip(
        vendor_cust_df["vendor_id"].tolist(), customer_codes.tolist()
    ):
        customers_by_vendor.setdefault(vendor, set()).add(code)

    no_customers = frozenset()
    similarities = []

    for i, vendor_1 in enumerate(vendor_arr):
        customers_1 = customers_by_vendor.get(vendor_1, no_customers)
        l1 = len(customers_1)
        for vendor_2 in vendor_arr[i + 1 :]:
            customers_2 = customers_by_vendor.get(vendor_2, no_customers)
            l2 = len(customers_2)
            common_client_count = len(customers_1 & customers_2)
            total_client_count = l1 + l2 - common_client_count
            similarity_index = common_client_count / total_client_count
            # Suggest the vendor with the larger customer base to the
            # customers of the other one.
            if l1 < l2:
                suggested_vendor, target_vendor = vendor_2, vendor_1
            else:
                suggested_vendor, target_vendor = vendor_1, vendor_2
            if similarity_index > threshold:
                similarities.append([suggested_vendor, target_vendor, similarity_index])

    # Build a DataFrame from the list of results
    return pd.DataFrame(
        similarities,
        columns=[
            "suggested_vendor",
            "target_vendor",
            "similarity_index",
        ],
    )

In [292]:
# Compute the vendor similarity table with the default threshold
df_similarities = generate_similarities_df(vendor_cust_df)

In [293]:
# Display the vendor pairs kept in the similarity table
df_similarities

Unnamed: 0,suggested_vendor,target_vendor,similarity_index
0,33,92,0.153122
1,79,78,0.239098
2,84,83,0.204575
3,84,105,0.175231
4,86,85,0.155508
5,573,161,0.189189
6,192,188,0.164468
7,195,191,0.15513
8,221,216,0.169468
9,300,295,0.153846


In [294]:
def generate_recommandation(vendor_cust_df):
    """Recommend to the customers of a vendor the vendors similar to it.

    The similarity table is joined with ``vendor_cust_df`` on the target
    vendor, so every (customer, location) that ordered from a target vendor
    receives the matching suggested vendor.

    Args:
        vendor_cust_df: Table with the columns "vendor_id", "customer_id" and
            "LOCATION_NUMBER".

    Returns:
        A DataFrame with the columns "CID X LOC_NUM X VENDOR" (customer id,
        location number and suggested vendor joined by " X ") and "target"
        (always 1), with one row per distinct key.
    """
    df_similarities = generate_similarities_df(vendor_cust_df)
    recommandation_df = pd.merge(
        df_similarities, vendor_cust_df, left_on="target_vendor", right_on="vendor_id"
    )[["customer_id", "LOCATION_NUMBER", "suggested_vendor"]]
    # The same vendor can be suggested through several target vendors the
    # customer ordered from; keep each recommendation only once so that the
    # output keys are unique.
    recommandation_df = recommandation_df.drop_duplicates().reset_index(drop=True)

    df = pd.DataFrame()
    df["CID X LOC_NUM X VENDOR"] = (
        recommandation_df["customer_id"]
        + " X "
        + recommandation_df["LOCATION_NUMBER"].astype(str)
        + " X "
        + recommandation_df["suggested_vendor"].astype(str)
    )
    df["target"] = 1
    return df


output_df = generate_recommandation(vendor_cust_df)
# Check that every column of the schema exists and can be cast to its declared
# type. astype raises KeyError for a schema column missing from the frame, and
# ValueError or TypeError for values that cannot be converted, so all three
# are reported as a schema violation.
try:
    output_df.astype(RecommanderModel.output_schema)
except (KeyError, TypeError, ValueError):
    print("La DataFrame ne respecte pas le schéma donné.")
else:
    print("La DataFrame respecte le schéma donné.")

La DataFrame respecte le schéma donné.


In [317]:
# Write the recommendations to disk. index=False keeps the file to the two
# schema columns; the default would add the row index as an extra unnamed
# first column.
output_df.to_csv(os.path.join(DATA_DIRECTORY, "SampleSubmission.csv"), index=False)
output_df.head()

Unnamed: 0,CID X LOC_NUM X VENDOR,target
0,BU9FASD X 1 X 33,1
1,WLWU8CR X 3 X 33,1
2,E04MFB6 X 1 X 33,1
3,RV9HRH7 X 0 X 33,1
4,VQMQ05M X 0 X 33,1


## Travail sur la donnée
### Recommandation non-personnalisée
Cherchons ici le restaurant le plus apprécié des clients. Sur quels critères peut-on déterminer le restaurant le plus apprécié des clients?

Dans le fichier `orders.csv`, nous avons toutes les commandes faites par les clients pour des restaurants. Nous avons entre autres les caractéristiques qui peuvent mesurer l'appréciation d'un client pour un restaurant:
 * `'vendor_rating'` qui n'a de sens que si `'is_rated'` vaut `'Yes'`
 * `'is_favorite'`
 * `'grand_total'` qui montre l'engagement des clients

In [297]:
# Order columns used to score the vendors
keys_to_select = ["vendor_id", "vendor_rating", "grand_total", "is_favorite", "is_rated"]
# Work on a copy so that later edits leave order_df untouched
vendor_scoring = order_df.loc[:, keys_to_select].copy()
vendor_scoring

Unnamed: 0,vendor_id,vendor_rating,grand_total,is_favorite,is_rated
0,105,,7.6,,No
1,294,,8.7,,No
2,83,,14.4,,No
3,90,,7.1,,No
4,83,,27.2,,No
...,...,...,...,...,...
135298,67,,13.3,No,No
135299,79,5.0,9.5,No,Yes
135300,28,,18.2,No,No
135301,841,,7.7,No,No


In [308]:
# Encode the flags as integers: 1 when the value is "Yes", 0 for anything else
# (missing values included, so no fillna step is needed).
# The flags are read from the untouched order_df columns rather than from
# vendor_scoring itself: re-encoding vendor_scoring in place would turn every
# flag to 0 when the cell is run a second time, because an already encoded 1
# is not equal to "Yes".
for flag_column in ("is_favorite", "is_rated"):
    vendor_scoring[flag_column] = order_df[flag_column].eq("Yes").astype("int64")
vendor_scoring.head()

Unnamed: 0,vendor_id,vendor_rating,grand_total,is_favorite,is_rated
0,105,,7.6,0,0
1,294,,8.7,0,0
2,83,,14.4,0,0
3,90,,7.1,0,0
4,83,,27.2,0,0


In [309]:
# Ratings given by customers to vendors, copied from the orders table
rating_columns = ["customer_id", "vendor_rating", "vendor_id"]
rating_df = order_df[rating_columns].copy()

In [310]:
# Customer x vendor matrix holding the mean rating each customer gave to
# each vendor
pivot_table = rating_df.pivot_table(
    values="vendor_rating",
    index="customer_id",
    columns="vendor_id",
    aggfunc="mean",
)

# Display the pivot table
pivot_table

vendor_id,4,13,20,23,28,33,43,44,55,66,...,681,841,843,845,846,849,855,856,858,907
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000THBA,,,,,,,,,,,...,,,,,,,,,,
009UFS1,,,,,,,,,,,...,,,,,,,,,,
00GV4J4,,,,,,,,,,,...,,,,,,,,,,
00HWUU3,,,,,,,,,,,...,,,,,,,,,,
00OT8JX,2.307692,,,,,,,,,,...,,5.0,,,,,,,5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZRJABJ,,,,,,,,,,,...,,,,,,,,,,
ZZV76GY,,,,,,,,,,,...,,,,,,,,,,
ZZVGIVK,,,,,,,,,,,...,,,,,,,,,,
ZZVLIB5,,,,0.0,,,,,,,...,,,,,,,,,,


In [311]:
# Scatter plot of the customer locations: latitude on x, longitude on y.
# pandas alternative, kept for reference:
# train_locations_df.plot.scatter(x='latitude',y='longitude')
fig = px.scatter(x=train_locations_df["latitude"], y=train_locations_df["longitude"])

In [312]:
# Render the scatter figure
fig.show()

In [313]:
# Distinct location numbers present in the training locations
train_locations_df["location_number"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [314]:
# Number of order rows (with a customer id) per vendor, sorted from the
# fewest to the most
orders_per_vendor = order_df.groupby("vendor_id")[["customer_id"]].count()
orders_per_vendor.sort_values(by="customer_id")

Unnamed: 0_level_0,customer_id
vendor_id,Unnamed: 1_level_1
295,474
250,503
196,512
271,559
304,562
...,...
78,4643
84,5001
79,5117
105,5562
