# Final Project: Restaurant Recommender System

## Imports

In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split

In [35]:
# Load the project configuration. The context manager closes the file handle
# as soon as parsing is finished (it was previously left open).
with open("config.yml", "r") as config_file:
    # NOTE(review): yaml.load with FullLoader is kept as-is; config.yml must
    # remain a trusted, project-owned file. Confirm before switching loaders,
    # since a different loader may not accept the same tags.
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Folder that contains the CSV files read by the following cells.
DATA_DIRECTORY = config["DATA_DIRECTORY"]
config

{'DATA_DIRECTORY': 'data',
 'output_schema': {'CID X LOC_NUM X VENDOR': str, 'target': int}}

## Chargement des données

In [7]:
# Read every CSV of the data directory in one pass. The targets on the left
# line up, in order, with the file names on the right:
#   order_df            -> orders placed by customers with vendors
#   vendors_df          -> vendor data (location, identifier)
#   train_customer_df   -> customer data (date of birth, ID, gender)
#   train_locations_df  -> customer locations
#   test_customer_df    -> test set of customer data
#   test_locations_df   -> test set of customer locations
(
    order_df,
    vendors_df,
    train_customer_df,
    train_locations_df,
    test_customer_df,
    test_locations_df,
) = (
    pd.read_csv(os.path.join(DATA_DIRECTORY, csv_name))
    for csv_name in (
        "orders.csv",
        "vendors.csv",
        "train_customers.csv",
        "train_locations.csv",
        "test_customers.csv",
        "test_locations.csv",
    )
)

  order_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "orders.csv"))


{'CID X LOC_NUM X VENDOR': str, 'target': int}

## Définition d'une super-classe de modèle

In [36]:
class RecommanderModel:
    """Base class shared by the recommender models of this notebook.

    The `train`, `predict` and `save_predictions` methods are no-op
    placeholders that accept arbitrary keyword arguments; subclasses are
    expected to override them.
    """

    # Label printed by `print_model`; subclasses can override it.
    model_type = "undefined"
    # Value of the "output_schema" entry of the loaded configuration.
    output_schema = config["output_schema"]
    # Backward-compatible alias: the attribute was originally misspelled.
    outpu_schema = output_schema

    def __init__(self, model_name: str) -> None:
        """Store the name used to identify this model instance."""
        self.model_name = model_name

    def train(self, **kwargs) -> None:
        """Placeholder: accepts any keyword arguments and does nothing."""
        pass

    def predict(self, **kwargs) -> None:
        """Placeholder: accepts any keyword arguments and does nothing."""
        pass

    def save_predictions(self, **kwargs) -> None:
        """Placeholder: accepts any keyword arguments and does nothing."""
        pass

    def print_model(self):
        """Print the model type and the model name on two lines."""
        print(f"{self.model_type} recommander model\nmodel name: '{self.model_name}'")

## Travail sur la donnée
### Recommandation non-personnalisée
Cherchons ici le restaurant le plus apprécié des clients. Sur quels critères peut-on déterminer le restaurant le plus apprécié des clients?

Dans le fichier `orders.csv`, nous avons toutes les commandes passées par les clients auprès des restaurants. On y trouve entre autres des caractéristiques qui peuvent mesurer l'appréciation d'un client pour un restaurant :
 * `'vendor_rating'` qui n'a de sens que si `'is_rated'` vaut `'Yes'`
 * `'is_favorite'`
 * `'grand_total'` qui montre l'engagement des clients

In [4]:
# Order-level columns kept to score vendors.
keys_to_select = ["vendor_id", "vendor_rating", "grand_total", "is_favorite", "is_rated"]
vendor_scoring = order_df.loc[:, keys_to_select].copy()

# Encode both flags as integers: 1 when the value is exactly "Yes", 0 for
# anything else. Missing values never equal "Yes", so they also become 0.
vendor_scoring["is_favorite"] = vendor_scoring["is_favorite"].eq("Yes").astype("int64")
vendor_scoring["is_rated"] = vendor_scoring["is_rated"].eq("Yes").astype("int64")
vendor_scoring.head()

Unnamed: 0,vendor_id,vendor_rating,grand_total,is_favorite,is_rated
0,105,,7.6,0,0
1,294,,8.7,0,0
2,83,,14.4,0,0
3,90,,7.1,0,0
4,83,,27.2,0,0


In [5]:
# Sum every scoring column per vendor, then rank vendors by their summed
# rating, highest first.
vendor_stat = vendor_scoring.groupby("vendor_id").sum()
vendor_stat = vendor_stat.sort_values("vendor_rating", ascending=False)
vendor_stat.head()

Unnamed: 0_level_0,vendor_rating,grand_total,is_favorite,is_rated
vendor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
79,5190.0,65363.1,114,1114
113,4852.0,114366.1,93,1031
84,3889.0,83253.4,73,893
105,3477.0,75863.6,59,777
386,3460.0,39981.9,66,755


In [32]:
# Show the training customers whose `status` column equals 1.
train_customer_df.loc[train_customer_df["status"] == 1]

Unnamed: 0,akeed_customer_id,gender,dob,status,verified,language,created_at,updated_at
0,TCHWPBT,Male,,1,1,EN,2018-02-07 19:16:23,2018-02-07 19:16:23
1,ZGFSYCZ,Male,,1,1,EN,2018-02-09 12:04:42,2018-02-09 12:04:41
3,952DBJQ,Male,,1,1,EN,2018-03-15 19:47:07,2018-03-15 19:47:07
4,1IX6FXS,Male,,1,1,EN,2018-03-15 19:57:01,2018-03-15 19:57:01
5,IL9MJSW,Male,,1,1,EN,2018-03-16 18:22:00,2018-03-16 18:22:00
...,...,...,...,...,...,...,...,...
34669,JAI79PE,,,1,1,,2020-03-03 13:01:13,2020-03-03 13:02:21
34670,TR75VFL,,,1,1,,2020-03-03 13:22:35,2020-03-03 13:22:41
34671,SQMJ08H,,,1,1,,2020-03-03 13:24:27,2020-03-03 13:24:44
34672,9LW9CHN,,,1,1,,2020-03-03 13:24:28,2020-03-03 13:28:00


In [36]:
# NOTE: this filtered view (test customers with status == 1) is evaluated but
# neither stored nor displayed; only the last expression of the cell is shown.
test_customer_df.loc[test_customer_df["status"] == 1]
test_locations_df

Unnamed: 0,customer_id,location_number,location_type,latitude,longitude
0,Z59FTQD,0,,126.032278,-9.106019
1,0JP29SK,0,Home,0.278709,-78.623847
2,0JP29SK,1,Home,0.124485,-78.605621
3,0JP29SK,2,,-0.113891,-78.577449
4,0JP29SK,3,,-0.848796,0.136726
...,...,...,...,...,...
16715,L9G4OFV,4,,-0.197722,0.609199
16716,L9G4OFV,5,,-0.343042,0.626064
16717,FDZFYBA,0,Home,-0.974907,-0.177863
16718,UTKHR1C,0,Other,1.058539,0.001628


In [23]:
# Independent copy of the (customer, rating, vendor) columns of the orders.
rating_df = order_df.loc[:, ["customer_id", "vendor_rating", "vendor_id"]].copy()

27445

In [37]:
# Customer x vendor matrix: each cell holds the mean rating a customer gave
# to a vendor (NaN where no rating value is available for that pair).
pivot_table = rating_df.pivot_table(
    values="vendor_rating",
    index="customer_id",
    columns="vendor_id",
    aggfunc="mean",
)

# Display the pivot table
pivot_table

vendor_id,4,13,20,23,28,33,43,44,55,66,...,681,841,843,845,846,849,855,856,858,907
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000THBA,,,,,,,,,,,...,,,,,,,,,,
009UFS1,,,,,,,,,,,...,,,,,,,,,,
00GV4J4,,,,,,,,,,,...,,,,,,,,,,
00HWUU3,,,,,,,,,,,...,,,,,,,,,,
00OT8JX,2.307692,,,,,,,,,,...,,5.0,,,,,,,5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZRJABJ,,,,,,,,,,,...,,,,,,,,,,
ZZV76GY,,,,,,,,,,,...,,,,,,,,,,
ZZVGIVK,,,,,,,,,,,...,,,,,,,,,,
ZZVLIB5,,,,0.0,,,,,,,...,,,,,,,,,,


In [7]:
# Scatter plot of the training customer locations, with latitude on the
# x-axis and longitude on the y-axis.
# (pandas alternative: train_locations_df.plot.scatter(x='latitude', y='longitude'))
fig = px.scatter(
    x=train_locations_df.loc[:, "latitude"],
    y=train_locations_df.loc[:, "longitude"],
)

In [8]:
# Render the scatter figure of customer locations stored in `fig`.
fig.show()

In [9]:
# Distinct values of `location_number`, in order of first appearance.
pd.unique(train_locations_df["location_number"])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [16]:
# Count the non-null `customer_id` entries of each vendor (one per order
# row), sorted from the least to the most ordered-from vendor.
order_df.groupby("vendor_id")[["customer_id"]].count().sort_values("customer_id")

Unnamed: 0_level_0,customer_id
vendor_id,Unnamed: 1_level_1
295,474
250,503
196,512
271,559
304,562
...,...
78,4643
84,5001
79,5117
105,5562
