# Final Project: Restaurant Recommender System

## Imports

In [3]:
import pandas as pd
import numpy as np
import yaml
import os
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import f1_score

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [4]:
# Load the notebook configuration from config.yml. The context manager
# guarantees the file handle is closed even if YAML parsing fails.
with open("config.yml", "r") as config_file:
    # NOTE(review): FullLoader can construct more than plain YAML types;
    # if config.yml only holds simple scalars/mappings, yaml.safe_load would
    # be the safer choice — confirm before switching.
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Directory that contains the CSV files read in the data-loading cell.
DATA_DIRECTORY = config["DATA_DIRECTORY"]

## Chargement des données

In [5]:
def _read_data_csv(filename):
    """Read one CSV file located under DATA_DIRECTORY into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIRECTORY, filename))


# Orders placed by customers with vendors
order_df = _read_data_csv("orders.csv")

# Vendor data (location, identifier)
vendors_df = _read_data_csv("vendors.csv")

# Customer data (date of birth, ID, gender)
train_customer_df = _read_data_csv("train_customers.csv")

# Customer locations
train_locations_df = _read_data_csv("train_locations.csv")

# Test set of customer data
test_customer_df = _read_data_csv("test_customers.csv")

# Test set of customer locations
test_locations_df = _read_data_csv("test_locations.csv")

  order_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "orders.csv"))


In [7]:
# Attach each order to the customer location it was placed from. The join is
# pd.merge's default inner join, so orders without a matching
# (customer_id, location_number) row in train_locations_df are dropped.
order_with_loc_df = pd.merge(
    order_df[
        ["customer_id", "LOCATION_NUMBER", "vendor_id", "vendor_rating", "is_rated"]
    ],
    train_locations_df,
    left_on=["customer_id", "LOCATION_NUMBER"],
    right_on=["customer_id", "location_number"],
).assign(
    # Encode the "Yes"/other flag as 1/0. Anything that is not exactly "Yes"
    # (including missing values) becomes 0, so no fillna is needed afterwards.
    is_rated=lambda df: df["is_rated"].eq("Yes").astype("int64"),
    # Missing ratings are replaced by 0.
    vendor_rating=lambda df: df["vendor_rating"].fillna(0),
)
order_with_loc_df

Unnamed: 0,customer_id,LOCATION_NUMBER,vendor_id,vendor_rating,is_rated,location_number,location_type,latitude,longitude
0,92PEE24,0,105,0.0,0,0,,-0.132100,-78.575297
1,92PEE24,0,105,0.0,0,0,,-0.132100,-78.575297
2,92PEE24,0,105,0.0,0,0,,-0.132100,-78.575297
3,QS68UD8,0,294,0.0,0,0,Work,-0.393396,-78.544417
4,MB7VY5F,0,83,0.0,0,0,,-1.072823,-78.464121
...,...,...,...,...,...,...,...,...,...
135298,3S6VG6R,1,199,5.0,1,1,,2.284875,0.717124
135299,ND4PIJL,0,907,0.0,0,0,,13.380083,-1.387421
135300,1NRK5HF,0,105,0.0,0,0,,-0.772600,0.231851
135301,QDXLWM7,1,28,0.0,0,1,,1.751487,0.375234


In [8]:
# First split: hold out 20% of the located orders as the test set.
_train_val_order_df, test_order_df = train_test_split(
    order_with_loc_df,
    test_size=0.2,
    random_state=42,
)

# Second split: hold out 20% of the remaining rows as the validation set.
train_order_df, val_order_df = train_test_split(
    _train_val_order_df,
    test_size=0.2,
    random_state=42,
)
train_order_df

Unnamed: 0,customer_id,LOCATION_NUMBER,vendor_id,vendor_rating,is_rated,location_number,location_type,latitude,longitude
121366,5ARTXD7,1,681,5.0,1,1,,-0.036682,-0.069342
63263,JTDL6KB,0,386,0.0,0,0,,-0.492942,0.179312
30695,H9REAWE,0,78,0.0,0,0,Home,-0.025458,-78.587900
80592,7I7SY2D,1,4,0.0,0,1,,0.415142,0.931376
132382,EIA3K80,3,92,0.0,0,3,Other,0.107888,0.596460
...,...,...,...,...,...,...,...,...,...
25191,4QNVW0N,1,176,0.0,0,1,Other,0.141303,-78.607609
35276,NBHJZZB,0,78,0.0,0,0,,0.275762,-78.623499
124517,3I2NFF0,0,289,0.0,0,0,,0.431060,-0.005375
51877,0Z52KAY,1,843,0.0,0,1,,-0.795356,0.082256


In [9]:
# Column names used by generate_comparison_df / calculate_f1_score.

# Column on which the two input frames are merged.
VAL_KEY = "val"
# Column of each input frame that is renamed before the merge.
PRED_KEY = "pred"
# Name given to PRED_KEY of the second frame; read as the true labels.
VALIDATION_KEY = "validation"
# Name given to PRED_KEY of the first frame; read as the predicted labels.
TRAIN_KEY = "training"

In [10]:
def generate_comparison_df(train_set, val_set):
    """Join two frames on VAL_KEY with their PRED_KEY columns side by side.

    The PRED_KEY column of ``train_set`` is renamed to TRAIN_KEY and the
    PRED_KEY column of ``val_set`` to VALIDATION_KEY, then both frames are
    merged on VAL_KEY (default inner join).

    Args:
        train_set: DataFrame containing the VAL_KEY and PRED_KEY columns.
        val_set: DataFrame containing the VAL_KEY and PRED_KEY columns.

    Returns:
        The merged DataFrame.
    """
    left = train_set.rename(columns={PRED_KEY: TRAIN_KEY})
    right = val_set.rename(columns={PRED_KEY: VALIDATION_KEY})
    return left.merge(right, on=VAL_KEY)


def calculate_f1_score(train_set, val_set):
    """Compute the F1 score between the two frames' PRED_KEY columns.

    Both frames are aligned with generate_comparison_df; the VALIDATION_KEY
    column is passed to ``f1_score`` as the true labels and the TRAIN_KEY
    column as the predicted labels.

    Args:
        train_set: DataFrame containing the VAL_KEY and PRED_KEY columns.
        val_set: DataFrame containing the VAL_KEY and PRED_KEY columns.

    Returns:
        The value returned by ``sklearn.metrics.f1_score``.
    """
    comparison = generate_comparison_df(train_set, val_set)
    return f1_score(comparison[VALIDATION_KEY], comparison[TRAIN_KEY])


# NOTE(review): `train_set` and `val_set` are not defined in any cell above,
# so this call raises NameError on a fresh kernel (see the cell output).
# TODO: build the two frames (each needs the "val" and "pred" columns read by
# generate_comparison_df) before this call.
calculate_f1_score(train_set, val_set)

NameError: name 'train_set' is not defined