In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations
import numpy as np
from scipy.stats import chi2_contingency

In [2]:
clean_df = pd.read_csv("../data/cleaned/clean_data.csv")
clean_df

Unnamed: 0,restaurant_name,country,region,province,city,latitude,longitude,price_range,meals,cuisines,vegetarian_friendly,vegan_options,gluten_free,avg_rating,total_reviews_count,default_language,meals_list
0,Le 147,France,Nouvelle-Aquitaine,Haute-Vienne,Saint-Jouvent,45.961674,1.169131,Not Available,"Lunch, Dinner",French,N,N,N,4.0,36.0,english,"['Dinner', 'Lunch']"
1,Le Saint Jouvent,France,Nouvelle-Aquitaine,Haute-Vienne,Saint-Jouvent,45.957040,1.205480,Not Available,Unknown,Unknown,N,N,N,4.0,5.0,all languages,[]
2,Au Bout du Pont,France,Centre-Val de Loire,Berry,Rivarennes,46.635895,1.386133,Not Available,"Dinner, Lunch, Drinks",French,N,N,N,5.0,13.0,english,"['Drinks', 'Dinner', 'Lunch']"
3,Le Relais de Naiade,France,Nouvelle-Aquitaine,Correze,Lacelle,45.642610,1.824460,Not Available,"Lunch, Dinner",French,N,N,N,4.0,34.0,english,"['Dinner', 'Lunch']"
4,Relais Du MontSeigne,France,Occitanie,Aveyron,Saint-Laurent-de-Levezou,44.208860,2.960470,Not Available,"Lunch, Dinner",French,N,N,N,4.5,11.0,all languages,"['Dinner', 'Lunch']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275086,Froggatts Tea Rooms & Vintage Shop,United Kingdom,Shropshire,Ludlow,Ashford Bowdler,52.322475,-2.712235,Not Available,"Breakfast, Lunch",Unknown,N,N,N,5.0,7.0,english,"['Breakfast', 'Lunch']"
275087,The Clive Arms,United Kingdom,Shropshire,Ludlow,Bromfield,52.388336,-2.761586,€21-€46,Unknown,European,Y,Y,Y,4.0,518.0,english,[]
275088,The Kings Arms,United Kingdom,Yorkshire,North Yorkshire,Goole,53.730110,-1.156390,€12-€23,"Lunch, Dinner, Drinks",European,N,N,N,5.0,2.0,english,"['Drinks', 'Dinner', 'Lunch']"
275089,Java Bar,United Kingdom,Yorkshire,North Yorkshire,Rawcliffe,53.979576,-1.107181,Not Available,"Breakfast, Lunch",Cafe,N,N,N,5.0,5.0,english,"['Breakfast', 'Lunch']"


In [3]:
clean_df.columns

Index(['restaurant_name', 'country', 'region', 'province', 'city', 'latitude',
       'longitude', 'price_range', 'meals', 'cuisines', 'vegetarian_friendly',
       'vegan_options', 'gluten_free', 'avg_rating', 'total_reviews_count',
       'default_language', 'meals_list'],
      dtype='str')

In [6]:
clean_df['meals'].unique

<bound method Series.unique of 0                 Lunch, Dinner
1                       Unknown
2         Dinner, Lunch, Drinks
3                 Lunch, Dinner
4                 Lunch, Dinner
                  ...          
275086         Breakfast, Lunch
275087                  Unknown
275088    Lunch, Dinner, Drinks
275089         Breakfast, Lunch
275090                  Unknown
Name: meals, Length: 275091, dtype: str>

In [6]:
clean_df.dtypes

restaurant_name            str
country                    str
region                     str
province                   str
city                       str
latitude               float64
longitude              float64
price_range                str
meals                      str
cuisines                   str
vegetarian_friendly        str
vegan_options              str
gluten_free                str
avg_rating             float64
total_reviews_count    float64
default_language           str
meals_list                 str
dtype: object

## Create CSV for tables that does not have Foreign Keys

In [None]:
"""
1. City: city_id, name
2. meals: meal_id, meal_type
3. review_summary: review_id, avg_rating, total_reviews_count, excellent, terrible, restaurant_id
4. dietary_options: diet_id, vegetarian_friendly, vegan_options, gluten_free
5. price_range: price_id, price_range
6. language: language_id, name
"""

#### CITY TABLE

In [3]:
city = pd.DataFrame({"city_id": [i+1 for i in range(clean_df["city"].nunique())], "city_name": clean_df["city"].unique()})
city

Unnamed: 0,city_id,city_name
0,1,Saint-Jouvent
1,2,Rivarennes
2,3,Lacelle
3,4,Saint-Laurent-de-Levezou
4,5,Le Crozet
...,...,...
22195,22196,Whissendine
22196,22197,Orleton
22197,22198,Ashford Bowdler
22198,22199,Bromfield


In [None]:
city.to_csv("city.csv", index=False, sep=";", encoding="utf-8")

#### MEALS TABLE

In [7]:
meal_types_df = clean_df["meals"].str.split(",", expand=True)

meal_types_list = []
for row in range(len(meal_types_df)):
    for col in range(5):
        if meal_types_df.iloc[row, col] and meal_types_df.iloc[row, col] not in meal_types_list:
            meal_types_list.append(meal_types_df.iloc[row, col])

In [None]:
meal_types_list = [ elem.strip() for elem in meal_types_list ]
meal_types_set = set(meal_types_list)
print(meal_types_set)

In [None]:
meals = pd.DataFrame({"meal_id": [i+1 for i in range(len(list(meal_types_set)))], "meal_type": list(meal_types_set)})
meals

In [None]:
meals.to_csv("meals.csv", index=False, sep=";", encoding="utf-8")

#### DIETARY OPTIONS TABLE

In [7]:
dietary_options1 = pd.DataFrame({"diet_id": [i+1 for i in range(clean_df["vegetarian_friendly"].nunique())  ], 
                                "vegetarian_friendly": clean_df["vegetarian_friendly"].unique(), 
                                "vegan_options": clean_df["vegan_options"].unique(), 
                                "gluten_free": clean_df["gluten_free"].unique()})

data = {"diet_id": 3, "vegetarian_friendly": "y", "vegan_options": "Y", "gluten_free": "N"}
data1 = {"diet_id": 4, "vegetarian_friendly": "n", "vegan_options": "N", "gluten_free": "Y"}
data2 = {"diet_id": 5, "vegetarian_friendly": "y", "vegan_options": "N", "gluten_free": "Y"}

dietary_options2 = pd.DataFrame([data, data1, data2])

dietary_options = pd.concat([dietary_options1, dietary_options2], axis=0)
dietary_options.reset_index(drop = True, inplace=True)
dietary_options

Unnamed: 0,diet_id,vegetarian_friendly,vegan_options,gluten_free
0,1,N,N,N
1,2,Y,Y,Y
2,3,y,Y,N
3,4,n,N,Y
4,5,y,N,Y


In [None]:
dietary_options.to_csv("dietary_options.csv", index=False, sep=";", encoding="utf-8")

#### PRICE RANGE TABLE

In [8]:
price = pd.DataFrame({"price_id": [i+1 for i in range(clean_df["price_range"].nunique())], 
                                "price_range": clean_df["price_range"].unique()})
price

Unnamed: 0,price_id,price_range
0,1,Not Available
1,2,€14-€29
2,3,€8-€17
3,4,€10-€35
4,5,€12-€26
...,...,...
3881,3882,"€231-€1,097"
3882,3883,€13-€68
3883,3884,€1-€8
3884,3885,€307-€827


In [None]:
price.to_csv("price.csv", index=False, sep=";", encoding="utf-8")

#### LANGUAGE TABLE

In [9]:
language = pd.DataFrame({"language_id": [i+1 for i in range(clean_df["default_language"].nunique())], 
                                "language_name": clean_df["default_language"].unique()})
language

Unnamed: 0,language_id,language_name
0,1,english
1,2,all languages


In [None]:
language.to_csv("language.csv", index=False, sep=";", encoding="utf-8")

#### REVIEW SUMMARY TABLE

In [13]:
review_summary = (
    clean_df[[
        "restaurant_name",
        "city",
        "avg_rating",
        "total_reviews_count"
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
)

review_summary.insert(0, "review_id", review_summary.index + 1)
review_summary

Unnamed: 0,review_id,restaurant_name,city,avg_rating,total_reviews_count
0,1,Le 147,Saint-Jouvent,4.0,36.0
1,2,Le Saint Jouvent,Saint-Jouvent,4.0,5.0
2,3,Au Bout du Pont,Rivarennes,5.0,13.0
3,4,Le Relais de Naiade,Lacelle,4.0,34.0
4,5,Relais Du MontSeigne,Saint-Laurent-de-Levezou,4.5,11.0
...,...,...,...,...,...
274990,274991,Froggatts Tea Rooms & Vintage Shop,Ashford Bowdler,5.0,7.0
274991,274992,The Clive Arms,Bromfield,4.0,518.0
274992,274993,The Kings Arms,Goole,5.0,2.0
274993,274994,Java Bar,Rawcliffe,5.0,5.0


In [None]:
review_summary.to_csv("review_summary.csv", index=False, sep=";", encoding="utf-8")

## Create CSV for tables that does have Foreign Keys

In [None]:
"""
1. country: country_id, name, city_id

2. location: location_id, lat, long, city_id

3. restaurant_language: id, rest_id, lang_id

4. restaurant_meals: id, rest_id, meal_id

5. restaurant_dietary_options: id, rest_id, diet_id
"""

#### COUNTRY TABLE

In [18]:
city_table = (
    clean_df[["city", "country"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

city_table["city_id"] = city_table.index + 1

country = (
    city_table[["city_id", "country"]]
    .copy()
    .reset_index(drop=True)
)

country["country_id"] = country.index + 1
country.rename(columns={"country": "country_name"}, inplace=True)

country = country[["country_id", "city_id", "country_name"]]
country

Unnamed: 0,country_id,city_id,country_name
0,1,1,France
1,2,2,France
2,3,3,France
3,4,4,France
4,5,5,France
...,...,...,...
22212,22213,22213,United Kingdom
22213,22214,22214,United Kingdom
22214,22215,22215,United Kingdom
22215,22216,22216,United Kingdom


In [None]:
country.to_csv("country.csv", index=False, sep=";", encoding="utf-8")

#### LOCATION TABLE

In [14]:
city_location = (
    clean_df[["restaurant_name", "city", "latitude", "longitude"]]
    .drop_duplicates()
)

location = (
    city_location
    .merge(city, left_on="city", right_on = "city_name",how="left")
)
location = location.reset_index(drop=True)
location.insert(0, "location_id", location.index + 1)

location = location[["location_id", "city_id", "latitude", "longitude", "restaurant_name", "city"]]

location

Unnamed: 0,location_id,city_id,latitude,longitude,restaurant_name,city
0,1,1,45.961674,1.169131,Le 147,Saint-Jouvent
1,2,1,45.957040,1.205480,Le Saint Jouvent,Saint-Jouvent
2,3,2,46.635895,1.386133,Au Bout du Pont,Rivarennes
3,4,3,45.642610,1.824460,Le Relais de Naiade,Lacelle
4,5,4,44.208860,2.960470,Relais Du MontSeigne,Saint-Laurent-de-Levezou
...,...,...,...,...,...,...
275086,275087,22198,52.322475,-2.712235,Froggatts Tea Rooms & Vintage Shop,Ashford Bowdler
275087,275088,22199,52.388336,-2.761586,The Clive Arms,Bromfield
275088,275089,21019,53.730110,-1.156390,The Kings Arms,Goole
275089,275090,21376,53.979576,-1.107181,Java Bar,Rawcliffe


In [None]:
location.to_csv("location.csv", index=False, sep=";", encoding="utf-8")

#### RESTAURANT LANGUAGE TABLE

In [19]:
restaurant_language = (
    clean_df[[
        "restaurant_name", "default_language"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

#display(restaurant.head())
#display(language.head())

restaurant_language = restaurant_language.merge(
    language[["language_id", "language_name"]],
    left_on="default_language", right_on = "language_name",
    how="inner"
)


restaurant_language = restaurant_language.merge(
    restaurant[["restaurant_id", "restaurant_name"]],
    on="restaurant_name",
    how="inner"
)

restaurant_language = restaurant_language.reset_index(drop=True)

restaurant_language.insert(0, "id", restaurant_language.index + 1)

restaurant_language = restaurant_language[["id", "language_id", "restaurant_id"]]

display(restaurant_language.head())

Unnamed: 0,id,language_id,restaurant_id
0,1,1,1
1,2,1,58432
2,3,2,2
3,4,1,3
4,5,1,4


In [20]:
restaurant.shape

(767258, 5)

In [22]:
restaurant_language.to_csv("restaurant_language.csv", index=False, sep=";", encoding="utf-8")

#### RESTAURANT MEALS TABLE

In [None]:
restaurant_meals = (
    clean_df[[
        "restaurant_name", "meals"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

restaurant_meals = restaurant_meals.merge(
    meals[["meal_id", "meal_type"]],
    left_on="meals", right_on = "meal_type", # type doesn't match
    how="inner"
)


restaurant_meals = restaurant_meals.merge(
    restaurant[["restaurant_id", "restaurant_name"]],
    on="restaurant_name",
    how="inner"
)


restaurant_meals = restaurant_meals.reset_index(drop=True)

restaurant_meals.insert(0, "id", restaurant_meals.index + 1)

restaurant_meals = restaurant_meals[["id", "meal_id", "restaurant_id"]]

restaurant_meals.head()

In [None]:
restaurant_meals.to_csv("restaurant_meals.csv", index=False, sep=";", encoding="utf-8"))

#### RESTAURANT DIETARY OPTIONS TABLE

In [23]:
restaurant_dietary_options = (
    clean_df[[
        "restaurant_name", "vegetarian_friendly"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

dietary_options = dietary_options.drop_duplicates(
    subset=["vegetarian_friendly"]
)

restaurant_dietary_options = restaurant_dietary_options.merge(
    dietary_options[["diet_id", "vegetarian_friendly"]],
    on = "vegetarian_friendly",
    how="left"
)


restaurant_dietary_options = restaurant_dietary_options.merge(
    restaurant[["restaurant_id", "restaurant_name"]],
    on="restaurant_name",
    how="left"
)

restaurant_dietary_options = restaurant_dietary_options.reset_index(drop=True)
restaurant_dietary_options.insert(0, "id", restaurant_dietary_options.index + 1)

restaurant_dietary_options = restaurant_dietary_options[["id", "diet_id", "restaurant_id"]]

display(restaurant_dietary_options.head())

Unnamed: 0,id,diet_id,restaurant_id
0,1,1,1
1,2,1,58432
2,3,1,2
3,4,1,3
4,5,1,4


In [24]:
restaurant_dietary_options.shape

(1295677, 3)

In [25]:
restaurant_dietary_options.to_csv("restaurant_dietary_options.csv", index=False, sep=";", encoding="utf-8")

## Create CSV for the restaurant table

In [None]:
"""
Restaurant: restaurant_id, name, claimed, awards, open_days_per_week, original_open_hours, location_id, price_id, review_id
"""

In [15]:
restaurant = (
    clean_df[[
        "restaurant_name",
        "city",              # needed for location merge
        "price_range",       # needed for price merge
        "avg_rating",
        "total_reviews_count",
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
)

restaurant.insert(0, "restaurant_id", restaurant.index + 1)

# restaurant.rename(columns={"restaurant_name": "name"}, inplace=True)

restaurant = restaurant.merge(
    location,
    on=["restaurant_name","city"],
    how="inner"
)

restaurant = restaurant.merge(
    price[["price_id", "price_range"]],
    on="price_range",
    how="inner"
)

restaurant = restaurant.merge(
    review_summary,
    on=["restaurant_name","city"],
    how="inner"
)

restaurant = restaurant.drop(columns=['avg_rating_y', 'total_reviews_count_y', "latitude", "longitude"])

restaurant = restaurant[[
    "restaurant_id",
    "restaurant_name",
    "location_id",
    "price_id",
    "review_id"
]]

display(restaurant.head())

Unnamed: 0,restaurant_id,restaurant_name,location_id,price_id,review_id
0,1,Le 147,1,1,1
1,2,Le Saint Jouvent,2,1,2
2,3,Au Bout du Pont,3,1,3
3,4,Le Relais de Naiade,4,1,4
4,5,Relais Du MontSeigne,5,1,5


In [17]:
restaurant.to_csv("restaurant.csv", index=False, sep=";", encoding="utf-8")

In [16]:
restaurant.shape

(767258, 5)