In [None]:
# -------------------------------------------------------- Imports --------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from imblearn.over_sampling import SMOTE

In [None]:
# -------------------------------------------------------- Load Data --------------------------------------------------------
# Load the raw table from the CSV that sits next to the notebook.
df = pd.read_csv('cuisines.csv')
# info() prints its summary directly; head() goes last because a notebook cell
# only renders the value of its final expression (a mid-cell head() is discarded).
df.info()
df.head()

In [None]:
# Horizontal bars of how many rows carry each cuisine label; titled so the
# figure still makes sense when the notebook is skimmed.
df.cuisine.value_counts().plot.barh(title='Rows per cuisine')

In [None]:

# -------------------------------------------------------- Shape Data --------------------------------------------------------
# One sub-frame per cuisine label, selected with a boolean row mask.
thai_df = df.loc[df["cuisine"].eq("thai")]
japanese_df = df.loc[df["cuisine"].eq("japanese")]
chinese_df = df.loc[df["cuisine"].eq("chinese")]
indian_df = df.loc[df["cuisine"].eq("indian")]
korean_df = df.loc[df["cuisine"].eq("korean")]

# Report (rows, columns) for every subset.
print(f"thai df: {thai_df.shape}")
print(f"japanese df: {japanese_df.shape}")
print(f"chinese df: {chinese_df.shape}")
print(f"indian df: {indian_df.shape}")
print(f"korean df: {korean_df.shape}")

In [None]:
# ------------------------------------------------- Ingredients By Cuisine --------------------------------------------------
def create_ingredient_df(df):
    """Total every ingredient column and rank the ingredients that occur.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are ingredient indicators/counts, optionally
        accompanied by the non-ingredient columns 'cuisine' and 'Unnamed: 0'.

    Returns
    -------
    pandas.DataFrame
        A single-column frame named 'value', indexed by ingredient (the
        original column labels), holding each column's sum. Ingredients whose
        total is 0 are removed and the rows are sorted by 'value', largest first.
    """
    # Drop the non-ingredient columns *before* aggregating. Transposing the
    # whole frame first (with the string 'cuisine' column still present) turns
    # everything into object dtype, which makes the sum slow and can leave an
    # object-dtype result that pandas' plotting refuses to draw.
    # errors='ignore' lets frames that lack one of these columns pass through.
    totals = (
        df.drop(columns=['cuisine', 'Unnamed: 0'], errors='ignore')
        .sum(axis=0)
        .to_frame('value')
    )
    # Keep only ingredients that actually appear in this subset.
    used = totals[totals['value'] != 0]
    return used.sort_values(by='value', ascending=False)

In [None]:
# --------------- Visualize ---------------
# ----- Thai -----
# Rank ingredient totals for the Thai subset and chart the ten largest;
# the title distinguishes this figure from the other per-cuisine charts.
thai_ingredient_df = create_ingredient_df(thai_df)
thai_ingredient_df.head(10).plot.barh(title='Top 10 ingredients: thai')

In [None]:
# ----- Japanese -----
# Rank ingredient totals for the Japanese subset and chart the ten largest;
# the title distinguishes this figure from the other per-cuisine charts.
japanese_ingredient_df = create_ingredient_df(japanese_df)
japanese_ingredient_df.head(10).plot.barh(title='Top 10 ingredients: japanese')

In [None]:
# ----- Chinese -----
# Rank ingredient totals for the Chinese subset and chart the ten largest;
# the title distinguishes this figure from the other per-cuisine charts.
chinese_ingredient_df = create_ingredient_df(chinese_df)
chinese_ingredient_df.head(10).plot.barh(title='Top 10 ingredients: chinese')

In [None]:
# ----- Indian -----
# Rank ingredient totals for the Indian subset and chart the ten largest;
# the title distinguishes this figure from the other per-cuisine charts.
indian_ingredient_df = create_ingredient_df(indian_df)
indian_ingredient_df.head(10).plot.barh(title='Top 10 ingredients: indian')

In [None]:
# ----- Korean -----
# Rank ingredient totals for the Korean subset and chart the ten largest;
# the title distinguishes this figure from the other per-cuisine charts.
korean_ingredient_df = create_ingredient_df(korean_df)
korean_ingredient_df.head(10).plot.barh(title='Top 10 ingredients: korean')

In [None]:
# --------------------------------------------- Drop Most Common Ingredients ---------------------------------------------
# Features: everything except the label, the 'Unnamed: 0' column, and the
# rice / garlic / ginger columns. Labels: the cuisine column, one per row.
excluded_columns = ['cuisine', 'Unnamed: 0', 'rice', 'garlic', 'ginger']
feature_df = df.drop(columns=excluded_columns)
labels_df = df['cuisine']
feature_df.head()

In [None]:
# ---------------------------------------------------- Balance Data ----------------------------------------------------
# SMOTE draws random synthetic samples; pin the seed so a fresh
# Restart-and-Run-All reproduces the same resampled data.
oversample = SMOTE(random_state=42)
# fit_resample returns the resampled features and their matching labels.
transformed_feature_df, transformed_label_df = oversample.fit_resample(feature_df, labels_df)

In [None]:
# Per-label row counts after resampling, then the original counts for comparison.
resampled_counts = transformed_label_df.value_counts()
original_counts = df['cuisine'].value_counts()
print(f'new label count: {resampled_counts}')
print(f'old label count: {original_counts}')

In [None]:
# ---------------------------------------------------- Correct Data ----------------------------------------------------
# Reassemble one table: the label column first, then every feature column,
# aligned on the row index (outer join is pd.concat's default).
transformed_df = pd.concat(
    [transformed_label_df, transformed_feature_df],
    axis='columns',
)

In [None]:
# --------------------------------------------------- Output To CSV ---------------------------------------------------
# info() prints its summary directly.
transformed_df.info()
# Written with to_csv's defaults, so the row index is saved as an unnamed
# leading column; left unchanged in case readers of this file rely on that layout.
transformed_df.to_csv("cleaned_cuisines.csv")
# Preview goes last: a notebook cell only renders its final expression, so a
# head() placed earlier in the cell is silently discarded.
transformed_df.head()