In [1]:
import ast

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import RandomOverSampler

# Import the cleaned wine DataFrame and binary-encode the target (food pairings)

In [8]:
def create_binary_df(file_path, min_mentions=15_000):
    """Load the cleaned wine CSV and binary-encode its food pairings.

    The ``Harmonize`` column is expected to hold, per wine, the string form
    of a Python list of food names. Every distinct food becomes a 0/1
    indicator column. Rarely mentioned foods are then removed, followed by
    any wine that is left without a single remaining food.

    Parameters
    ----------
    file_path : str or path-like
        Location of the cleaned CSV; passed unchanged to ``pd.read_csv``.
    min_mentions : int, default 15_000
        A food column is kept only when it is set for MORE than this many
        wines (columns with a count ``<= min_mentions`` are dropped).

    Returns
    -------
    pandas.DataFrame
        The wine attributes that are not explicitly dropped below, plus one
        indicator column per retained food. Rows keep their original index
        labels, so the index can contain gaps.
    """
    wine_df = pd.read_csv(file_path).drop(columns=['Grapes'])

    # literal_eval only accepts Python literals, so a crafted CSV cell cannot
    # execute arbitrary code the way the previous eval() call could.
    harmonize_lists = [ast.literal_eval(element) for element in wine_df.Harmonize]

    # Binary encode Harmonize (kinds of food)
    mlb_harm = MultiLabelBinarizer(sparse_output=False)
    food_flags = pd.DataFrame(
        mlb_harm.fit_transform(harmonize_lists),
        index=wine_df.index,
        columns=mlb_harm.classes_,
    )
    wine_df_bin = wine_df.join(food_flags).drop(columns=['Harmonize'])

    # Select the food columns by name rather than by a fixed position
    # (previously iloc[:, 15:]), so a change in the CSV layout cannot make
    # the filter silently operate on the wrong columns.
    food_cols = list(mlb_harm.classes_)

    # Drop foods mentioned min_mentions times or fewer
    mention_counts = wine_df_bin[food_cols].sum()
    harm_to_drop = mention_counts[mention_counts <= min_mentions].index.to_list()
    wine_df_bin = wine_df_bin.drop(columns=harm_to_drop)

    # Drop wines which are no longer represented by any remaining food
    kept_food_cols = [col for col in food_cols if col not in harm_to_drop]
    wine_df_bin = wine_df_bin[wine_df_bin[kept_food_cols].eq(1).any(axis=1)]

    # Drop additional columns not used for the model
    wine_df_bin_cleaned = wine_df_bin.drop(columns=['WineName', 'WineID','Code','Country','RegionID','RegionName','WineryID','Website','Vintages', 'WineryName'])

    return wine_df_bin_cleaned

In [9]:
# Build the path to the cleaned wine export and load it as the encoded frame.
save_file_name = "Cleaned_Full_100K_wines"
file_path = '~/code/ArjanAngenent/VinoDine/raw_data/' + save_file_name + '.csv'
wine_df = create_binary_df(file_path)

In [10]:
# Display the encoded frame: wine attributes followed by the 0/1 food columns.
wine_df

Unnamed: 0,Type,Elaborate,ABV,Body,Acidity,Beef,CuredMeat,GameMeat,Lamb,Pasta,Pork,Poultry,RichFish,Shellfish,Veal,Vegetarian
0,Sparkling,Varietal/100%,7.5,Medium-bodied,High,0,0,0,0,0,1,0,1,1,0,0
1,Red,Varietal/100%,12.0,Medium-bodied,Medium,1,0,0,0,1,0,0,0,0,0,0
2,Red,Varietal/100%,12.0,Full-bodied,High,1,0,0,1,0,0,1,0,0,0,0
4,Red,Assemblage/Bordeaux Red Blend,11.0,Full-bodied,Medium,1,0,1,1,0,0,1,0,0,0,0
5,Red,Varietal/100%,12.5,Full-bodied,High,1,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100640,Red,Varietal/100%,12.0,Medium-bodied,High,1,0,1,1,0,0,0,0,0,0,0
100641,White,Varietal/100%,13.0,Medium-bodied,Medium,0,0,0,0,0,0,0,1,1,0,0
100642,Dessert,Varietal/100%,13.5,Medium-bodied,High,1,0,1,0,0,0,0,0,0,0,0
100644,White,Varietal/100%,12.5,Medium-bodied,High,0,1,0,0,0,1,1,0,1,0,0


In [106]:
def preprocessing():
    """Build the unfitted column transformer used in front of the classifier.

    Object-dtype columns are one-hot encoded (categories unseen during
    fitting are ignored instead of raising) and then min-max scaled;
    numeric columns are min-max scaled directly.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        A new, unfitted transformer on every call.
    """
    # Branch for the categorical (object-dtype) columns
    categorical_branch = make_pipeline(
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        MinMaxScaler(),
    )
    # Branch for the numeric columns
    numeric_branch = MinMaxScaler()

    # Columns are routed by dtype at fit time, not by name
    return make_column_transformer(
        (categorical_branch, make_column_selector(dtype_include='object')),
        (numeric_branch, make_column_selector(dtype_include='number')),
    )

In [111]:
def train_model(X_train, y_train):
    """Fit the preprocessing + one-vs-rest SGD pipeline and return it.

    Parameters
    ----------
    X_train
        Training features handed to the preprocessing transformer.
    y_train
        Training targets handed to the one-vs-rest classifier.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (preprocessing followed by the classifier).
    """
    # One SGD classifier per target column; seed fixed for reproducibility
    one_vs_rest = OneVsRestClassifier(
        SGDClassifier(max_iter=500, random_state=42)
    )
    fitted_pipeline = make_pipeline(preprocessing(), one_vs_rest)
    fitted_pipeline.fit(X_train, y_train)
    return fitted_pipeline

In [112]:
def create_X_train_y_train(df, test_size=0.3):
    """Split the encoded wine frame into train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five feature columns listed below; every other
        column is treated as a target.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_cols = ['Type','Elaborate','Body','Acidity', 'ABV']
    features = df[feature_cols]
    targets = df.drop(columns=feature_cols)
    # Fixed seed so the split is reproducible between runs
    return tuple(
        train_test_split(features, targets, random_state=42, test_size=test_size)
    )

In [113]:
# Split the encoded frame into train/test features and food targets (default 30% test).
X_train, X_test, y_train, y_test = create_X_train_y_train(wine_df)

In [114]:
# Fit the preprocessing + one-vs-rest pipeline on the training split.
model = train_model(X_train, y_train)

In [115]:
# Display the fitted pipeline.
model

In [145]:
# Predict the 0/1 food indicators for every wine in the held-out split.
model.predict(X_test)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 1, 0]])

In [146]:
# Peek at a single test row to see the feature layout the model expects.
X_test.head(1)

Unnamed: 0,Type,Elaborate,Body,Acidity,ABV
97845,Red,Assemblage/Blend,Full-bodied,High,13.0


In [147]:
# Food label names, in the column order of the target frame.
# NOTE(review): show_foods rebuilds this list from y_train itself, so no
# function visible in this notebook reads this global.
foods = y_train.columns.to_list()

In [119]:
def predict(model,
            type_of_wine: str,
            elaborate: str,
            body: str,
            acidity: str,
            ABV: float):
    """Run ``model.predict`` on one wine described by its attributes.

    Parameters
    ----------
    model
        Any object exposing ``predict(DataFrame)``.
    type_of_wine, elaborate, body, acidity : str
        Values for the ``Type``, ``Elaborate``, ``Body`` and ``Acidity``
        columns.
    ABV : float
        Value for the ``ABV`` column.

    Returns
    -------
    Whatever ``model.predict`` returns for the one-row frame.
    """
    # One-row frame; each key becomes a column holding a single value
    single_wine = {
        'Type': [type_of_wine],
        'Elaborate': [elaborate],
        'Body': [body],
        'Acidity': [acidity],
        'ABV': [ABV],
    }
    features = pd.DataFrame.from_dict(single_wine, orient='columns')
    return model.predict(features)

In [160]:
def show_foods(y_train, y_pred):
    """Translate the first predicted indicator row into food names.

    Parameters
    ----------
    y_train : pandas.DataFrame
        Target frame whose column names are the food labels, in the same
        order as the positions of a prediction row.
    y_pred : array-like
        Predictions; only the first row (``y_pred[0]``) is read.

    Returns
    -------
    list
        Labels whose position in ``y_pred[0]`` equals 1.
    """
    labels = y_train.columns.to_list()
    is_predicted = y_pred[0] == 1
    # Positions flagged with 1 map straight onto the label list
    return [labels[position] for position in np.where(is_predicted)[0].tolist()]


In [161]:
# Smoke-test a single prediction.
# NOTE(review): ABV=1 is far below the ABV values visible in the sample rows
# of wine_df — confirm this out-of-range input is intentional.
y_pred = predict(model, 'White', 'Varietal/100%', 'Full-bodied', 'Medium', 1)

In [174]:
# Show the predicted food names for one hand-written wine.
# NOTE(review): ABV=100 is far above the sample rows, and 'Rose' may not be a
# category seen in training; the encoder uses handle_unknown='ignore', so an
# unseen category would not raise — verify the inputs.
show_foods(y_train, predict(model, 'Rose', 'Varietal/100%', 'Full-bodied', 'Low', 100))

['Beef', 'Lamb', 'Poultry']