In [1]:
from tensorflow.keras import Model, Sequential, layers, regularizers, optimizers, models
from tensorflow.keras.callbacks import EarlyStopping

#from colorama import Fore, Style
from typing import Tuple
import pickle
import time
import os
import glob

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import RandomOverSampler

In [54]:
pd.set_option('display.max_columns', None)

# Functions

## Preprocessing Functions

### create binary df

In [99]:
def create_binary_df_2(file_path):
    mlb_harm = MultiLabelBinarizer(sparse_output=False)
    mlb_grape = MultiLabelBinarizer(sparse_output=False)

    wine_df = pd.read_csv(file_path)
    
    #Drop addional columns not used for model
    wine_df = wine_df.drop(columns=['WineName', 'WineID','Code','Country','RegionID','RegionName','WineryID','Website','Vintages', 'WineryName'])
    
    # Binary encode grapes
    wine_df_bin = wine_df.join(pd.DataFrame(
        mlb_grape.fit_transform(eval(element) for element in wine_df.Grapes),
        index=wine_df.index,
        columns=mlb_grape.classes_
        ))
    wine_df_bin.drop(columns=['Grapes'], inplace=True)

    # Create a list of the kind of grapes that are mentioned less then 2.000 times
    grapes_list = wine_df_bin.iloc[:,16:].sum() # sum the number of times a grape is mentioned via column
    grapes_to_drop = grapes_list[grapes_list<1_000].index.to_list() # create a list with kind of grapes mentioned less then 2.000 times
    
    wine_df_bin.drop(columns=grapes_to_drop, inplace=True) # drop columns with grapes not mentioned more then 2.000 times
    final_column_grapes = wine_df_bin.shape[1]
    
    # Binary encode Harmonize(kinds of food)
    wine_df_bin = wine_df_bin.join(pd.DataFrame(
        mlb_harm.fit_transform(eval(element) for element in wine_df.Harmonize),
        index=wine_df.index,
        columns=mlb_harm.classes_
        ))
    wine_df_bin.drop(columns=['Harmonize'], inplace=True)
    
    # Create a list of the kind of grapes that are mentioned less then 2.000 times
    harm_list = wine_df_bin.iloc[:,(final_column_grapes+1):].sum() # sum the number of times a food is mentioned via column
    harm_to_drop = harm_list[harm_list<=15_000].index.to_list() # create a list withe kind of food mentioned less then 50 times
    wine_df_bin.drop(columns=harm_to_drop, inplace=True) # drop columns with food not mentioned more then 50 times
    wine_df_bin = wine_df_bin[wine_df_bin.iloc[:,(final_column_grapes+1):].eq(1).any(axis=1)] # drop wines which are not represented by a food anymore

    return wine_df_bin, final_column_grapes

In [100]:
file_name = 'XWines_Full_100K_wines.csv'
file_path = f'~/code/ArjanAngenent/VinoDine/raw_data/{file_name}'
wine_df, final_column_grapes = create_binary_df_2(file_path)
wine_df

Unnamed: 0,Type,Elaborate,ABV,Body,Acidity,Abbuoto,Abouriou,Abrostine,Acolon,Agiorgitiko,Aglianico,Aidani,Airen,Albalonga,Albana,Barbera,Cabernet Franc,Cabernet Sauvignon,Carignan/Cariñena,Carmenère,Chardonnay,Chenin Blanc,Corvina,Garnacha,Glera/Prosecco,Grenache,Malbec,Merlot,Mourvedre,Nebbiolo,Petit Verdot,Pinot Noir,Riesling,Rondinella,Sangiovese,Sauvignon Blanc,Syrah/Shiraz,Sémillon,Tempranillo,Tinta Roriz,Touriga Franca,Touriga Nacional,Viognier,Zinfandel,Aperitif,Appetizer,Beef,Cured Meat,Game Meat,Lamb,Pasta,Pork,Poultry,Rich Fish,Shellfish,Veal,Vegetarian
0,Sparkling,Varietal/100%,7.5,Medium-bodied,High,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0
1,Red,Varietal/100%,12.0,Medium-bodied,Medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
2,Red,Varietal/100%,12.0,Full-bodied,High,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0
4,Red,Assemblage/Bordeaux Red Blend,11.0,Full-bodied,Medium,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0
5,Red,Varietal/100%,12.5,Full-bodied,High,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100640,Red,Varietal/100%,12.0,Medium-bodied,High,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0
100641,White,Varietal/100%,13.0,Medium-bodied,Medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
100642,Dessert,Varietal/100%,13.5,Medium-bodied,High,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
100644,White,Varietal/100%,12.5,Medium-bodied,High,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0


In [44]:
def create_binary_df(file_path):
    mlb_harm = MultiLabelBinarizer(sparse_output=False)
    mlb_grape = MultiLabelBinarizer(sparse_output=False)

    wine_df = pd.read_csv(file_path)

    
    wine_df.drop(columns=['Grapes'], inplace=True)

    # Binary encode Harmonize(kinds of food)
    wine_df_bin = wine_df.join(pd.DataFrame(
        mlb_harm.fit_transform(eval(element) for element in wine_df.Harmonize),
        index=wine_df.index,
        columns=mlb_harm.classes_
        ))
    wine_df_bin.drop(columns=['Harmonize'], inplace=True)
    
    # Create a list of the kind of grapes that are mentioned less then 2.000 times
    harm_list = wine_df_bin.iloc[:,15:].sum() # sum the number of times a food is mentioned via column
    harm_to_drop = harm_list[harm_list<=15_000].index.to_list() # create a list withe kind of food mentioned less then 50 times
    wine_df_bin.drop(columns=harm_to_drop, inplace=True) # drop columns with food not mentioned more then 50 times
    wine_df_bin = wine_df_bin[wine_df_bin.iloc[:,15:].eq(1).any(axis=1)] # drop wines which are not represented by a food anymore

    #Drop addional columns not used for model
    wine_df_bin_cleaned = wine_df_bin.drop(columns=['WineName', 'WineID','Code','Country','RegionID','RegionName','WineryID','Website','Vintages', 'WineryName'])

    return wine_df_bin_cleaned

### preprocessing for X

In [122]:
def preprocessing(X_train):
    # Define which columns need to be encoded
    cat_cols = make_column_selector(dtype_include='object')
    num_cols = make_column_selector(dtype_include='number')
    cat_pre = make_pipeline(OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
                            MinMaxScaler())
    cat_num = MinMaxScaler()

    # Create preprocessor pipeline
    preprocessing = make_column_transformer((cat_pre, cat_cols),(cat_num, num_cols), handle_unknown='ignore')
    preprocessing.fit(X_train)
    return preprocessing

In [69]:
def preprocessing_2(X_train):
    # Define which columns need to be encoded
    cat_cols = make_column_selector(dtype_include='object')
    num_cols = make_column_selector(dtype_include='number')
    cat_pre = make_pipeline(OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
                            MinMaxScaler())
    cat_num = MinMaxScaler()

    # Create preprocessor pipeline
    preprocessing = make_column_transformer((cat_pre, cat_cols),(cat_num, num_cols))
    preprocessing.fit(X_train)
    return preprocessing

### creation of X_train, X_test, y_train, y_test

In [5]:
def create_X_train_y_train(df, test_size=0.3):
    X = df[['Type','Body','Acidity', 'ABV']]
    y = df.drop(columns=['Type','Elaborate','Body','Acidity', 'ABV'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=test_size)
    return X_train, X_test, y_train, y_test

In [106]:
def create_X_train_y_train_2(df, grape_column, test_size=0.3):
    X = df.iloc[:,:grape_column-1]
    y = df.iloc[:,grape_column:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=test_size)
    return X_train, X_test, y_train, y_test

## Model functions

### Initializing model

In [114]:
def initialize_model(input_shape: tuple, output_shape: int) -> Model:
    '''
    Initialize neural Nework
    '''
    
    reg = regularizers.l1_l2(l2=0.005)
    
    model = Sequential()
    model.add(layers.Input(shape=input_shape))
    model.add(layers.Dense(100, kernel_initializer='he_uniform', activation='relu'))
    model.add(layers.Dropout(rate=0.1))
    model.add(layers.Dense(50, kernel_initializer='he_uniform', activation='relu'))
    model.add(layers.Dropout(rate=0.1))
    model.add(layers.Dense(25,kernel_initializer='he_uniform', activation='relu'))
    model.add(layers.Dropout(rate=0.1))
    model.add(layers.Dense(output_shape, activation='sigmoid'))
    
    print("✅ Model initialized")
    
    return model

### Compiling model

In [7]:
def compile_model(model: Model, learning_rate=0.0005):
    '''
    Compile Neural Network
    '''
    
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    print("✅ Model compiled")
    
    return model

### Training model

In [8]:
def train_model(
        model: Model, 
        X: np.array,
        y: np.array,
        batch_size = 32,
        patience = 5,
        validation_split=0.3
    ) ->[Model, dict]:
    '''
    Fit the model and return a tuple (fitted model, history)
    '''
    
    print(f'Training model ...')
    
    es = EarlyStopping(
        patience=patience,
        restore_best_weights=True,
        verbose = 1)
    
    history = model.fit(
        X,
        y,
        validation_split=validation_split,
        epochs = 1_000,
        batch_size=batch_size,
        callbacks=[es],
        verbose = 1)
    
    print(f"✅ Model trained on {len(X)} rows with max val Accuracy: {round(np.max(history.history['val_accuracy']), 2)}")
    
    return model, history

### Evaluatin model

In [9]:
def evaluate_model(
        model: Model,
        X: np.array,
        y: np.array,
        batch_size =64
    ) -> Tuple[Model, dict]:
    '''
    Evaluate the trained model performance on dataset
    '''
    
    if model is None:
        print(f"\n❌ No model to evaluate")
        return None
    
    metrics = model.evaluate(
        x=X,
        y=y,
        batch_size=batch_size,
        verbose=1,
        return_dict=True,)
    
    accuracy = metrics['accuracy']
    
    print(f"✅ Model evaluated, Accuracy: {round(accuracy, 2)}")
    
    return metrics

### Save model

In [10]:
def save_model(model):
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    
    model_path = os.path.join(f'~/code/ArjanAngenent/VinoDine/tensorflow_model', f'{timestamp}.keras')
    model.save(model_path)
    
    print("✅ Model saved locally")
    
    return None

### Load model

In [11]:
def load_model():
    
    local_model_directory = f'~/code/ArjanAngenent/VinoDine/tensorflow_model'
    local_model_paths = glob.glob(f'{local_model_directory}/*')
    
    most_recent_model_path_on_disk = sorted(local_model_paths)[-1]
    
    latest_model = models.load_model(most_recent_model_path_on_disk)
    
    print("✅ Model loaded from local disk")
    
    return latest_model
    

## Prediciton

### Creating X_pred

In [12]:
def create_X_pred(
            type_of_wine: str,
            body: str, 
            acidity: str, 
            ABV: float
            # elaborate: str,
            ):
    X_pred = pd.DataFrame.from_dict({'Type': [type_of_wine],
                                    'Body': [body],
                                    'Acidity': [acidity],
                                    'ABV': [ABV]
                                    #'Elaborate': [elaborate],
                                    },
                                   orient='columns')
    return X_pred

### Predicting food

In [13]:
def pred(X_pred, y_train, preprocessing=preprocessing):
    
    model = load_model()
    
    X_pred_pre = preprocessing.transform(X_pred)
    
    y_pred = model.predict(X_pred_pre)
    
    
    
    return y_pred

In [15]:
def show_foods(y_train, y_pred):
        foods = y_train.columns.to_list()
        foods_index = np.where(y_pred[0]==1)[0].tolist()
        foods_to_choose = []
        for i in foods_index:
            foods_to_choose.append(foods[i])
        return foods_to_choose
    
    foods = show_foods(y_train, y_pred)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 9)

# Application

## Load Wine DataFrame

In [111]:
file_name = 'XWines_Full_100K_wines.csv'
file_path = f'~/code/ArjanAngenent/VinoDine/raw_data/{file_name}'
wine_df, grape_column = create_binary_df_2(file_path)

## Create X_train, X_test, y_train, y_test

In [112]:
X_train, X_test, y_train, y_test = create_X_train_y_train_2(wine_df, grape_column)

preprocessed = preprocessing(X_train)

X_train_processed = preprocessed.transform(X_train)

X_test_processed = preprocessed.transform(X_test)

## Creating model & training model

In [117]:
# Initializing model
model = initialize_model(input_shape=X_train_processed.shape[1:], output_shape=y_train.shape[1])

# Compiling model
model = compile_model(model)

# Training model
model, history = train_model(model, X_train_processed, y_train)

# Evaluating model
metrics = evaluate_model(model, X_test_processed, y_test)

✅ Model initialized
✅ Model compiled
Training model ...
Epoch 1/1000
[1m1518/1518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.2032 - loss: 10640.3867 - val_accuracy: 0.5832 - val_loss: 79338.5625
Epoch 2/1000
[1m1518/1518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.1915 - loss: 305327.3125 - val_accuracy: 0.0147 - val_loss: 563993.5625
Epoch 3/1000
[1m1518/1518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - accuracy: 0.1808 - loss: 1388803.3750 - val_accuracy: 0.0147 - val_loss: 1653627.8750
Epoch 4/1000
[1m1518/1518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - accuracy: 0.1795 - loss: 3484519.7500 - val_accuracy: 0.0147 - val_loss: 3772356.2500
Epoch 5/1000
[1m1518/1518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - accuracy: 0.1808 - loss: 6863256.5000 - val_accuracy: 0.0147 - val_loss: 6447855.0000
Epoch 6/1000
[1m1518/1518[0m [32m━━━━━━━━━━━━━━

In [118]:
# Saving model
save_model(model)

✅ Model saved locally


In [119]:
# Loading most recent model
model = load_model()

✅ Model loaded from local disk


## Predicting

In [123]:
X_pred = create_X_pred('Rose', 'Medium-bodied', 'Medium', 100)


In [124]:
pred(X_pred, y_train, preprocessing=preprocessed)

✅ Model loaded from local disk


ValueError: columns are missing: {'Abouriou', 'Mourvedre', 'Acolon', 'Airen', 'Tinta Roriz', 'Aidani', 'Chardonnay', 'Corvina', 'Syrah/Shiraz', 'Carignan/Cariñena', 'Touriga Nacional', 'Abrostine', 'Cabernet Franc', 'Abbuoto', 'Zinfandel', 'Cabernet Sauvignon', 'Petit Verdot', 'Rondinella', 'Sémillon', 'Garnacha', 'Grenache', 'Sauvignon Blanc', 'Merlot', 'Riesling', 'Touriga Franca', 'Carmenère', 'Barbera', 'Albalonga', 'Nebbiolo', 'Pinot Noir', 'Albana', 'Tempranillo', 'Aglianico', 'Chenin Blanc', 'Glera/Prosecco', 'Elaborate', 'Agiorgitiko', 'Malbec', 'Viognier', 'Sangiovese'}