In [1]:
import numpy as np


def make_binary_data(cookbook, ingredient_list):
    # X is a binary matrix with recipe rows and ingredients columns, 
    # where (recipe, ingredients) is 1 if the ingredient is present in the recipe

    n_recipies = len(cookbook)
    n_ingredients = len(ingredient_list)

    X = np.zeros((n_recipies, n_ingredients))
    y = np.zeros(n_recipies)
    for idx, recipe in enumerate(cookbook):
        y[idx] = recipe['kitchen_id']
        for ingredient_id in recipe['ingredients']:
            X[idx][ingredient_id] = 1
            
    return X,y


def make_recipe_embedding_data(cookbook, embedding, avg=False):
    # Make the data. X is a matrix with recipe rows and embedding dimension columns, 
    # where every row is the average of the embeddings in the recipes

    n_recipies = len(cookbook)
    embedding_dim = len(embedding[0])

    X = np.zeros((n_recipies, embedding_dim))
    y = np.zeros(n_recipies)
    for idx, recipe in enumerate(cookbook):
        embedding_avg = np.zeros(embedding_dim) 
        ingredient_count = len(recipe['ingredients'])
        y[idx] = recipe['kitchen_id']
        for ingredient_id in recipe['ingredients']:
            if avg:
                embedding_avg += embedding[ingredient_id] / ingredient_count
            else:
                embedding_avg += embedding[ingredient_id]
            
        X[idx, :] = embedding_avg
            
    return X,y


def make_ingredient_embedding_data(embedding):
    # A matrix with as rows the ingredients and as columns the embedding
    matrix = np.empty( (len(embedding), len(embedding[0])) )
    for ingredient_idx, embedding_vector in embedding.items():
        matrix[ingredient_idx, :] = embedding_vector
    return matrix

In [2]:
from csv import reader
import pandas as pd
import torch

__kitchen_list = []


def get_cookbook_train():

    cookbook_train = []

    with open('train.csv', 'r') as file:

        csv_reader = reader(file, delimiter=",")

        for i, row in enumerate(csv_reader):
            
            kitchen = row[-1]
            ingredient_strings = row[:-1]
            ingredients = [ int(s) for s in ingredient_strings ]
            
            cookbook_train.append({
                'recipe_id': i,
                'ingredients': ingredients,
                'kitchen_name': kitchen
            })      

    fill_kitchen_list(cookbook_train)
    add_kitchen_id_to_cookbook(cookbook_train)
    return cookbook_train


def get_cookbook_valid_question():

    cookbook_valid_question = []

    with open('validation_classification_question.csv', 'r') as file:

        csv_reader = reader(file, delimiter=",")

        for i, row in enumerate(csv_reader):
            ingredients = [ int(s) for s in row ]
            cookbook_valid_question.append({
                'recipe_id': i,
                'ingredients': ingredients,
                'kitchen_name': "UNKNOWNKITCHEN",
                'kitchen_id': "-999"
            })     

    return cookbook_valid_question


       
def get_cookbook_valid_answer():

    cookbook_valid_answer = []

    with open('validation_classification_answer.csv', 'r') as file:

        csv_reader = reader(file, delimiter=",")

        for i, row in enumerate(csv_reader):        
            kitchen = row[0]        
            cookbook_valid_answer.append({
                'recipe_id': i,
                'ingredients': [],
                'kitchen_name': kitchen
            })

    fill_kitchen_list(cookbook_valid_answer)
    add_kitchen_id_to_cookbook(cookbook_valid_answer)
    return cookbook_valid_answer


def get_ingredient_list():

    node_ingredient = pd.read_fwf('node_ingredient.csv', header=None)
    ingredient_list = [None] * node_ingredient.shape[0]
    for index , row in node_ingredient.iterrows():
        ingredient_list[index] = row[0]

    return ingredient_list


def fill_kitchen_list(cookbook):

    for recipe in cookbook:
        if recipe['kitchen_name'] not in __kitchen_list:
            __kitchen_list.append(recipe['kitchen_name'])



def add_kitchen_id_to_cookbook(cookbook):   

    for recipe in cookbook:
        kitchen_id = __kitchen_list.index(recipe['kitchen_name'])
        recipe['kitchen_id'] = kitchen_id

In [3]:
cookbook_train = get_cookbook_train()
cookbook_valid_question = get_cookbook_valid_question()
cookbook_valid_answer = get_cookbook_valid_answer()
ingredient_list = get_ingredient_list()


Preprocessing

In [4]:
binTest = make_binary_data(cookbook_train,ingredient_list)
Xtrain = binTest[0]
Xtrain = Xtrain.tolist()

Ytrain = []
for i in cookbook_train:
  Ytrain.append(i["kitchen_name"])

binVal = make_binary_data(cookbook_valid_question,ingredient_list)
Xval = binVal[0]
Xval = Xval.tolist()

Yval = []
for i in cookbook_valid_answer:
  Yval.append(i["kitchen_name"])

Random forest classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier(min_samples_leaf = 2)
clf.fit(Xtrain, Ytrain)

RandomForestClassifier(min_samples_leaf=2)

In [6]:
acc = 0
for i in range(1000):
  pred = clf.predict([Xval[i]])
  if pred[0] == Yval[i]:
    acc += 1
print('accuracy RF:',acc/1000)

accuracy RF: 0.623


In [7]:
from sklearn.neighbors import KNeighborsClassifier
clf2 = KNeighborsClassifier(n_neighbors=3)
clf2.fit(Xtrain, Ytrain)

KNeighborsClassifier(n_neighbors=3)

In [8]:
acc = 0
for i in range(1000):
  pred = clf2.predict([Xval[i]])
  if pred[0] == Yval[i]:
    acc += 1
print('accuracy 3nn:',acc/1000)

accuracy 3nn: 0.44


Using PCA to reduce dimensionality to train differents methods.

Create new input space with 256 and 64 dimension

In [9]:
from sklearn.decomposition import PCA

pca1 = PCA(n_components=256)
pca1.fit(Xtrain)

pca2 = PCA(n_components=64)
pca2.fit(Xtrain)

PCA(n_components=64)

In [10]:
XtrainPCA1 = Xtrain.copy()
XvalPCA1 = Xval.copy()
XtrainPCA1 = pca1.transform(XtrainPCA1)
XvalPCA1 = pca1.transform(XvalPCA1)

XtrainPCA2 = Xtrain.copy()
XvalPCA2 = Xval.copy()
XtrainPCA2 = pca2.transform(XtrainPCA2)
XvalPCA2 = pca2.transform(XvalPCA2)

Support vector classifie

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf3 = SVC()
clf3.fit(XtrainPCA1, Ytrain)

SVC()

In [12]:
acc = 0
for i in range(1000):
  pred = clf3.predict([XvalPCA1[i]])
  if pred[0] == Yval[i]:
    acc += 1
print('accuracy SVM PCA:', acc/1000)

accuracy SVM PCA: 0.673


XGboost

In [17]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
clf4 = GradientBoostingClassifier(learning_rate=0.1,max_depth=7, random_state=0,verbose=1,min_samples_leaf = 20)
clf4.fit(XtrainPCA2, Ytrain).score(XvalPCA2, Yval)

      Iter       Train Loss   Remaining Time 
         1           2.1641           45.04m
         2           1.9616           44.49m
         3           1.8102           44.04m
         4           1.6895           43.62m
         5           1.5873           43.14m
         6           1.4971           42.63m
         7           1.4190           42.16m
         8           1.3516           41.71m
         9           1.2886           41.23m
        10           1.2328           40.77m
        20           0.8456           36.10m
        30           0.6241           31.55m
        40           0.4735           27.01m
        50           0.3675           22.51m
        60           0.2898           18.02m
        70           0.2305           13.52m
        80           0.1874            9.02m
        90           0.1528            4.51m
       100           0.1256            0.00s


0.574796126401631

In [None]:
acc = 0
for i in range(1000):
  pred = clf4.predict([XvalPCA2[i]])
  if pred[0] == Yval[i]:
    acc += 1
print('accuracy XGboost PCA:', acc/1000)

KNN with reduce dimension

In [13]:
from sklearn.neighbors import KNeighborsClassifier
clf5 = KNeighborsClassifier(n_neighbors=3)
clf5.fit(XtrainPCA1, Ytrain)

KNeighborsClassifier(n_neighbors=3)

In [14]:
acc = 0
for i in range(1000):
  pred = clf5.predict([XvalPCA1[i]])
  if pred[0] == Yval[i]:
    acc += 1
print('accuracy 3nn PCA:', acc/1000)

accuracy 3nn PCA: 0.457


Gaussian Naive Bayes classifier

In [15]:
from sklearn.naive_bayes import GaussianNB

clf6 = GaussianNB()
clf6.fit(XtrainPCA1, Ytrain)

GaussianNB()

In [16]:
acc = 0
for i in range(1000):
  pred = clf6.predict([XvalPCA1[i]])
  if pred[0] == Yval[i]:
    acc += 1
print('Gaussian PCA:', acc/1000)

Gaussian PCA: 0.433


Stacking -> RandomForest + SVC + knn + GB

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

estimators = [
    ('rf', RandomForestClassifier()),
    ('svc', SVC()),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
    ('bayes', GaussianNB())         
]
clfS = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(dual=False))

from sklearn.model_selection import train_test_split

clfS.fit(XtrainPCA2, Ytrain).score(XvalPCA2, Yval)

In [None]:
acc = 0
for i in range(1000):
  pred = clfS.predict([XvalPCA1[i]])
  if pred[0] == Yval[i]:
    acc += 1
print('Stacking PCA:', acc/1000)