In [21]:
import pandas as pd
import numpy as np
import re
from csv import reader
from sklearn.linear_model import LogisticRegression

In [22]:
cookbook_train = []

with open('train.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):
        
        kitchen = row[-1]
        ingredient_strings = row[1:-1]
        ingredients = [ int(s) for s in ingredient_strings ]
        
        cookbook_train.append({
            'ingredients': ingredients,
            'kitchen_name': kitchen,
            'recipe_id': i
        })      
        
cookbook_train[15]

{'ingredients': [2866, 4243, 4362, 5377, 5408, 6187, 6352, 6568],
 'kitchen_name': 'italian',
 'recipe_id': 15}

In [23]:
cookbook_valid_question = []

with open('validation_classification_question.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):
        ingredients = [ int(s) for s in row ]
        cookbook_valid_question.append({
            'ingredients': ingredients,
            'recipe_id': i,
            'kitchen_name': "UNKNOWNKITCHEN"
        })     

cookbook_valid_question[15]

{'ingredients': [909, 1046, 1308, 1679, 2809, 3152, 3653, 4162, 4799, 4917],
 'recipe_id': 15,
 'kitchen_name': 'UNKNOWNKITCHEN'}

In [24]:
cookbook_valid_answer = []
        
with open('validation_classification_answer.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):        
        kitchen = row[0]        
        cookbook_valid_answer.append({
            'ingredients': [],
            'kitchen_name': kitchen,
            'recipe_id': i
        })
        
cookbook_valid_answer[15]

{'ingredients': [], 'kitchen_name': 'southern_us', 'recipe_id': 15}

In [25]:
# Make ingredient list

node_ingredient = pd.read_fwf('node_ingredient.csv', header=None)
ingredient_list = {}
for index , row in node_ingredient.iterrows():
    ingredient_list[index] = row[0]

In [26]:
# Make kitchen list

kitchens = []
cookbook_full = cookbook_train + cookbook_valid_question + cookbook_valid_answer
for recipe in cookbook_full:
    print(recipe)
    if recipe['kitchen_name'] not in kitchens:
        kitchens.append(recipe['kitchen_name'])

kitchen_list = { index:kitchen_name for index, kitchen_name in enumerate(kitchens)  }

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [27]:
def add_kitchen_id_to_cookbook(kitchen_list, cookbook):   

    for recipe in cookbook:
        kitchen_id = kitchens.index(recipe['kitchen_name'])
        recipe['kitchen_id'] = kitchen_id

        
add_kitchen_id_to_cookbook(kitchen_list, cookbook_train)
add_kitchen_id_to_cookbook(kitchen_list, cookbook_valid_question)
add_kitchen_id_to_cookbook(kitchen_list, cookbook_valid_answer)
cookbook_train[15]

{'ingredients': [2866, 4243, 4362, 5377, 5408, 6187, 6352, 6568],
 'kitchen_name': 'italian',
 'recipe_id': 15,
 'kitchen_id': 5}

In [167]:
# Import the embedding
df = pd.read_csv('Embedding/SVD128.csv', delimiter=",")

In [168]:
# Create an embedding dictionary from the file
embedding = {}
for index, row in df.iterrows():
    ingredient_id = row[1]
    embedding_string = row[2]
    embedding_formatted = re.findall(r'[\d|\.|e|\+|\-]+', embedding_string)
    embedding_array = np.asarray(embedding_formatted, dtype=float)
    embedding[ingredient_id] = embedding_array
    
embedding[2813]

array([ 3.45249597e+01,  2.02921746e+01, -1.19782364e+00, -1.27103878e+00,
        2.47691231e-01,  5.51329427e+00,  8.30972158e+00, -3.15623142e-01,
        7.47054519e-02, -8.16607322e-01, -3.26105795e+00,  6.47322445e-01,
        1.01522626e+00, -1.34483497e+00,  4.65942814e+00, -2.55788922e+00,
        4.09197282e+00,  1.61748307e+00, -1.02612428e+00,  1.10609442e-01,
       -1.35513847e+00, -2.03709566e-01,  1.03572702e+00,  8.66335326e-01,
        1.13053678e+00, -3.58402071e+00,  1.90364661e+00,  4.35500270e+00,
        1.91934968e+00, -3.11331196e+00, -1.84232069e+00,  1.35631867e+00,
        3.91788621e+00, -2.31969255e+00,  2.08547498e-01, -1.99538172e+00,
        4.70833006e+00,  2.60743389e+00, -1.87653768e+00, -3.41893295e+00,
       -4.09679989e+00,  1.99068504e+00, -1.06446620e+00,  3.48641751e+00,
       -4.07176382e-01,  1.10113214e+00, -2.36934688e+00, -1.36836605e+00,
       -8.77848874e-01,  1.04493155e+00, -2.36000701e+00,  3.58100901e+00,
       -9.92826879e-01,  

In [169]:
embedding[5522]

KeyError: 5522

In [170]:
def make_data_from_cookbook(cookbook, embedding):
    # Make the data. X is a matrix with recipe rows and embedding dimension columns, 
    # where every row is the average of the embeddings in the recipes

    n_recipies = len(cookbook)
    embedding_dim = len(embedding[0])

    X = np.zeros((n_recipies, embedding_dim))
    y = np.zeros(n_recipies)
    for idx, recipe in enumerate(cookbook):
        embedding_avg = np.zeros(embedding_dim) 
        ingredient_count = len(recipe['ingredients'])
        y[idx] = recipe['kitchen_id']
        for ingredient_id in recipe['ingredients']:
            if ingredient_id in embedding:  # Is wrong, all ingredients should be embedded, but for example 5522 is not.
                embedding_avg += embedding[ingredient_id] / ingredient_count
            
        X[idx, :] = embedding_avg
            
    return X,y
    

In [171]:
X_train, y_train = make_data_from_cookbook(cookbook_train, embedding)
X_valid, _       = make_data_from_cookbook(cookbook_valid_question, embedding)
_      , y_valid = make_data_from_cookbook(cookbook_valid_answer, embedding)

In [175]:
# Logistic regression (takes roughly 10 sec on my machine)
clf = LogisticRegression(penalty='l2', max_iter=1000).fit(X_train, y_train)

In [176]:
# Training score
clf.score(X_train, y_train)

0.7389901048965898

In [177]:
# Validation score
clf.score(X_valid, y_valid)

0.7245158002038736