In [1]:
import pandas as pd
import numpy as np
import re
from csv import reader
from sklearn.linear_model import LogisticRegression

In [2]:
cookbook_train = []

with open('train.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):
        
        kitchen = row[-1]
        ingredient_strings = row[1:-1]
        ingredients = [ int(s) for s in ingredient_strings ]
        
        cookbook_train.append({
            'ingredients': ingredients,
            'kitchen_name': kitchen,
            'recipe_id': i
        })      
        
cookbook_train[15]

{'ingredients': [2866, 4243, 4362, 5377, 5408, 6187, 6352, 6568],
 'kitchen_name': 'italian',
 'recipe_id': 15}

In [3]:
cookbook_valid_question = []

with open('validation_classification_question.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):
        ingredients = [ int(s) for s in row ]
        cookbook_valid_question.append({
            'ingredients': ingredients,
            'recipe_id': i,
            'kitchen_name': "UNKNOWNKITCHEN"
        })     

cookbook_valid_question[15]

{'ingredients': [909, 1046, 1308, 1679, 2809, 3152, 3653, 4162, 4799, 4917],
 'recipe_id': 15,
 'kitchen_name': 'UNKNOWNKITCHEN'}

In [4]:
cookbook_valid_answer = []
        
with open('validation_classification_answer.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):        
        kitchen = row[0]        
        cookbook_valid_answer.append({
            'ingredients': [],
            'kitchen_name': kitchen,
            'recipe_id': i
        })
        
cookbook_valid_answer[15]

{'ingredients': [], 'kitchen_name': 'southern_us', 'recipe_id': 15}

In [5]:
# Make ingredient list

node_ingredient = pd.read_fwf('node_ingredient.csv', header=None)
ingredient_list = {}
for index , row in node_ingredient.iterrows():
    ingredient_list[index] = row[0]

In [6]:
# Make kitchen list

kitchens = []
cookbook_full = cookbook_train + cookbook_valid_question + cookbook_valid_answer
for recipe in cookbook_full:
    print(recipe)
    if recipe['kitchen_name'] not in kitchens:
        kitchens.append(recipe['kitchen_name'])

kitchen_list = { index:kitchen_name for index, kitchen_name in enumerate(kitchens)  }

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [7]:
def add_kitchen_id_to_cookbook(kitchen_list, cookbook):   

    for recipe in cookbook:
        kitchen_id = kitchens.index(recipe['kitchen_name'])
        recipe['kitchen_id'] = kitchen_id

        
add_kitchen_id_to_cookbook(kitchen_list, cookbook_train)
add_kitchen_id_to_cookbook(kitchen_list, cookbook_valid_question)
add_kitchen_id_to_cookbook(kitchen_list, cookbook_valid_answer)
cookbook_train[15]

{'ingredients': [2866, 4243, 4362, 5377, 5408, 6187, 6352, 6568],
 'kitchen_name': 'italian',
 'recipe_id': 15,
 'kitchen_id': 5}

In [8]:
# Import the embedding
df = pd.read_csv('Embedding/Embp1q2.csv', delimiter=",")

In [9]:
# Create an embedding dictionary from the file
embedding = {}
for index, row in df.iterrows():
    ingredient_id = row[1]
    embedding_string = row[2]
    embedding_formatted = re.findall(r'[\d|\.|e|\+|\-]+', embedding_string)
    embedding_array = np.asarray(embedding_formatted, dtype=float)
    embedding[ingredient_id] = embedding_array
    
embedding[2813]

array([ 0.20561211, -0.23588583,  0.28525352,  0.18780349,  0.06065833,
       -0.39907852, -0.01796227, -0.11046874, -0.5681695 , -0.06789849,
        0.2828858 , -0.44968304, -0.12082586, -0.2529383 ,  0.3063318 ,
        0.39635187, -0.16872615, -0.18257272,  0.04451386,  0.19073167,
        0.32635474,  0.44178614,  0.3493576 , -0.17027283, -0.20612109,
        0.2922662 , -0.18655235,  0.12261377,  0.10712708, -0.3737765 ,
       -0.23305595,  0.35114026, -0.11983725, -0.44269526, -0.08014395,
        0.0641107 ,  0.02052055, -0.04574165,  0.2765586 ,  0.08460651,
       -0.06599417,  0.14313757, -0.08939448, -0.2841781 ,  0.30076197,
       -0.20119154, -0.04180305, -0.33311573,  0.06491397,  0.4866584 ,
        0.07972738,  0.14844902,  0.3157956 ,  0.38712576,  0.44291887,
        0.10612391, -0.18014635, -0.22282842, -0.46312287,  0.26253107,
       -0.19294113,  0.26587847,  0.27891377, -0.12879719])

In [10]:
# Give zero vector as embedding for all ingredients not found in the embedding
embedding_dim = len(embedding[0])

for ingredient in ingredient_list.keys():
    if ingredient not in embedding:
        embedding[ingredient] = np.zeros(embedding_dim)

In [16]:
# random embedding, for if you want to see the absolute minimum baseline
# for ingredient in embedding.keys():
#    embedding[ingredient] = np.random.rand(embedding_dim)

In [17]:
def make_data_from_cookbook(cookbook, embedding):
    # Make the data. X is a matrix with recipe rows and embedding dimension columns, 
    # where every row is the average of the embeddings in the recipes

    n_recipies = len(cookbook)
    embedding_dim = len(embedding[0])

    X = np.zeros((n_recipies, embedding_dim))
    y = np.zeros(n_recipies)
    for idx, recipe in enumerate(cookbook):
        embedding_avg = np.zeros(embedding_dim) 
        ingredient_count = len(recipe['ingredients'])
        y[idx] = recipe['kitchen_id']
        for ingredient_id in recipe['ingredients']:
            embedding_avg += embedding[ingredient_id] / ingredient_count
            
        X[idx, :] = embedding_avg
            
    return X,y
    

In [18]:
X_train, y_train = make_data_from_cookbook(cookbook_train, embedding)
X_valid, _       = make_data_from_cookbook(cookbook_valid_question, embedding)
_      , y_valid = make_data_from_cookbook(cookbook_valid_answer, embedding)

In [19]:
# Logistic regression (takes roughly 10 sec on my machine)
clf = LogisticRegression(penalty='l2', max_iter=1000).fit(X_train, y_train)

In [20]:
# Training score
clf.score(X_train, y_train)

0.42892937529196923

In [21]:
# Validation score
clf.score(X_valid, y_valid)

0.41755861365953106