In [1]:
import pandas as pd
import numpy as np
from csv import reader
from sklearn.linear_model import LogisticRegression

In [2]:
cookbook_train = []

with open('train.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):
        
        kitchen = row[-1]
        ingredient_strings = row[1:-1]
        ingredients = [ int(s) for s in ingredient_strings ]
        
        cookbook_train.append({
            'ingredients': ingredients,
            'kitchen_name': kitchen,
            'recipe_id': i
        })      
        
cookbook_train[15]

{'ingredients': [2866, 4243, 4362, 5377, 5408, 6187, 6352, 6568],
 'kitchen_name': 'italian',
 'recipe_id': 15}

In [3]:
cookbook_valid_question = []

with open('validation_classification_question.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):
        ingredients = [ int(s) for s in row ]
        cookbook_valid_question.append({
            'ingredients': ingredients,
            'recipe_id': i,
            'kitchen_name': "UNKNOWNKITCHEN"
        })     

cookbook_valid_question[15]

{'ingredients': [909, 1046, 1308, 1679, 2809, 3152, 3653, 4162, 4799, 4917],
 'recipe_id': 15,
 'kitchen_name': 'UNKNOWNKITCHEN'}

In [4]:
cookbook_valid_answer = []
        
with open('validation_classification_answer.csv', 'r') as file:

    csv_reader = reader(file, delimiter=",")

    for i, row in enumerate(csv_reader):        
        kitchen = row[0]        
        cookbook_valid_answer.append({
            'ingredients': [],
            'kitchen_name': kitchen,
            'recipe_id': i
        })
        
cookbook_valid_answer[15]

{'ingredients': [], 'kitchen_name': 'southern_us', 'recipe_id': 15}

In [5]:
# Make ingredient list

node_ingredient = pd.read_fwf('node_ingredient.csv', header=None)
ingredient_list = {}
for index , row in node_ingredient.iterrows():
    ingredient_list[index] = row[0]

In [6]:
# Make kitchen list

kitchens = []
cookbook_full = cookbook_train + cookbook_valid_question + cookbook_valid_answer
for recipe in cookbook_full:
    print(recipe)
    if recipe['kitchen_name'] not in kitchens:
        kitchens.append(recipe['kitchen_name'])

kitchen_list = { index:kitchen_name for index, kitchen_name in enumerate(kitchens)  }

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [7]:
def add_kitchen_id_to_cookbook(kitchen_list, cookbook):   

    for recipe in cookbook:
        kitchen_id = kitchens.index(recipe['kitchen_name'])
        recipe['kitchen_id'] = kitchen_id

        
add_kitchen_id_to_cookbook(kitchen_list, cookbook_train)
add_kitchen_id_to_cookbook(kitchen_list, cookbook_valid_question)
add_kitchen_id_to_cookbook(kitchen_list, cookbook_valid_answer)
cookbook_train[15]

{'ingredients': [2866, 4243, 4362, 5377, 5408, 6187, 6352, 6568],
 'kitchen_name': 'italian',
 'recipe_id': 15,
 'kitchen_id': 5}

In [8]:
def make_data_from_cookbook(cookbook, ingredient_list):
    # Make the data. X is a binary matrix with recipe rows and ingredients columns, 
    # where (recipe, ingredients) is 1 if the ingredient is present in the recipe

    n_recipies = len(cookbook)
    n_ingredients = len(ingredient_list)

    X = np.zeros((n_recipies, n_ingredients))
    y = np.zeros(n_recipies)
    for idx, recipe in enumerate(cookbook):
        y[idx] = recipe['kitchen_id']
        for ingredient_id in recipe['ingredients']:
            X[idx][ingredient_id] = 1
            
    return X,y
    

In [9]:
# Logistic regression (takes roughly 60 sec on my machine)
X_train, y_train = make_data_from_cookbook(cookbook_train, ingredient_list)
X_valid, _       = make_data_from_cookbook(cookbook_valid_question, ingredient_list)
_      , y_valid = make_data_from_cookbook(cookbook_valid_answer, ingredient_list)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

In [10]:
# Training score
clf.score(X_train, y_train)

0.8786257272688666

In [11]:
# Validation score
clf.score(X_valid, y_valid)

0.761085626911315