# **Import**

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Data preview**

In [3]:
trainframe = pd.read_json("train.json")
testframe = pd.read_json("test.json")

In [4]:
trainframe.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
trainframe.tail()

Unnamed: 0,id,cuisine,ingredients
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."
39773,2362,mexican,"[green chile, jalapeno chilies, onions, ground..."


In [6]:
trainframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [7]:
trainframe['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [8]:
trainframe['ingredients'].values

array([list(['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']),
       list(['plain flour', 'ground pepper', 'salt', 'tomatoes', 'ground black pepper', 'thyme', 'eggs', 'green tomatoes', 'yellow corn meal', 'milk', 'vegetable oil']),
       list(['eggs', 'pepper', 'salt', 'mayonaise', 'cooking oil', 'green chilies', 'grilled chicken breasts', 'garlic powder', 'yellow onion', 'soy sauce', 'butter', 'chicken livers']),
       ...,
       list(['eggs', 'citrus fruit', 'raisins', 'sourdough starter', 'flour', 'hot tea', 'sugar', 'ground nutmeg', 'salt', 'ground cinnamon', 'milk', 'butter']),
       list(['boneless chicken skinless thigh', 'minced garlic', 'steamed white rice', 'baking powder', 'corn starch', 'dark soy sauce', 'kosher salt', 'peanuts', 'flour', 'scallions', 'Chinese rice vinegar', 'vodka', 'fresh ginger', 'egg whites', 'broccoli', 'toasted sesame seeds', 'sugar', 'store bought low

# **Ingredients analysis**

In [9]:
pattern = re.compile("^[a-zA-Z ]*$")

In [None]:
# that code is dumb - don't look at it

# check if all words are "reasonable" - just testing
uwu = [ingredient if pattern.match(ingredient) is None else None for ingredients_list in trainframe['ingredients'] for ingredient in ingredients_list]

# for x in uwu:
#     if x is not None:
#         print(x)

# of course they are not

owo = []
# uwu = [pattern.sub(" ", str(ingredient))  if ingredient is not None for ingredient in uwu]

for x in uwu:
    if x is not None:
        owo.append(re.sub("[^a-zA-Z $]", " ", str(x)))

print(sorted(owo))
owo



# **Data preparation**

In [11]:
lemmatizer = WordNetLemmatizer()

# just to compare with TfidfVectorize - we got 3 times less results using library for NLP
ingredients = list(set(lemmatizer.lemmatize(re.sub("[^a-zA-Z $]", " ", str(ingredient))) for ingredients_list in trainframe['ingredients'] for ingredient in ingredients_list))


In [None]:
ingredients

In [13]:
testframe['lemmatized_ingredients'] = [(' ').join(lemmatizer.lemmatize(re.sub("[^a-zA-Z $]", " ", str(ingredient))) for ingredient in ingredients_list) for ingredients_list in testframe['ingredients']]

trainframe['lemmatized_ingredients'] = [(' ').join(lemmatizer.lemmatize(re.sub("[^a-zA-Z $]", " ", str(ingredient))) for ingredient in ingredients_list) for ingredients_list in trainframe['ingredients']]

In [14]:
# drop redundant data

trainframe.drop(["id", "ingredients"], axis=1, inplace=True)
testframe.drop(["ingredients"], axis=1, inplace=True)

In [15]:
vectorizer = TfidfVectorizer()

trainingredients = np.asarray(vectorizer.fit_transform(trainframe['lemmatized_ingredients']).todense())
testingredients = np.asarray(vectorizer.transform(testframe['lemmatized_ingredients']).todense())

In [16]:
# just to visualize data, redundant code
df = pd.read_json("train.json")
df.drop(["ingredients", "id"], axis=1, inplace=True)
df1 = pd.DataFrame(trainingredients, columns=vectorizer.get_feature_names_out())
res = pd.concat([df, df1], axis=1)
res
trainframe['cuisine']

0              greek
1        southern_us
2           filipino
3             indian
4             indian
            ...     
39769          irish
39770        italian
39771          irish
39772        chinese
39773        mexican
Name: cuisine, Length: 39774, dtype: object

In [17]:
cuisine_encoder = LabelEncoder()
encoded_cuisine = cuisine_encoder.fit_transform(trainframe['cuisine'].values.reshape(-1,1))

  y = column_or_1d(y, warn=True)


# **Training model**

In [24]:
classifier = LogisticRegression(solver='liblinear')

X_train, X_test, y_train, y_test = train_test_split(trainingredients, encoded_cuisine, test_size=0.20)
classifier.fit(X_train, y_train)

y_predict = classifier.predict(X_test)
accuracy_score(y_test, y_predict)

# cross_val_score(classifier, trainingredients, encoded_cuisine)

0.774732872407291

In [27]:
parameters = {
    # 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

clf = GridSearchCV(classifier, parameters)

clf.fit(trainingredients, encoded_cuisine)

GridSearchCV(estimator=LogisticRegression(solver='liblinear'), param_grid={})

# **Decoding and exporting data**

In [28]:
y_test_predict = classifier.predict(testingredients)
result_decoded = cuisine_encoder.inverse_transform(y_test_predict)

In [29]:
result_decoded

array(['british', 'southern_us', 'italian', ..., 'italian', 'southern_us',
       'mexican'], dtype=object)

In [30]:
submission = pd.DataFrame({'id': testframe['id'], 'cuisine': result_decoded})
submission.to_csv('submission.csv', index=False)