In [2]:
import pandas as pd
import re

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('once')

In [3]:
def normalize_ingreds(x: list[str]) -> str:
    skip_verbs = [
        "crushed","crumbles","ground","minced","powder","chopped",
        "sliced","grilled","boneless","skinless","steamed"]
    remove_verbs = lambda x: re.sub(r"|".join(skip_verbs),'', x)
    lemmatizer = WordNetLemmatizer()
    
    ingreds = list(map(remove_verbs, x))
    ingreds = [" ".join([lemmatizer.lemmatize(j) 
                    for j in i.lower().split(" ")]) 
                for i in ingreds]
    ingreds = [re.sub("[^A-Za-z ]", "", i) for i in ingreds]
    ingreds = [re.sub(" +", " ", i) for i in ingreds]
    ingreds = [i.strip().replace(" ", "_" ) for i in ingreds]

    return ",".join(ingreds)

In [4]:
yummly_df = pd.read_json("../assets/yummly.json")

yummly_df["ingredients"] = yummly_df["ingredients"].map(normalize_ingreds)
yummly_df = yummly_df[~yummly_df.duplicated(["cuisine", "ingredients"], keep="first")]

In [5]:
y = yummly_df["cuisine"]
X = yummly_df.drop(['id', 'cuisine'], axis=1)

le = LabelEncoder()
y_transformed = le.fit_transform(yummly_df["cuisine"])

### Hypertuning LinearSVC using GridSerachCV

In [6]:
param_grid = {
    'preprocessor__vectorizer__ngram_range': [(1,1), (1, 2), (2,2)],
    'estimator__C': [0.01, 0.1, 0.9, 1, 10, 50, 100], 
    'estimator__penalty': ['l2']
}

preprocessor = ColumnTransformer(
    transformers=[
        ('vectorizer', TfidfVectorizer(stop_words="english"), "ingredients")
    ])

clf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('estimator', LinearSVC())
])

grid = GridSearchCV(
    clf_pipe,
    param_grid,
    cv=2,
    refit=True,
    scoring='accuracy')

grid.fit(X, y_transformed)       



GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('vectorizer',
                                                                         TfidfVectorizer(stop_words='english'),
                                                                         'ingredients')])),
                                       ('estimator', LinearSVC())]),
             param_grid={'estimator__C': [0.01, 0.1, 0.9, 1, 10, 50, 100],
                         'estimator__penalty': ['l2'],
                         'preprocessor__vectorizer__ngram_range': [(1, 1),
                                                                   (1, 2),
                                                                   (2, 2)]},
             scoring='accuracy')

In [7]:

print("Best hyper parameters: ", grid.best_params_)
print("Best Accuracy score: ", grid.best_score_) 

Best hyper parameters:  {'estimator__C': 0.9, 'estimator__penalty': 'l2', 'preprocessor__vectorizer__ngram_range': (1, 1)}
Best Accuracy score:  0.7753553785663878


### Hypertuning kNN using GridSearchCV

In [8]:
param_grid = {
    'preprocessor__vectorizer__ngram_range': [(1,1), (1,2), (2,2)],
    'estimator__n_neighbors': list(range(2, 20))
}

preprocessor = ColumnTransformer(
    transformers=[
        ('vectorizer', TfidfVectorizer(stop_words="english"), "ingredients")
    ])

knn_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('estimator', KNeighborsClassifier())
])

grid = GridSearchCV(
    knn_pipe,
    param_grid,
    cv=2,
    refit=True,
    scoring='accuracy')

grid.fit(X, y_transformed)   

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('vectorizer',
                                                                         TfidfVectorizer(stop_words='english'),
                                                                         'ingredients')])),
                                       ('estimator', KNeighborsClassifier())]),
             param_grid={'estimator__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                    11, 12, 13, 14, 15, 16, 17,
                                                    18, 19],
                         'preprocessor__vectorizer__ngram_range': [(1, 1),
                                                                   (1, 2),
                                                                   (2, 2)]},
             scoring='accuracy')

In [9]:
print("Best hyper parameters: ", grid.best_params_)
print("Best Accuracy score: ", grid.best_score_) 

Best hyper parameters:  {'estimator__n_neighbors': 14, 'preprocessor__vectorizer__ngram_range': (1, 1)}
Best Accuracy score:  0.732281479987902
