# Initial EDA (and first-round classification) of the Yummly dataset
## Author: Aaron W Chen

In [64]:
import pandas as pd
import numpy as np
import itertools
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

ModuleNotFoundError: No module named 'matplotlib'

In [2]:
# Load the raw training set from JSON; the preview shows the `id`, `cuisine`
# and `ingredients` columns.
train_data_path = '../data/raw/train.json'
data = pd.read_json(train_data_path)
data.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [36]:
# Class balance check: number of non-null entries per column for each cuisine.
data.groupby('cuisine').count()

Unnamed: 0_level_0,id,ingredients
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1
brazilian,467,467
british,804,804
cajun_creole,1546,1546
chinese,2673,2673
filipino,755,755
french,2646,2646
greek,1175,1175
indian,3003,3003
irish,667,667
italian,7838,7838


In [3]:
# Classification target: the cuisine label of every recipe.
targets = data.cuisine
targets

0              greek
1        southern_us
2           filipino
3             indian
4             indian
            ...     
39769          irish
39770        italian
39771          irish
39772        chinese
39773        mexican
Name: cuisine, Length: 39774, dtype: object

In [4]:
# Model input: one list of ingredient strings per recipe.
recipes = data.ingredients
recipes

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
                               ...                        
39769    [light brown sugar, granulated sugar, butter, ...
39770    [KRAFT Zesty Italian Dressing, purple onion, b...
39771    [eggs, citrus fruit, raisins, sourdough starte...
39772    [boneless chicken skinless thigh, minced garli...
39773    [green chile, jalapeno chilies, onions, ground...
Name: ingredients, Length: 39774, dtype: object

In [5]:
# Build a single stratified train/validation split from a 3-fold splitter.
# Only one split is used downstream. The previous `for` loop overwrote its
# variables on every iteration, so the *last* fold was the one that survived;
# that choice is kept here but made explicit instead of implicit.
skf = StratifiedKFold(n_splits=3)
splits = list(skf.split(recipes, targets))
train_index, val_index = splits[-1]

# `split` yields positional indices, so rows must be selected with `.iloc`.
# Plain `series[int_array]` is label-based and only matches positions while
# the Series still carries the default RangeIndex.
recipes_train, recipes_val = recipes.iloc[train_index], recipes.iloc[val_index]
targets_train, targets_val = targets.iloc[train_index], targets.iloc[val_index]

In [6]:
# Fit a label encoder on the training targets so it learns the set of cuisine classes.
le = LabelEncoder()
le.fit(targets_train)

LabelEncoder()

In [7]:
# Inspect the classes learned by the encoder.
le.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'], dtype=object)

In [8]:
# Integer-encode the training labels with the fitted encoder.
# NOTE(review): the logistic-regression cell in this notebook is fit on the
# string labels in `targets_train`, not on this array — confirm whether
# `transformed_targets_train` is still needed.
transformed_targets_train = le.transform(targets_train)

In [24]:
# Flatten the per-recipe ingredient lists into one long list of ingredient strings.
all_words = list(itertools.chain.from_iterable(recipes_train))

In [29]:
# Sanity check: first few flattened ingredients and the total ingredient count.
all_words[:5], len(all_words)

(['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper'],
 285089)

In [19]:
# Preview the training recipes (original row labels are retained after the split).
recipes_train

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
                               ...                        
26906    [milk, baking powder, all-purpose flour, sugar...
26954    [worcestershire sauce, celery, ground black pe...
26960    [garlic, fresh parsley, chicken stock, long gr...
26969    [green chile, seasoning salt, diced tomatoes, ...
26982    [Johnsonville Smoked Sausage, water, stewed to...
Name: ingredients, Length: 26516, dtype: object

In [17]:
# Unigram TF-IDF vectorizer. The custom token pattern accepts only runs of two
# or more ASCII letters delimited by word boundaries, so digits and punctuation
# never become tokens.
tfidf_vec_unigram = TfidfVectorizer(token_pattern=r"(?u)\b[a-zA-Z]{2,}\b")

In [30]:
# Fit the vectorizer on the flat ingredient list, i.e. each ingredient string
# is treated as its own document.
# NOTE(review): the transform cells in this notebook feed whole recipes (joined
# ingredients), so the IDF weights are learned on a different document unit
# than the one they are applied to — confirm this is intended.
x = tfidf_vec_unigram.fit(all_words)

In [31]:
# Show the object returned by `fit` (displayed as the fitted TfidfVectorizer).
x

TfidfVectorizer(token_pattern='(?u)\\b[a-zA-Z]{2,}\\b')

In [38]:
# Vectorize each training recipe: join its ingredients into one lower-cased
# string and project it onto the fitted TF-IDF vocabulary.
recipes_train_uni_vec = tfidf_vec_unigram.transform(recipes_train.apply(" ".join).str.lower())

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out` and fall back only on older installs.
if hasattr(tfidf_vec_unigram, "get_feature_names_out"):
    train_feature_names = tfidf_vec_unigram.get_feature_names_out()
else:
    train_feature_names = tfidf_vec_unigram.get_feature_names()

# Dense, labelled view of the sparse TF-IDF matrix: one row per training
# recipe (original index kept), one column per vocabulary term.
word_matrix = pd.DataFrame(recipes_train_uni_vec.toarray(),
                           columns=train_feature_names,
                           index=recipes_train.index)

In [39]:
# Preview the dense TF-IDF feature matrix for the training recipes.
word_matrix

Unnamed: 0,abalone,abbamele,absinthe,abura,acai,accent,accompaniment,achiote,acid,acini,...,yuzu,yuzukosho,za,zatarain,zero,zest,zesty,zinfandel,ziti,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.231513,0.0,0.0,0.0,0.0
26954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
26960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
26969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [43]:
# Peek at a slice of the learned vocabulary (column names).
word_matrix.columns[201:220]

Index(['biga', 'bihon', 'bing', 'bird', 'biryani', 'biscotti', 'biscuit',
       'biscuits', 'bison', 'bisquick', 'bits', 'bitter', 'bitters',
       'bittersweet', 'blacan', 'black', 'blackberries', 'blackberry',
       'blackcurrant'],
      dtype='object')

In [47]:
# TF-IDF vector of the first training recipe.
word_matrix.iloc[0]

abalone      0.0
abbamele     0.0
absinthe     0.0
abura        0.0
acai         0.0
            ... 
zest         0.0
zesty        0.0
zinfandel    0.0
ziti         0.0
zucchini     0.0
Name: 0, Length: 2771, dtype: float64

In [48]:
# Keep only the terms with a non-zero weight in the first training recipe.
word_matrix.iloc[0][word_matrix.iloc[0] > 0]

beans        0.224367
black        0.181163
cheese       0.178052
crumbles     0.311264
feta         0.291110
garbanzo     0.343606
garlic       0.159290
grape        0.321816
lettuce      0.266857
olives       0.260341
onion        0.198533
pepper       0.147376
purple       0.248575
romaine      0.317171
seasoning    0.240609
tomatoes     0.189996
Name: 0, dtype: float64

In [49]:
# Raw ingredient list of the first training recipe, for comparison with its TF-IDF terms.
recipes_train.iloc[0]

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles']

In [53]:
# One-vs-rest logistic regression on the unigram TF-IDF features, with
# class weights balanced to offset the uneven cuisine frequencies.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# verify against the installed version before upgrading.
lr_uni_tfidf_ovr = LogisticRegression(
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=50,
).fit(word_matrix, targets_train)

# Predictions on the training data itself (used for the training-set metrics).
train_pred_tfidf_uni_ovr = lr_uni_tfidf_ovr.predict(word_matrix)

In [57]:
def get_metrics(y_test, y_predicted):
    """Score a set of predictions against the true labels.

    Precision, recall and F1 are computed with ``average='weighted'``;
    accuracy is the fraction of predictions that match the true label.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predicted : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    # Keyword arguments shared by the three averaged metrics.
    weighted = {'pos_label': None, 'average': 'weighted'}

    # TP / (TP + FP)
    precision = precision_score(y_test, y_predicted, **weighted)
    # TP / (TP + FN)
    recall = recall_score(y_test, y_predicted, **weighted)
    # Harmonic mean of precision and recall.
    f1 = f1_score(y_test, y_predicted, **weighted)
    # Correct predictions / all predictions.
    accuracy = accuracy_score(y_test, y_predicted)

    return accuracy, precision, recall, f1

In [58]:
# Training-set scores for the one-vs-rest unigram TF-IDF model.
(
    acc_lr_uni_tfidf_ovr,
    prec_lr_uni_tfidf_ovr,
    recall_lr_uni_tfidf_ovr,
    f1_lr_uni_tfidf_ovr,
) = get_metrics(targets_train, train_pred_tfidf_uni_ovr)

print(
    "accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
    % (
        acc_lr_uni_tfidf_ovr,
        prec_lr_uni_tfidf_ovr,
        recall_lr_uni_tfidf_ovr,
        f1_lr_uni_tfidf_ovr,
    )
)

accuracy = 0.816, precision = 0.830, recall = 0.816, f1 = 0.820


In [59]:
# Vectorize the validation recipes with the vocabulary and IDF weights that
# were already fitted (transform only, no refit).
reci_val_uni_vec = tfidf_vec_unigram.transform(recipes_val.apply(" ".join).str.lower())

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out` and fall back only on older installs.
if hasattr(tfidf_vec_unigram, "get_feature_names_out"):
    val_feature_names = tfidf_vec_unigram.get_feature_names_out()
else:
    val_feature_names = tfidf_vec_unigram.get_feature_names()

# Dense, labelled view: one row per validation recipe (original index kept),
# one column per vocabulary term.
word_matrix_val = pd.DataFrame(reci_val_uni_vec.toarray(),
                               columns=val_feature_names,
                               index=recipes_val.index)

In [60]:
# Predict cuisines for the held-out validation recipes.
val_pred_tfidf_uni_ovr = lr_uni_tfidf_ovr.predict(word_matrix_val)

In [61]:
# Validation-set scores for the one-vs-rest unigram TF-IDF model.
(
    val_acc_lr_uni_tfidf_ovr,
    val_prec_lr_uni_tfidf_ovr,
    val_recall_lr_uni_tfidf_ovr,
    val_f1_lr_uni_tfidf_ovr,
) = get_metrics(targets_val, val_pred_tfidf_uni_ovr)

print(
    "accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
    % (
        val_acc_lr_uni_tfidf_ovr,
        val_prec_lr_uni_tfidf_ovr,
        val_recall_lr_uni_tfidf_ovr,
        val_f1_lr_uni_tfidf_ovr,
    )
)

accuracy = 0.763, precision = 0.776, recall = 0.763, f1 = 0.768


In [63]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.winter):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the "True label" axis and columns
        on the "Predicted label" axis.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, divide every row by its sum before plotting. Rows that sum
        to zero (a class with no true samples) are left as all zeros instead
        of producing NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``imshow``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can continue with
        e.g. ``.show()`` or ``.savefig(...)``.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guarded division: an empty row would otherwise yield NaN cells and a
        # NaN colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=20)
    plt.yticks(tick_marks, classes, fontsize=20)

    fmt = '.2f' if normalize else 'd'
    # Cells below half of the maximum get white text, the rest black.
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                 color="white" if cm[i, j] < thresh else "black", fontsize=40)

    plt.ylabel('True label', fontsize=30)
    plt.xlabel('Predicted label', fontsize=30)
    # Lay out only after the axis labels exist so they are included in the fit.
    plt.tight_layout()

    return plt

NameError: name 'plt' is not defined