In [1]:
import argparse
from collections import defaultdict, Counter
import cv2
from fuzzywuzzy import fuzz
from itertools import combinations # not mandatory
import imutils
from imutils import build_montages, paths
import os
import matplotlib.pyplot as plt
import networkx as nx
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
import shutil
import seaborn as sns

from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import string

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import cv, XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectFromModel
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

2023-08-25 15:11:28.204666: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Cleaning and Arranging the data


At first we want to remove unnecessary signs from the data, rearrange some of the columns that it will be easer to learn from it and train the model on it. Therefore, we define useful functions for cleaning the data.

In [2]:
def remove_parentheses(string):
    pattern = r'\([^()]*\)'  # Matches "(...)" pattern
    while re.search(pattern, string):
        string = re.sub(pattern, '', string)
    return string.strip()

def format_ingredients(string):
    string = string.replace('.', '')  # Remove periods
    string = re.sub(r'\s*,\s*', ',', string)  # Remove spaces after commas
    ingredients_list = string.split(',')  # Split by comma
    formatted_ingredients = [ingredient.strip() for ingredient in ingredients_list]  # Remove leading/trailing spaces for each ingredient
    return ', '.join(formatted_ingredients)  # Join formatted ingredients with commas

def clean_text_values(df):
    text_columns = df.select_dtypes(exclude=[np.number]).columns
    for column in text_columns:
        if column == 'category':
            continue
        df[column].fillna('NA', inplace=True)
        df[column] = df[column].map(str.lower)
        if column == 'ingredients':
            df[column] = df[column].apply(remove_parentheses)
            df[column] = df[column].apply(format_ingredients)
        if column == 'household_serving_fulltext':
            df[column] = df[column].map(lambda x: re.sub('[^a-z]+', '', x))
    return df

### Read Data

In [3]:
food_train = pd.read_csv('data/food_train.csv')
food_test = pd.read_csv('data/food_test.csv')
food_nutrients = pd.read_csv('data/food_nutrients.csv')
nutrients_names = pd.read_csv('data/nutrients.csv')

In [4]:
clean_text_values(food_train)
clean_text_values(food_test)
nutrients = pd.merge(food_nutrients, nutrients_names, how='left',on='nutrient_id')
pivoted_nutrients = pd.pivot_table(nutrients, values='amount', index='idx', columns='name')
data = pd.merge(food_train, pivoted_nutrients, how='left', on='idx')
data_test = pd.merge(food_test, pivoted_nutrients, how='left', on='idx')

### Remove columns with > 0.8 nulls

We don't want that missing data will hurt our data analysis. Hence, we decided that if there is a product which 80% of the his data is missing, we will ignore it and don't use it to get our conclusions.

In [5]:
cols_to_remove = data.columns[data.isnull().mean() > 0.8]
data = data.drop(columns=cols_to_remove)

cols_to_remove_test = data_test.columns[data_test.isnull().mean() > 0.8]
data_test = data_test.drop(columns=cols_to_remove_test)

### Enums

In [6]:
CANDY = 'candy'
COOKIES = 'cookies_biscuits'
CAKES = 'cakes_cupcakes_snack_cakes'
CHIPS_PRETZELS = 'chips_pretzels_snacks'
CHOCOLATE = 'chocolate'
POPCORN_PEANUTS = 'popcorn_peanuts_seeds_related_snacks'

CATEGORIES = [CANDY, COOKIES, CAKES, CHIPS_PRETZELS, CHOCOLATE, POPCORN_PEANUTS]

In [8]:
data_test['idx'].nunique()

3525

# Our way of work:

At first we will work on each type of data separately.  
*Part 1 - Images data:* We look over the train and test images for all snack products and build Convolutional Neural Network (CNN).  
*Part 2 - Tabular data:* We look over the 3 data sets - food_train.csv and food_test.csv, nutrients.csv and food_nutrients.csv. In this part we will handle each column and analyze the data.  
*Part 3* - Modeling: We will train the data and try to get the best model for Predicting the test products' category.  

# Part 1 - Images data

We built a CNN in different regular python file, using tensorflow model (as we saw in class) with the images data, as you can in "CNN.py".  
Then we calculate the probability for each image to be belong to each category and save it in a probabilities vector. Then, we export the probabilitis vector and save it in .csv file called 'photos_probs.csv'.  
Finally, we chose to add it to the tabular data .

So, now lets read it and add it to the tabular data.

In [9]:
photos_probs = pd.read_csv('photos_probs.csv')
photo_test_probs = pd.read_csv('photos_test_probs.csv')
idx = photos_probs['idx'].unique()
print(len(idx))

22077


Unfortunately, we didn't get probs for all the 30K rows.   
Therefore, we created 2 df for the data analysis part: 
- One with 30K rows and without the images probs (df1).   
- second with 20K rows but with the images probs (df2).  

We will decide based on the following results on which we should foucus.  
In Part 3 - We will try to use the images probs in a model and check if it helps for the prediction.

In [12]:
data_with_probs = pd.merge(data, photos_probs, how="left",on='idx')
data_with_probs_test = pd.merge(data_test, photo_test_probs, how="left",on='idx')
df1 = data.copy()
df2 = data_with_probs[data_with_probs['idx'].isin(idx)].copy()

As we told in the instruction, the classes are not balanced but they are not very imbalanced either. We will use later in SMOTE method to deal with that.   

In [9]:
df1['category'].value_counts()

popcorn_peanuts_seeds_related_snacks    7645
candy                                   7584
cookies_biscuits                        5284
cakes_cupcakes_snack_cakes              3786
chocolate                               3772
chips_pretzels_snacks                   3680
Name: category, dtype: int64

### Split the data

We'll split df1 to train and test sets. Then, we'll split the df1_train set into 2 data sets. One for feature engineering and the other for model tuning.  
We will use the FE data set also for fillNA methods and the modeltuning set for resampling.

In [10]:
X = df1.loc[:,df1.columns != 'category']
y = df1.loc[:,['category']]

y['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
 'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                        [0, 1, 2, 3, 4, 5], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_fe, X_mt, y_fe, y_mt = train_test_split(X_train, y_train, test_size=0.4, random_state=42, stratify=y_train)

X_fe_train, X_fe_test, y_fe_train, y_fe_test = train_test_split(X_fe, y_fe, test_size=0.2, random_state=42, stratify=y_fe)
X_mt_train, X_mt_test, y_mt_train, y_mt_test = train_test_split(X_mt, y_mt, test_size=0.2, random_state=42, stratify=y_mt)

In [19]:
# X_y_train = pd.merge(X_train, y_train, how="left",left_index=True, right_index=True)
# X_y_train['idx'] = X_y_train['idx'].map(lambda x: str(x))
# X_y_test = pd.merge(X_test, y_test, how="left",left_index=True, right_index=True)
# X_y_test['idx'] = X_y_test['idx'].map(lambda x: str(x))

# X_y_mt = pd.merge(X_mt, y_mt, how="left",left_index=True, right_index=True)


#### #### concat for eda
X_fe_w_cat = pd.merge(X_fe_train, y_fe_train, how="left",left_index=True, right_index=True)

### Define useful functions

In [12]:
def better_name(df, col_name):
    df[col_name] = df[col_name].map(lambda x: str(x).translate(str.maketrans('', '', string.punctuation)))
    return(df)

def data_by_category(df, category):
    return df[df['category'] == category]

def select_top_words(row, dict_words, column):
    desc = row[column]
    for category, words in dict_words.items():
        for word in dict_words[category]:
            if word in desc:
                name = f"{column}_{category}_{word}"
                return name
    return 'None'

# Part 1 - Feature Engineering

### Top 15 words in each column in the data per category
We have found the most 15 common words in each column: 'brand', 'description', 'ingredients' and 'household' per category. That will help us understand which words are represent and imply each category.

You can see all the data analysis we performed in the "Final_Project-Part_2-Words_Selection" notebook.  

# 'brand' column Research & Treatment

We will look on top 15 common brands over each category. After that we will change the brand column and make sure that just the selected top 15 (from each category) brands will stay the same and all the other will sign as unbranded.

In [13]:
def extract_brand(df):
    brand_words =     brand_words = {
    "candy": ["ferrara candy company","frankford candy llc","sunmark","mars chocolate north america llc","just born inc","ross acquisition inc",
              "russell stover candies inc","maud borup inc","reeses","tops markets llc","supervalu inc","weis markets inc","tootsie roll industries inc",
              "holiday candy corp inc","wm wrigley jr company",
    ],
    "cookies": ["nabisco biscuit company","keebler company","lofthouse foods","ahold usa inc","bimbo bakeries usa inc","safeway inc","wegmans food markets inc",
                "lenny  larrys","abimar foods inc","kingston marketing co",
    ],
    "cakes": ["mckee foods corporation","hostess brands llc","tasty baking company","bimbo bakeries usa inc","sweet ps bake shop","twobite","schnuck markets inc",
              "fresh  easy","dawn food products inc","dierbergs markets inc","flowers foods inc","labrees bakery inc","rich products corporation",
              "aryzta llc","rocky mountain pies",
    ],
    "chips_pretzels": ["utz quality foods inc","the hain celestial group inc","herr foods inc","snyderslance inc","jays foods inc","cape cod potato chips inc",
                       "inventure foods inc","old dutch foods inc","better made snack foods inc","small planet foods inc","roundys","pinnacle foods group llc",
    ],
    "chocolate": ["lindt  sprungli schweiz ag","russell stover candies inc","mars chocolate north america llc","godiva chocolatier inc","ghirardelli chocolate company",
                  "frankford candy llc","moonstruck chocolate co","ross acquisition inc","rm palmer co","hammonds candies since 1920 llc","theo chocolate inc",
                  "demets candy company","ghirardelli","green  blacks","fannie may confections inc","nestle usa inc","harmons inc","whitmans candies inc",
    ],
    "popcorn_peanuts": ["american importing co inc","nabisco food company","john b sanfilippo  son inc","tops markets llc","star snacks co inc",
                        "safeway inc","supervalu inc","ahold usa inc","weis markets inc",
    ],
}

    num_rows = len(df)
    data_dict = {key: set(values) for key, values in brand_words.items()}
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in brand_words.items() for word in words}
    new_df = pd.DataFrame(columns_dict)
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in brand_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['brand'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['brand'] = result_df.apply(lambda row: select_top_words(row, brand_words, 'brand'), axis=1)
    
    return result_df

# 'description' column Research & Treatment

After analyzing the results baised on the feature engineering train data set, we received the dict described above which includes indication words for every category.

We will add a column for every selected word. it will contain 1 if the word is in appear in the snack description, and 0 else.

In [14]:
def extract_description(df):
    desc_words = { CAKES:['cake', 'cakes', 'tastykake', 'cupcakes', 'cheezecake', 'bakery fresh goodness', 'apple pie', 'pie', 
           'pecan pie', 'pumpkin pie', 'pies', 'cupcake', 'coffeecake', 'brownie', 'brownies', 'slice', 'sliced', 
           'torte', 'donut', 'donuts', 'bakery', 'eclair'],
    CANDY:['candy', 'candies', 'gummy', 'gummi', 'gummies', 'twist', 'stick', 'sticks', 'marshmallow', 'marshmallows',
           'jelly', 'jell', 'snacks', "sweet's",  "brach's", 'cherry', 'strawberry', 'orange', 'watermelon', 
           'peppermint', 'grape', 'lolli', 'fruit', 'licorice', 'drops', 'confection', 'chicks', 'sour', 'sweet', 
           'peeps', 'dragee', 'fizz', 'patties', 'cane', 'chewy'],
    CHIPS_PRETZELS:['potato chips', 'tortilla chips', 'kettle cooked potato chips', 'sea salt', 'kettle chips', 'kettle',
                    "snyder's of hanover", 'sour cream & onion', 'wavy potato chips', "herr's", 'chips', 'chip', 
                    'tortilla', 'crisps', 'crisp', 'potato', 'pretzel', 'pretzels', 'fries'],
    CHOCOLATE:[ 'chocolate', 'chocolates', 'dark chocolate', 'lindt', 'ghirardelli chocolate', 'russell stover', 'godiva',  
                'truffle', 'truffles', 'dark chocolate bar', 'cocoa', 'praline', 'toffee', 'belgian', 'dark', 'caramel'],
    COOKIES:['cookie', 'cookies', 'chocolate chip', 'chocolate chip cookies', 'sandwich cookies', 'sandwich', 'shortbread cookies',
             'frosted sugar cookies', 'sugar cookies', 'cracker', 'crackers', 'frosted', 'wafer', 'wafers', 'biscuit', 
             'macaroon', 'waffle', 'gingernread', 'macarons'],
    POPCORN_PEANUTS:['popcorn', 'almond', 'almonds', 'mix', 'trail mix', 'peanuts', 'mixed nuts', 'nuts', "pistachios",
                    'dry roasted peanuts', 'roasted', 'cashews', 'kernel', 'shell', 'pecan', 'seeds', 'macadamias', 
                     'corn', 'nutty']}
    
    num_rows = len(df)
    data_dict = {key: set(values) for key, values in desc_words.items()}
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in desc_words.items() for word in words}
    new_df = pd.DataFrame(columns_dict)
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in desc_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['description'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['description'] = result_df.apply(lambda row: select_top_words(row, desc_words, 'description'), axis=1)
    
    return result_df

# 'ingredients' column Research & Treatment

After analyzing the results baised on the feature engineering train data set, we received the dict described above which includes indication words for every category.

We will add a column for every selected word. it will contain 1 if the word is in appear in the snack ingredients, and 0 else.

In [15]:
def extract_ingredients(df):
    ingre_words = {
        CAKES:['leavening', 'eggs', 'water', 'soybean oil', 'whey', 'xanthan gum', 'polysorbate 60', 'guar gum'],
        CANDY:['gelatin', 'carnauba wax', 'red 40', 'blue 1', 'yellow 5', 'citric acid', 
           'natural and artificial flavors', 'malic acid', 'sodium citrate'],
        CHIPS_PRETZELS:['potatoes', 'onion powder', 'garlic powder', 'yeast extract', 'spices',
                   'monosodium glutamate'],
        CHOCOLATE:['cocoa butter', 'milk chocolate', 'chocolate', 'milk', 'dark chocolate', 'butter', 'chocolate liquor',
              'skim milk', 'vanilla', 'palm oil'],
        COOKIES:['baking soda', 'enriched flour', 'leavening', 'eggs', 'high fructose corn syrup', 'wheat flour', 'butter'],
        POPCORN_PEANUTS:['almonds', 'peanuts', 'cashews', 'popcorn', 'pecans', 'peanut oil', 'raisins']
    } 
    
    num_rows = len(df)
    data_dict = {key: set(values) for key, values in ingre_words.items()}
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in ingre_words.items() for word in words}
    new_df = pd.DataFrame(columns_dict)
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in ingre_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['ingredients'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['ingredients'] = result_df.apply(lambda row: select_top_words(row, ingre_words, 'ingredients'), axis=1)
            
    return result_df

# 'household_serving_fulltext' column Research & Treatment

After analyzing the results baised on the feature engineering train data set, we received the dict described above which includes indication words for every category.

We will add a column for every selected word. it will contain 1 if the word is in appear in the snack household' column, and 0 else.

In [16]:
def extract_household(df):
    household_words = {CAKES:['cake', 'cakes', 'cupcakes', 'cupcake','brownie', 'pie', 'donut', 'muffin', 'tart', 
                              'torte', 'doughnut','slice', 'pastry', 'bun', 'loaf'],
                       CANDY:['candies', 'candy','gummies', 'gummy', 'marshmallow', 'pop', 'twist', 'stick','bear'],
                       CHIPS_PRETZELS:['chip', 'chips', 'fries', 'crisp', 'chipsabout', 'pretzelsabout'],
                       CHOCOLATE:['squares', 'square', 'balls', 'ball', 'truffle', 'praline', 'pralines', 'block', 'tablet', 'bar'],
                       COOKIES:['cookies', 'cookie', 'wafers', 'wafer', 'crackers', 'cracker', 'biscuit', 'macaroon', 'waffle'],
                       POPCORN_PEANUTS:['tbsp', 'kernel', 'popcorn', 'almond', 'shell', 'pecan']}
    
    num_rows = len(df)
    data_dict = {key: set(values) for key, values in household_words.items()}
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in household_words.items() for word in words}
    new_df = pd.DataFrame(columns_dict)
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in household_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['household_serving_fulltext'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['household_serving_fulltext'] = result_df.apply(lambda row: select_top_words(row, household_words, 'household_serving_fulltext'), axis=1)
            
    return result_df

# 'serving_size' column Research & Treatment

In [20]:
def data_by_category(df, category):
    return df[df['category'] == category]

def find_mean_and_median(category):
    df = data_by_category(X_fe_w_cat, category)
    return df['serving_size'].mean(), df['serving_size'].median()

mean_median_df = {'Category': [CANDY, COOKIES, CAKES, CHIPS_PRETZELS, CHOCOLATE, POPCORN_PEANUTS],
              'Average Size': [find_mean_and_median(CANDY)[0],find_mean_and_median(COOKIES)[0], find_mean_and_median(CAKES)[0],
                               find_mean_and_median(CHIPS_PRETZELS)[0], find_mean_and_median(CHOCOLATE)[0],find_mean_and_median(POPCORN_PEANUTS)[0]],
              'Median Size': [find_mean_and_median(CANDY)[1],find_mean_and_median(COOKIES)[1], find_mean_and_median(CAKES)[1],
                               find_mean_and_median(CHIPS_PRETZELS)[1], find_mean_and_median(CHOCOLATE)[1],find_mean_and_median(POPCORN_PEANUTS)[1]]}
size_df = pd.DataFrame(mean_median_df)

size_df

Unnamed: 0,Category,Average Size,Median Size
0,candy,,
1,cookies_biscuits,,
2,cakes_cupcakes_snack_cakes,,
3,chips_pretzels_snacks,,
4,chocolate,,
5,popcorn_peanuts_seeds_related_snacks,,


It seems that product from cakes category are much heavy then all the other products. Therefore, this column may help us distinguish between cakes category and all the others, so we'll keep it.

# 'serving_size_unit' column Research & Treatment

In [21]:
serving_size_unit = {'values': data['serving_size_unit'].drop_duplicates(),
                      'Frequency': [data['serving_size_unit'].value_counts()['g'], 
                                    data['serving_size_unit'].value_counts()['ml']]
                    }
serving_size_unit_df = pd.DataFrame(serving_size_unit)
serving_size_unit_df

Unnamed: 0,values,Frequency
0,g,31743
8554,ml,8


We notice that most of the snacks has the value 'g' and just 8 of them has the value 'ml'.
Therefore, "size_unit" column is not important and has no effect on the data, so it can be removed. 

In [22]:
def drop_size_unit_column(df):
    return df.drop(['serving_size_unit'],axis=1)

# Nutrients & Numerical columns Research & Treatment

In [23]:
numeric_features = ['serving_size', 'Calcium, Ca', 'Carbohydrate, by difference','Cholesterol', 'Energy', 'Fatty acids, total saturated',
                    'Fatty acids, total trans', 'Fiber, total dietary', 'Iron, Fe','Protein', 'Sodium, Na', 'Sugars, total including NLEA',
                    'Total lipid (fat)', 'Vitamin A, IU', 'Vitamin C, total ascorbic acid']

- Dealing with missing values in the above columns - we will try different kinds of methods later.

# Part 3 - Models

Notice that in all the next Cross Validation (CV) we will use simple XGBClassifier and we will fine-tuning it later.

[Edit-Explain the modles that we are going to use]


## Feature Engineering - CV

[Model explanation]

In [24]:
def final_foo(df):
    x1 = extract_brand(df)
    x2 = extract_description(x1)
    x3 = extract_ingredients(x2)
    x4 = extract_household(x3)
    return x4

#### Proccess our fe-data and test if the dicts_words we built are good:

In [31]:
X_fe_train = final_foo(X_fe_train)
X_fe_test = final_foo(X_fe_test)

In [32]:
Brand = FunctionTransformer(extract_brand)
Description2 = FunctionTransformer(extract_description)
Ingredients2 = FunctionTransformer(extract_ingredients)
Household2 = FunctionTransformer(extract_household)
Drop = FunctionTransformer(drop_size_unit_column)

In [33]:
numerical_transformer = StandardScaler()

categorical_features = ["brand", "description", "ingredients","household_serving_fulltext",'serving_size_unit']

numeric_features = ['serving_size', 'Calcium, Ca', 'Carbohydrate, by difference','Cholesterol', 'Energy', 'Fatty acids, total saturated',
                    'Fatty acids, total trans', 'Fiber, total dietary', 'Iron, Fe','Protein', 'Sodium, Na', 'Sugars, total including NLEA',
                    'Total lipid (fat)', 'Vitamin A, IU', 'Vitamin C, total ascorbic acid']

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ]
)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [34]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

In [35]:
scores = cross_val_score(pipeline, X_fe_train, y_fe_train.values.ravel(), cv=5, scoring='accuracy')
print("Cross-validation scores:", scores)

Cross-validation scores: [0.90569906 0.90733907 0.90648072 0.90812141 0.90689089]


Accuracy ~0.9 that grate! Now, Let's fit on all X_fe_train and check the test accuracy.

In [36]:
pipeline.fit(X_fe_train, y_fe_train.values.ravel())

In [37]:
test_score = pipeline.score(X_fe_test, y_fe_test.values.ravel())
print("Test score:", test_score)

Test score: 0.9124015748031497


## Fill Null values

We want to make maximum use of the data and not throw away rows that contain a little missing information. (Remmember, we throw away rows which 80% of it data is missing). Therefore, in order to overcome this issue will check a few methods like mean, median and KNN, then will fill in the missing value with the best value that will help us.

We built a pipleline that fill missing values in the numeric columns with different methods, after that it standrize these numeric columns and encode categorical columns.
The categorical columns doesnt have missing values, therefore there isnt a imputer for them.

In [38]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ]), categorical_features)
    ])

pipeline_nan = Pipeline([
    ("preprocessor", preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

param_grid = [
    {
        'preprocessor__num__imputer': [SimpleImputer(strategy='mean'), SimpleImputer(strategy='median')] + [KNNImputer(n_neighbors=k) for k in range(1, 6)]
    }
]

grid_search = GridSearchCV(pipeline_nan, param_grid, cv=5, n_jobs=1,verbose = 3,scoring = 'accuracy')

grid_search.fit(X_fe_train, y_fe_train.values.ravel())

In [None]:
best_imputer = grid_search.best_params_['preprocessor__num__imputer']
print("Best Imputer:", type(best_imputer).__name__)
print("Best Parameters:", best_imputer.get_params())

**Finally we got that KNN with 2_neigh method is the best way to fill in the missing value**

## SMOTE

We will fill our null values with the KNN(2) <br>
Since our data is imbalanced we will try apply SMOTE as we learn in class <br>
We will check different parameters using GridSearchCV

In [None]:
X_mt_train = final_foo(X_mt_train)
X_mt_test = final_foo(X_mt_test)

In [None]:
imputer = KNNImputer(n_neighbors=2)

X_mt_train[numeric_features] = imputer.fit_transform(X_mt_train[numeric_features])
X_mt_test[numeric_features] = imputer.transform(X_mt_test[numeric_features])

numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [25]:
# Lets see again that the data is imbalanced
y_mt_train['category'].value_counts()

5    1957
1    1941
4    1353
0     970
3     966
2     941
Name: category, dtype: int64

We will preproccess the data like before, but before the classifier in the pipeline we will add a SMOTE object with different kind of parameters,

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)
    ])

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', SMOTE()), 
    ('classifier', XGBClassifier(random_state=42))
])

param_grid = [
    {
        'resampler__sampling_strategy': ['auto', {5: 1957, 1: 1957, 4: 1957, 0: 1957, 3: 1957, 2: 1957}], # 1957 - the max count
        'resampler__k_neighbors': [1, 3, 5, 7]
    }
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=1, verbose=3)
grid_search.fit(X_mt_train, y_mt_train.values.ravel())

y_pred = grid_search.predict(X_mt_test)
print('Best Parameters:', grid_search.best_params_)
print('Accuracy:', accuracy_score(y_mt_test, y_pred))
print(classification_report(y_mt_test, y_pred))

## Model Tuning - CV

**Now will fine-tuning XGBC and 2 other popular classifiers: RF and GRADIENT_BOOST**
We will do the model-tuning with the same method as before:
- Building a pipeline with a specific model and we tune it using GridSearchCV

In [None]:
# classifiers = [
#     {
#         'classifier': XGBClassifier(random_state=42),
#         'params': {
#             'classifier__n_estimators': [50, 100, 200],
#             'classifier__learning_rate': [0.01, 0.1, 0.2],
#             'classifier__max_depth': [3, 5, 10],
#             'classifier__subsample': [0.5, 0.8, 1],
#             'classifier__colsample_bytree': [0.5, 0.8, 1]
#         }
#     },
#     {
#         'classifier': RandomForestClassifier(random_state=42),
#         'params': {
#             'classifier__n_estimators': [50, 100, 200],
#             'classifier__max_depth': [None, 10, 20, 30],
#             'classifier__min_samples_split': [2, 5, 10],
#             'classifier__min_samples_leaf': [1, 2, 4]
#         }
#     },
#     {
#         'classifier': HistGradientBoostingClassifier(random_state=42),
#         'params': {
#             'classifier__max_iter': [50, 100, 200],
#             'classifier__learning_rate': [0.01, 0.1, 0.2],
#             'classifier__max_depth': [None, 10, 20, 30],
#             'classifier__min_samples_leaf': [5, 10, 20]
#         }
#     }
# ]

# pipeline = ImbPipeline([
#     ('preprocessor', preprocessor),
#     ('resampler', SMOTE(k_neighbors=3, sampling_strategy='auto')),
#     ('classifier', None)
# ])


# for clf in classifiers:
#     c = clf['classifier']
#     pipeline.set_params(classifier=c)
#     grid_search = GridSearchCV(pipeline, clf['params'], cv=5, n_jobs=1, verbose=3,scoring = 'accuracy')
#     grid_search.fit(X_mt_train, y_mt_train)
    
#     print("Best parameters:", grid_search.best_params_)
#     print("Best score:", grid_search.best_score_)

#     score = grid_search.score(X_mt_test, y_mt_test.values.ravel())
#     print("Test set score:", score)

After LONG night of over 2000 fittings we got that the best parameters are:  
**xgb**:  
Best parameters: {'classifier__colsample_bytree': 0.5, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 10, 'classifier__n_estimators': 200, 'classifier__subsample': 1}  
Best score: 0.9120326237108525  
Test set score: 0.9232283464566929  

**RF**:  
Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}  
Best score: 0.9066192828082127  
Test set score: 0.9148622047244095  

**GB**:  
Best parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': None, 'classifier__max_iter': 200, 'classifier__min_samples_leaf': 20}  
Best score: 0.9130167092440156  
Test set score: 0.9178149606299213  

# Let's summarize what we got
### filling nan values with KNN(n_neigh = 2)
### resampling with SMOTE(k_neighbors=3, sampling_strategy='auto')
#### XGB - {'classifier__colsample_bytree': 0.5, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 10, 'classifier__n_estimators': 200, 'classifier__subsample': 1}
#### RF - {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
#### GB - {'classifier__learning_rate': 0.1, 'classifier__max_depth': None, 'classifier__max_iter': 200, 'classifier__min_samples_leaf': 20}

# Now for the final results

In [None]:
X_train = final_foo(X_train)
X_test = final_foo(X_test)

numerical_transformer = StandardScaler()
imputer = KNNImputer(n_neighbors=2)
X_train[numeric_features] = imputer.fit_transform(X_train[numeric_features])
X_test[numeric_features] = imputer.transform(X_test[numeric_features])

In [None]:
classifiers = [
    {
        'classifier': XGBClassifier(colsample_bytree=0.5,learning_rate=0.2,max_depth=10,n_estimators=200,subsample=1)
    },
    {
        'classifier': RandomForestClassifier(max_depth=None,min_samples_leaf=1,min_samples_split=2,n_estimators=200)
    },
    {
        'classifier': HistGradientBoostingClassifier(learning_rate=0.1,max_depth=None,max_iter=200,min_samples_leaf=20)
    }
]

In [None]:
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', SMOTE(k_neighbors=3, sampling_strategy='auto')),  # Resampler included directly in the pipeline
    ('classifier', None)
])


for clf in classifiers:
    c = clf['classifier']
    pipeline.set_params(classifier=c)
    print(f"fitting {c}...")
    pipeline.fit(X_train, y_train.values.ravel())

    test_score = pipeline.score(X_test, y_test.values.ravel())
    print(f"Test score for {c} on df1:", test_score)

### Nice results!
### Remember df2? Lets try our final pipeline on it and decide

In [39]:
X_p = df2.loc[:,df2.columns != 'category']
y_p = df2.loc[:,['category']]

y_p['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
 'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                        [0, 1, 2, 3, 4, 5], inplace=True)

X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size=0.2, random_state=42, stratify=y_p)

In [40]:
X_p_train = final_foo(X_p_train)
X_p_test = final_foo(X_p_test)

In [None]:
numerical_transformer = StandardScaler()
imputer = KNNImputer(n_neighbors=2)
X_p_train[numeric_features] = imputer.fit_transform(X_p_train[numeric_features])
X_p_test[numeric_features] = imputer.transform(X_p_test[numeric_features])

In [None]:
for clf in classifiers:
    c = clf['classifier']
    pipeline.set_params(classifier=c)
    print(f"fitting {c}...")
    pipeline.fit(X_p_train, y_p_train.values.ravel())

    test_score = pipeline.score(X_p_test, y_p_test.values.ravel())
    print(f"Test score for {c} on df2:", test_score)

# WHOOOO WE R DONE!
## Our 3 final models are:
### ..... on df1
### ..... on df1
### .....on df2

#### We know the following code is not efficient and repeats some steps BUT we rather be safe than sorry

In [None]:
X = final_foo(X)
X_p = final_foo(X_p)
data_test = final_foo(data_test)
data_with_probs_test = final_foo(data_with_probs_test)

### Model 1

In [None]:
X1 = X.copy()
data_test1 = data_test.copy()
y1 = y.copy()

numerical_transformer = StandardScaler()
imputer = KNNImputer(n_neighbors=2)

X1[numeric_features] = imputer.fit_transform(X1[numeric_features])
data_test1[numeric_features] = imputer.transform(data_test1[numeric_features])

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', SMOTE(k_neighbors=3, sampling_strategy='auto')),
    ('classifier', RandomForestClassifier(max_depth=None,min_samples_leaf=1,min_samples_split=2,n_estimators=200))
])

pipeline.fit(X1, y1.values.ravel())

data_test1['pred_cat'] = pipeline.predict(data_test1)

data_test1['pred_cat'].replace([0, 1, 2, 3, 4, 5],
                              ['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate','cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                              inplace=True)

data_test1.loc[:,['idx','pred_cat']]#.to_csv('model01.csv',index=False)

### Model 2

In [None]:
X2 = X_p.copy()
data_test2 = data_test.copy()
y2 = y_p.copy()

numerical_transformer = StandardScaler()
imputer = KNNImputer(n_neighbors=2)

X2[numeric_features] = imputer.fit_transform(X2[numeric_features])
data_test2[numeric_features] = imputer.transform(data_test2[numeric_features])

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', SMOTE(k_neighbors=3, sampling_strategy='auto')),
    ('classifier', RandomForestClassifier(max_depth=None,min_samples_leaf=1,min_samples_split=2,n_estimators=200))
])

pipeline.fit(X2, y2.values.ravel())

data_test2['pred_cat'] = pipeline.predict(data_test2)

data_test2['pred_cat'].replace([0, 1, 2, 3, 4, 5],
                              ['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate','cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                              inplace=True)

data_test2.loc[:,['idx','pred_cat']]#.to_csv('model02.csv',index=False)

### Model 3

In [None]:
X3 = X.copy()
data_test3 = data_test3.copy()
y3 = y.copy()

numerical_transformer = StandardScaler()
imputer = KNNImputer(n_neighbors=2)

X3[numeric_features] = imputer.fit_transform(X3[numeric_features])
data_test3[numeric_features] = imputer.transform(data_test3[numeric_features])

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', SMOTE(k_neighbors=3, sampling_strategy='auto')),
    ('classifier', RandomForestClassifier(max_depth=None,min_samples_leaf=1,min_samples_split=2,n_estimators=200))
])

pipeline.fit(X3, y3.values.ravel())

data_test3['pred_cat'] = pipeline.predict(data_test3)

data_test3['pred_cat'].replace([0, 1, 2, 3, 4, 5],
                              ['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate','cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                              inplace=True)

data_test3.loc[:,['idx','pred_cat']]#.to_csv('model03.csv',index=False)