In [3]:
import argparse
from collections import defaultdict, Counter
import cv2
from fuzzywuzzy import fuzz
from itertools import combinations # not mandatory
import imutils
from imutils import build_montages, paths
import os
import matplotlib.pyplot as plt
import networkx as nx
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
import seaborn as sns

from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder
import string

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost
from xgboost import cv, XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import os
import shutil
from sklearn.model_selection import cross_val_score



## Cleaning and Arranging the data


### Define useful functions for cleaning the data

In [4]:
def remove_parentheses(string):
    pattern = r'\([^()]*\)'  # Matches "(...)" pattern
    while re.search(pattern, string):
        string = re.sub(pattern, '', string)
    return string.strip()

def format_ingredients(string):
    string = string.replace('.', '')  # Remove periods
    string = re.sub(r'\s*,\s*', ',', string)  # Remove spaces after commas
    ingredients_list = string.split(',')  # Split by comma
    formatted_ingredients = [ingredient.strip() for ingredient in ingredients_list]  # Remove leading/trailing spaces for each ingredient
    return ', '.join(formatted_ingredients)  # Join formatted ingredients with commas

def clean_text_values(df):
    text_columns = df.select_dtypes(exclude=[np.number]).columns
    for column in text_columns:
        if column == 'category':
            continue
        df[column].fillna('NA', inplace=True)
        df[column] = df[column].map(str.lower)
        if column == 'ingredients':
            df[column] = df[column].apply(remove_parentheses)
            df[column] = df[column].apply(format_ingredients)
        if column == 'household_serving_fulltext':
            df[column] = df[column].map(lambda x: re.sub('[^a-z]+', '', x))
    return df

### Read Data

In [5]:
food_train = pd.read_csv('data/food_train.csv')
food_test = pd.read_csv('data/food_test.csv')
food_nutrients = pd.read_csv('data/food_nutrients.csv')
nutrients_names = pd.read_csv('data/nutrients.csv')

In [6]:
clean_text_values(food_train)
nutrients = pd.merge(food_nutrients, nutrients_names, how='left',on='nutrient_id')
pivoted_nutrients = pd.pivot_table(nutrients, values='amount', index='idx', columns='name')
data = pd.merge(food_train, pivoted_nutrients, how='left', on='idx')
# data.head()

### Remove columns with > 0.8 nulls

In [49]:
cols_to_remove = data.columns[data.isnull().mean() > 0.8]
data = data.drop(columns=cols_to_remove)

df = data.copy()

### Splite the data

we'll split our df to train and test sets. Then, we'll split the train set into 2 data sets. One for feature engineering and the other for model tuning.

In [6]:
X = df.loc[:,df.columns != 'category']
y = df.loc[:,['category']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_fe, X_mt, y_fe, y_mt = train_test_split(X_train, y_train, test_size=0.5, random_state=42, stratify=y_train)
X_fe_train, X_fe_test, y_fe_train, y_fe_test = train_test_split(X_fe, y_fe, test_size=0.2, random_state=42)
X_mt_train, X_mt_test, y_mt_train, y_mt_test = train_test_split(X_mt, y_mt, test_size=0.2, random_state=42)

X_y_train = pd.merge(X_train, y_train, how="left",left_index=True, right_index=True)
X_y_train['idx'] = X_y_train['idx'].map(lambda x: str(x))
X_y_test = pd.merge(X_test, y_test, how="left",left_index=True, right_index=True)
X_y_test['idx'] = X_y_test['idx'].map(lambda x: str(x))

#### #### concat for eda
X_fe_w_cat = pd.merge(X_fe_train, y_fe_train, how="left",left_index=True, right_index=True)

### Enums

In [50]:
CANDY = 'candy'
COOKIES = 'cookies_biscuits'
CAKES = 'cakes_cupcakes_snack_cakes'
CHIPS_PRETZELS = 'chips_pretzels_snacks'
CHOCOLATE = 'chocolate'
POPCORN_PEANUTS = 'popcorn_peanuts_seeds_related_snacks'

CATEGORIES = [CANDY, COOKIES, CAKES, CHIPS_PRETZELS, CHOCOLATE, POPCORN_PEANUTS]

### Define useful functions

In [51]:
def better_name(df, col_name):
    df[col_name] = df[col_name].map(lambda x: str(x).translate(str.maketrans('', '', string.punctuation)))
    return(df)

def data_by_category(df, category):
    return df[df['category'] == category]

def get_most_popular_words(column):
    text = ', '.join(column.values)
    tokens = [word.strip() for word in text.split(',')]
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    word_counts = Counter(tokens)
    return word_counts

def select_top_words(row, dict_words, column):
    desc = row[column]
    for category, words in dict_words.items():
        for word in dict_words[category]:
            if word in desc:
                name = f"{column}_{category}_{word}"
                return name
    return 'None'

# Part 1 - Feature Engineering

### Top 15 words in each column in the data per category
We have found the most 15 common words in each column: 'brand', 'description', 'ingredients' and 'household' per category. That will help us understand which words are represent and imply each category.

You can see all the data analysis we performed in the "Final_Project-Part_2-Words_Selection" notebook.  

# 'brand' column Research & Treatment

We will look on top 15 common brands over each category. After that we will change the brand column and make sure that just the selected top 15 (from each category) brands will stay the same and all the other will sign as unbranded.

In [52]:
def top_x_brands_by_category(df, category, x):
    df_by_category = df[df['category'] == category]
    return df_by_category.groupby('brand').size().sort_values(ascending=False).head(x)

def select_only_the_top_brand(x):
    df = X_y_train
    better_name(df, 'brand')
    top_brand_dict={'candy': top_x_brands_by_category(df, CANDY, x).index.tolist(),
                   'cookies': top_x_brands_by_category(df, COOKIES, x).index.tolist(),
                   'cakes': top_x_brands_by_category(df, CAKES, x).index.tolist(),
                   'chips_pretzels': top_x_brands_by_category(df, CHIPS_PRETZELS, x).index.tolist(),
                   'chocolate': top_x_brands_by_category(df, CHOCOLATE, x).index.tolist(),
                   'popcorn_peanuts': top_x_brands_by_category(df, POPCORN_PEANUTS, x).index.tolist()}
    
    return top_brand_dict

def get_top_brands(df, x):
    top_brand_dict = select_only_the_top_brand(x)
    df['brand'].fillna('unbranded',inplace=True)
    brands_list = []
    for key, value in top_brand_dict.items():
        if 'not a branded item' in value:
            value.remove('not a branded item')
        brands_list.extend(value)  
    df['brand'] = df['brand'].map(lambda x: x if x in brands_list else 'unbranded')
    return df

In [53]:
# for category in CATEGORIES:
#    common_words_per_category_plot(category, 'brand', 15)

In [54]:
# most_common_words_per_column('brand', 10)

In [55]:
def extract_brand(df):
    brand_words = {
    "candy": ["ferrara candy company", "frankford candy llc", "ross acquisition inc", "mars chocolate north america llc",
              "walgreens co", "russell stover candies inc", "just born inc", "sunmark", "jelly belly candy company",
              "holiday candy corp inc", "maud borup inc", "tops markets llc", "weis markets inc", "topco associates inc",
              "tootsie roll industries inc", "wm wrigley jr company", "reeses", "ahold usa inc"],
    "cookies": ["nabisco biscuit company", "target stores", "keebler company", "the kroger co", "lofthouse foods",
                "topco associates inc", "meijer inc", "ahold usa inc", "hyvee inc", "safeway inc", "walgreens co",
                "mckee foods corporation", "wegmans food markets inc", "kingston marketing co", "too good gourmet inc",
                "d f stauffer biscuit co inc", "abimar foods inc",],
    "cakes": ["mckee foods corporation", "bimbo bakeries usa inc", "tasty baking company", "the kroger co", 
              "hostess brands llc", "sweet ps bake shop", "meijer inc", "dawn food products inc", "flowers foods inc",
              "rich products corporation", "twobite", "schnuck markets inc", "fresh  easy", "dierbergs markets inc",
              "maplehurst bakeries llc", "rocky mountain pies", "the fathers table llc"],
    "chips_pretzels": ["utz quality foods inc", "the hain celestial group inc", "topco associates inc", "herr foods inc",
                       "meijer inc", "diamond foods inc", "hyvee inc", "target stores", "walmart stores inc", "snyderslance inc",
                       "inventure foods inc", "giant eagle inc", "good health natural products inc", "the kroger co",
                       "whole foods market inc", "old dutch foods inc", "jays foods inc", "cape cod potato chips inc",
                       "wise foods inc", "ahold usa inc"],
    "chocolate": ["lindt  sprungli schweiz ag", "russell stover candies inc", "mars chocolate north america llc",
                  "ghirardelli chocolate company", "godiva chocolatier inc", "ghirardelli", "fannie may confections inc",
                  "moonstruck chocolate co", "ross acquisition inc", "walgreens co", "rm palmer co", "frankford candy llc",
                  "theo chocolate inc", "hammonds candies since 1920 llc", "green  blacks", "alter eco americas inc",
                  "demets candy company", "whitmans candies inc", "european chocolate ltd"],
    "popcorn_peanuts": ["meijer inc", "target stores", "nabisco food company", "cvs pharmacy inc", "walgreens co",
                        "hyvee inc", "topco associates inc", "diamond foods inc", "ahold usa inc", "john b sanfilippo  son inc",
                        "american importing co inc", "giant eagle inc", "the kroger co", "supervalu inc", "tops markets llc",
                        "kar nut products company", "safeway inc", "whole foods market inc"],
}

    num_rows = len(df)
    # Convert the dictionary values to sets to ensure uniqueness
    data_dict = {key: set(values) for key, values in brand_words.items()}
    # Create a new dictionary with the column names as keys and zeros as values
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in brand_words.items() for word in words}
    # Create a DataFrame from the dictionary
    new_df = pd.DataFrame(columns_dict)
    # Concatenate the existing DataFrame with the new DataFrame
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in brand_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['brand'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['brand'] = result_df.apply(lambda row: select_top_words(row, brand_words, 'brand'), axis=1)
    
    return result_df

extract_brand(X_fe_w_cat).head()

Unnamed: 0,idx,brand,description,ingredients,serving_size,serving_size_unit,household_serving_fulltext,"Calcium, Ca","Carbohydrate, by difference",Cholesterol,...,popcorn_peanuts_ahold usa inc,popcorn_peanuts_john b sanfilippo son inc,popcorn_peanuts_american importing co inc,popcorn_peanuts_giant eagle inc,popcorn_peanuts_the kroger co,popcorn_peanuts_supervalu inc,popcorn_peanuts_tops markets llc,popcorn_peanuts_kar nut products company,popcorn_peanuts_safeway inc,popcorn_peanuts_whole foods market inc
241,266,brand_candy_russell stover candies inc,"russell stover, nut cluster assortment","consist of milk chocolate1, sugar, whole milk,...",40.0,g,pieces,150.0,45.0,12.0,...,0,0,0,0,0,0,0,0,0,0
29772,33073,,candy corn,"sugar, corn syrup, confectioners glaze, honey,...",30.0,g,grm,0.0,96.67,0.0,...,0,0,0,0,0,0,0,0,0,0
516,580,,amande milk chocolate,"milk chocolate, almonds, salt and natural color",30.0,g,grm,200.0,50.0,23.0,...,0,0,0,0,0,0,0,0,0,0
20659,22932,,pumpkin face taffy,"corn syrup, sugar coconut oil, egg whites, sal...",42.0,g,pieces,0.0,90.48,0.0,...,0,0,0,0,0,0,0,0,0,0
5350,5968,,"megatoys, happy easter basket, salt water taff...","corn syrup, sugar, palm oil, citric acid, mono...",43.0,g,pieces,,86.05,0.0,...,0,0,0,0,0,0,0,0,0,0


# 'description' column Research & Treatment

After analyzing the results baised on the feature engineering train data set, we received the dict described above which includes indication words for every category.

We will add a column for every selected word. it will contain 1 if the word is in appear in the snack description, and 0 else.

In [56]:
#for category in CATEGORIES:
#    common_words_per_category_plot(category, 'description', 15)

In [58]:
#most_common_words_per_column('description', 15)

In [57]:
def extract_description(df):
    desc_words = {CAKES:['tastykake', 'cupcakes', 'cheezecake', 'bakery fresh goodness', 'apple pie', 'pie', 'pecan pie',
                                   'pumpkin pie', 'cake', 'cakes', 'cupcake', 'coffeecake', 'brownie', 'brownies', 'slice', 'sliced', 
                                    'torte', 'pies', 'donut', 'donuts', 'bakery'],
    CANDY:['candy', 'candies', 'gummy', 'gummi', 'gummies', 'twist', 'stick', 'marshmallow', 'marshmallows',
                       'jelly', 'snacks', "sweet's",  "brach's", 'cherry', 'strawberry', 'orange', 'watermelon', 
                       'peppermint', 'grape'],
    CHIPS_PRETZELS:['potato chips', 'tortilla chips', 'kettle cooked potato chips', 'sea salt', 'kettle chips', 
                               "snyder's of hanover", 'sour cream & onion', 'wavy potato chips', "herr's", 'chips', 'chip', 
                               'tortilla', 'crisps', 'crisp', 'potato', 'pretzel', 'pretzels', 'fries', 'corn'],
    CHOCOLATE:['dark chocolate', 'lindt', 'ghirardelli chocolate', 'russell stover', 'godiva', 'truffles', 
                          'dark chocolate bar', 'cocoa', 'praline', 'toffee', 'belgian', 'dark', 'truffle', 'chocolates'],
    COOKIES:['cookie', 'cookies', 'chocolate chip', 'chocolate chip cookies', 'sandwich cookies', 
                       'shortbread cookies', 'frosted sugar cookies', 'sugar cookies''crackers', 'frosted', 'wafer', 
                       'cracker', 'biscuit', 'macaroon', 'waffle'],
    POPCORN_PEANUTS:['popcorn', 'almond', 'almonds', 'trail mix', 'peanuts', 'mixed nuts', 
                                'dry roasted peanuts','cashews', 'kernel', 'shell', 'pecan']}
    
    num_rows = len(df)
    # Convert the dictionary values to sets to ensure uniqueness
    data_dict = {key: set(values) for key, values in desc_words.items()}
    # Create a new dictionary with the column names as keys and zeros as values
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in desc_words.items() for word in words}
    # Create a DataFrame from the dictionary
    new_df = pd.DataFrame(columns_dict)
    # Concatenate the existing DataFrame with the new DataFrame
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in desc_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['description'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['description'] = result_df.apply(lambda row: select_top_words(row, desc_words, 'description'), axis=1)
    
    return result_df

extract_description(X_fe_w_cat).head()

Unnamed: 0,idx,brand,description,ingredients,serving_size,serving_size_unit,household_serving_fulltext,"Calcium, Ca","Carbohydrate, by difference",Cholesterol,...,popcorn_peanuts_seeds_related_snacks_almond,popcorn_peanuts_seeds_related_snacks_almonds,popcorn_peanuts_seeds_related_snacks_trail mix,popcorn_peanuts_seeds_related_snacks_peanuts,popcorn_peanuts_seeds_related_snacks_mixed nuts,popcorn_peanuts_seeds_related_snacks_dry roasted peanuts,popcorn_peanuts_seeds_related_snacks_cashews,popcorn_peanuts_seeds_related_snacks_kernel,popcorn_peanuts_seeds_related_snacks_shell,popcorn_peanuts_seeds_related_snacks_pecan
241,266,russell stover candies inc.,description_chocolate_russell stover,"consist of milk chocolate1, sugar, whole milk,...",40.0,g,pieces,150.0,45.0,12.0,...,0,0,0,0,0,0,0,0,0,0
29772,33073,just goodies,description_candy_candy,"sugar, corn syrup, confectioners glaze, honey,...",30.0,g,grm,0.0,96.67,0.0,...,0,0,0,0,0,0,0,0,0,0
516,580,sulpice chocolat,,"milk chocolate, almonds, salt and natural color",30.0,g,grm,200.0,50.0,23.0,...,0,0,0,0,0,0,0,0,0,0
20659,22932,wythe will distributing company,,"corn syrup, sugar coconut oil, egg whites, sal...",42.0,g,pieces,0.0,90.48,0.0,...,0,0,0,0,0,0,0,0,0,0
5350,5968,"p.c. woo, inc.",description_candy_candy,"corn syrup, sugar, palm oil, citric acid, mono...",43.0,g,pieces,,86.05,0.0,...,0,0,0,0,0,0,0,0,0,0


# 'ingredients' column Research & Treatment

After analyzing the results baised on the feature engineering train data set, we received the dict described above which includes indication words for every category.

We will add a column for every selected word. it will contain 1 if the word is in appear in the snack ingredients, and 0 else.

In [59]:
#for category in CATEGORIES:
#    common_words_per_category_plot(category, 'ingredients', 15)

In [60]:
#most_common_words_per_column('ingredients', 15)

In [61]:
def extract_ingredients(df):
    ingre_words = {CAKES:['leavening', 'eggs'],
                   CANDY:['gelatin', 'carnauba wax', 'red 40', 'blue 1'],
                   CHIPS_PRETZELS:['potatoes', 'onion powder', 'garlic powder', 'maltodextrin', 'yeast extract'],
                   CHOCOLATE:['cocoa butter', 'milk chocolate', 'chocolate', 'milk', 'dark chocolate', 'butter'],
                   COOKIES:['baking soda', 'enriched flour', 'leavening', 'eggs'],
                   POPCORN_PEANUTS:['almonds', 'peanuts', 'cashews', 'popcorn', 'pecans']}
    
    num_rows = len(df)
    # Convert the dictionary values to sets to ensure uniqueness
    data_dict = {key: set(values) for key, values in ingre_words.items()}
    # Create a new dictionary with the column names as keys and zeros as values
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in ingre_words.items() for word in words}
    # Create a DataFrame from the dictionary
    new_df = pd.DataFrame(columns_dict)
    # Concatenate the existing DataFrame with the new DataFrame
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in ingre_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['ingredients'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['ingredients'] = result_df.apply(lambda row: select_top_words(row, ingre_words, 'ingredients'), axis=1)
            
    return result_df

extract_ingredients(X_fe_w_cat).head()

Unnamed: 0,idx,brand,description,ingredients,serving_size,serving_size_unit,household_serving_fulltext,"Calcium, Ca","Carbohydrate, by difference",Cholesterol,...,chocolate_butter,cookies_biscuits_baking soda,cookies_biscuits_enriched flour,cookies_biscuits_leavening,cookies_biscuits_eggs,popcorn_peanuts_seeds_related_snacks_almonds,popcorn_peanuts_seeds_related_snacks_peanuts,popcorn_peanuts_seeds_related_snacks_cashews,popcorn_peanuts_seeds_related_snacks_popcorn,popcorn_peanuts_seeds_related_snacks_pecans
241,266,russell stover candies inc.,"russell stover, nut cluster assortment",ingredients_chocolate_cocoa butter,40.0,g,pieces,150.0,45.0,12.0,...,1,0,0,0,0,1,1,1,0,1
29772,33073,just goodies,candy corn,ingredients_candy_carnauba wax,30.0,g,grm,0.0,96.67,0.0,...,0,0,0,0,0,0,0,0,0,0
516,580,sulpice chocolat,amande milk chocolate,ingredients_chocolate_milk chocolate,30.0,g,grm,200.0,50.0,23.0,...,0,0,0,0,0,1,0,0,0,0
20659,22932,wythe will distributing company,pumpkin face taffy,,42.0,g,pieces,0.0,90.48,0.0,...,0,0,0,0,0,0,0,0,0,0
5350,5968,"p.c. woo, inc.","megatoys, happy easter basket, salt water taff...",ingredients_candy_red 40,43.0,g,pieces,,86.05,0.0,...,0,0,0,0,0,0,0,0,0,0


# 'household_serving_fulltext' column Research & Treatment

After analyzing the results baised on the feature engineering train data set, we received the dict described above which includes indication words for every category.

We will add a column for every selected word. it will contain 1 if the word is in appear in the snack household' column, and 0 else.

In [62]:
#for category in CATEGORIES:
#    common_words_per_category_plot(category, 'household_serving_fulltext', 15)

In [63]:
#most_common_words_per_column('household_serving_fulltext', 15)

In [64]:
def extract_household(df):
    household_words = {CAKES:['cake', 'cakes', 'cupcakes', 'cupcake','brownie', 'pie', 'donut', 'muffin', 'tart', 
                              'torte', 'doughnut','slice', 'pastry', 'bun', 'loaf'],
                       CANDY:['candies', 'candy','gummies', 'gummy', 'marshmallow', 'pop', 'twist', 'stick','bear'],
                       CHIPS_PRETZELS:['chip', 'chips', 'fries', 'crisp', 'chipsabout', 'pretzelsabout'],
                       CHOCOLATE:['squares', 'square', 'balls', 'ball', 'truffle', 'praline', 'pralines', 'block', 'tablet', 'bar'],
                       COOKIES:['cookies', 'cookie', 'wafers', 'wafer', 'crackers', 'cracker', 'biscuit', 'macaroon', 'waffle'],
                       POPCORN_PEANUTS:['tbsp', 'kernel', 'popcorn', 'almond', 'shell', 'pecan']}
    
    num_rows = len(df)
    # Convert the dictionary values to sets to ensure uniqueness
    data_dict = {key: set(values) for key, values in household_words.items()}
    # Create a new dictionary with the column names as keys and zeros as values
    columns_dict = {f"{key}_{word}": np.zeros(num_rows) for key, words in household_words.items() for word in words}
    # Create a DataFrame from the dictionary
    new_df = pd.DataFrame(columns_dict)
    # Concatenate the existing DataFrame with the new DataFrame
    result_df = pd.concat([df.reset_index(drop=True), new_df], axis=1)
    result_df.index = df.index

    for key, words in household_words.items():
        for word in words:
            result_df[f"{key}_{word}"] = result_df['household_serving_fulltext'].apply(lambda x: 1 if isinstance(x, str) and word in x else 0)
            
    result_df['household_serving_fulltext'] = result_df.apply(lambda row: select_top_words(row, household_words, 'household_serving_fulltext'), axis=1)
            
    return result_df

extract_household(X_fe_w_cat).head()

Unnamed: 0,idx,brand,description,ingredients,serving_size,serving_size_unit,household_serving_fulltext,"Calcium, Ca","Carbohydrate, by difference",Cholesterol,...,cookies_biscuits_cracker,cookies_biscuits_biscuit,cookies_biscuits_macaroon,cookies_biscuits_waffle,popcorn_peanuts_seeds_related_snacks_tbsp,popcorn_peanuts_seeds_related_snacks_kernel,popcorn_peanuts_seeds_related_snacks_popcorn,popcorn_peanuts_seeds_related_snacks_almond,popcorn_peanuts_seeds_related_snacks_shell,popcorn_peanuts_seeds_related_snacks_pecan
241,266,russell stover candies inc.,"russell stover, nut cluster assortment","consist of milk chocolate1, sugar, whole milk,...",40.0,g,household_serving_fulltext_cakes_cupcakes_snac...,150.0,45.0,12.0,...,0,0,0,0,0,0,0,0,0,0
29772,33073,just goodies,candy corn,"sugar, corn syrup, confectioners glaze, honey,...",30.0,g,,0.0,96.67,0.0,...,0,0,0,0,0,0,0,0,0,0
516,580,sulpice chocolat,amande milk chocolate,"milk chocolate, almonds, salt and natural color",30.0,g,,200.0,50.0,23.0,...,0,0,0,0,0,0,0,0,0,0
20659,22932,wythe will distributing company,pumpkin face taffy,"corn syrup, sugar coconut oil, egg whites, sal...",42.0,g,household_serving_fulltext_cakes_cupcakes_snac...,0.0,90.48,0.0,...,0,0,0,0,0,0,0,0,0,0
5350,5968,"p.c. woo, inc.","megatoys, happy easter basket, salt water taff...","corn syrup, sugar, palm oil, citric acid, mono...",43.0,g,household_serving_fulltext_cakes_cupcakes_snac...,,86.05,0.0,...,0,0,0,0,0,0,0,0,0,0


# 'serving_size' column Research & Treatment

In [65]:
def data_by_category(df, category):
    return df[df['category'] == category]

def find_mean_and_median(category):
    df = data_by_category(X_fe_w_cat, category)
    return df['serving_size'].mean(), df['serving_size'].median()

mean_median_df = {'Category': [CANDY, COOKIES, CAKES, CHIPS_PRETZELS, CHOCOLATE, POPCORN_PEANUTS],
              'Average Size': [find_mean_and_median(CANDY)[0],find_mean_and_median(COOKIES)[0], find_mean_and_median(CAKES)[0],
                               find_mean_and_median(CHIPS_PRETZELS)[0], find_mean_and_median(CHOCOLATE)[0],find_mean_and_median(POPCORN_PEANUTS)[0]],
              'Median Size': [find_mean_and_median(CANDY)[1],find_mean_and_median(COOKIES)[1], find_mean_and_median(CAKES)[1],
                               find_mean_and_median(CHIPS_PRETZELS)[1], find_mean_and_median(CHOCOLATE)[1],find_mean_and_median(POPCORN_PEANUTS)[1]]}
size_df = pd.DataFrame(mean_median_df)

size_df

Unnamed: 0,Category,Average Size,Median Size
0,candy,32.02588,35.0
1,cookies_biscuits,33.071227,30.0
2,cakes_cupcakes_snack_cakes,74.718705,71.0
3,chips_pretzels_snacks,29.253617,28.0
4,chocolate,38.55694,40.0
5,popcorn_peanuts_seeds_related_snacks,31.747849,30.0


It seems that product from cakes category are much heavy then all the other products. Therefore, this column may help us distinguish between cakes category and all the others, so we'll keep it.

# 'serving_size_unit' column Research & Treatment

In [28]:
serving_size_unit = {'values': data['serving_size_unit'].drop_duplicates(),
                      'Frequency': [data['serving_size_unit'].value_counts()['g'], 
                                    data['serving_size_unit'].value_counts()['ml']]
                    }
serving_size_unit_df = pd.DataFrame(serving_size_unit)
serving_size_unit_df

Unnamed: 0,values,Frequency
0,g,31743
8554,ml,8


We notice that most of the snacks has the value 'g' and just 8 of them has the value 'ml'.
Therefore, "size_unit" column is not important and has no effect on the data, so it can be removed. 

In [66]:
def drop_size_unit_column(df):
    return df.drop(['serving_size_unit'],axis=1)

drop_size_unit_column(X_fe_w_cat).head()

Unnamed: 0,idx,brand,description,ingredients,serving_size,household_serving_fulltext,"Calcium, Ca","Carbohydrate, by difference",Cholesterol,Energy,...,"Fatty acids, total trans","Fiber, total dietary","Iron, Fe",Protein,"Sodium, Na","Sugars, total including NLEA",Total lipid (fat),"Vitamin A, IU","Vitamin C, total ascorbic acid",category
241,266,russell stover candies inc.,"russell stover, nut cluster assortment","consist of milk chocolate1, sugar, whole milk,...",40.0,pieces,150.0,45.0,12.0,575.0,...,0.0,5.0,2.7,12.5,50.0,35.0,40.0,250.0,0.0,chocolate
29772,33073,just goodies,candy corn,"sugar, corn syrup, confectioners glaze, honey,...",30.0,grm,0.0,96.67,0.0,367.0,...,0.0,0.0,0.0,6.67,183.0,76.67,0.0,0.0,0.0,candy
516,580,sulpice chocolat,amande milk chocolate,"milk chocolate, almonds, salt and natural color",30.0,grm,200.0,50.0,23.0,560.0,...,0.0,3.3,1.2,10.0,433.0,46.67,40.0,333.0,0.0,chocolate
20659,22932,wythe will distributing company,pumpkin face taffy,"corn syrup, sugar coconut oil, egg whites, sal...",42.0,pieces,0.0,90.48,0.0,381.0,...,0.0,0.0,0.0,0.0,131.0,54.76,4.76,0.0,0.0,candy
5350,5968,"p.c. woo, inc.","megatoys, happy easter basket, salt water taff...","corn syrup, sugar, palm oil, citric acid, mono...",43.0,pieces,,86.05,0.0,395.0,...,0.0,0.0,,0.0,81.0,46.51,5.81,,,candy


# Nutrients columns Research & Treatment

# Part 2 - Image handling

We will build a CNN, then calculating the probabilities vector and adding it to the tabular data [Edit]

In [67]:
# Set the paths
train_base_directory = 'images/train/'  # Replace with the actual train path
test_base_directory = 'images/train/'  # Replace with the actual test path

train_output_directory = 'images/train_categorized/'  # Replace with your desired train output path
test_output_directory = 'images/test_categorized/'  # Replace with your desired test output path

# Function to create folders for each category
def create_category_folders(output_directory, unique_labels):
    for label in unique_labels:
        label_path = os.path.join(output_directory, label)
        if os.path.exists(label_path):
            shutil.rmtree(label_path)  # Delete existing folder
        os.makedirs(label_path)

# Load the numpy arrays of training and testing image paths
train_image_paths = X_y_train['path'].unique()
test_image_paths = X_y_test['path'].unique()

# Extract category labels from paths
def extract_category_label(image_path):
    category = os.path.basename(os.path.dirname(image_path))
    return category

# Create folders for each category in the train output directory
unique_train_labels = np.unique([extract_category_label(path) for path in train_image_paths])
create_category_folders(train_output_directory, unique_train_labels)

# Copy train images to categorized folders
for image_path in train_image_paths:
    category = extract_category_label(image_path)
    output_folder = os.path.join(train_output_directory, category)
    shutil.copy(image_path, output_folder)

print("Train images copied to categorized folders.")

# Create folders for each category in the test output directory
unique_test_labels = np.unique([extract_category_label(path) for path in test_image_paths])
create_category_folders(test_output_directory, unique_test_labels)

# Copy test images to categorized folders
for image_path in test_image_paths:
    category = extract_category_label(image_path)
    output_folder = os.path.join(test_output_directory, category)
    shutil.copy(image_path, output_folder)

print("Test images copied to categorized folders.")

KeyError: 'path'

In [None]:
# Set the paths and image size
train_data_dir = 'images/train'
image_dir = 'images/train'
image_size = (128, 128)

# Data augmentation for training data
train_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2  # Split data into training and validation
)

test_datagen = ImageDataGenerator(rescale=1.0/255.0)


# Load and preprocess training data
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=32,
    class_mode='categorical',
    subset='training'  # Use the training subset of data
)


# Load and preprocess validation data
validation_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=32,
    class_mode='categorical',
    subset='validation'  # Use the validation subset of data
)

# Build the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(image_size[0], image_size[1], 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(6, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
num_epochs = 10
model.fit(train_generator, epochs=num_epochs, validation_data=validation_generator)

# Evaluate the model on test data
test_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=32,
    class_mode='categorical',
    subset='validation'  # Use the validation subset of data for testing
)

eval_result = model.evaluate(test_generator)
print("Test Loss:", eval_result[0])
print("Test Accuracy:", eval_result[1])


# Save the model
model.save('cnn_model.h5')

We wrote a regular .py script that will export the probabilitis vector to .csv file [Edit]  
Lets read it and add it to the tabular data

In [68]:
photos_probs = pd.read_csv('photos_probs.csv')
idx = photos_probs['idx'].unique()
print(len(idx))

FileNotFoundError: [Errno 2] No such file or directory: 'photos_probs.csv'

[We didnt get probs for all the 30K rows,
So we have 2 df, one with 30K rows and without the images probs
The second will have 20K rows but with images probs
We will decide based on the following results on which we should foucus-Edit!!!]

In [None]:
data_with_probs = pd.merge(data, photos_probs, how="left",on='idx')

In [None]:
df1 = data.copy()
df2 = data_with_probs[data_with_probs['idx'].isin(idx)].copy()

# images --- need to delete this at the end

In [None]:
def path(row):
    idx = str(row['idx'])
    category = row['category']
    path = f"images/train/{category}/{idx}.jpg"
    return path

X_y_train['path'] = X_y_train.apply(path,axis=1)
X_y_test['path'] = X_y_test.apply(path,axis=1)

In [None]:
import os
import shutil
import numpy as np

# Set the paths
train_base_directory = 'images/train/'  # Replace with the actual train path
test_base_directory = 'images/train/'  # Replace with the actual test path

train_output_directory = 'images/train_categorized/'  # Replace with your desired train output path
test_output_directory = 'images/test_categorized/'  # Replace with your desired test output path

# Function to create folders for each category
def create_category_folders(output_directory, unique_labels):
    for label in unique_labels:
        label_path = os.path.join(output_directory, label)
        if os.path.exists(label_path):
            shutil.rmtree(label_path)  # Delete existing folder
        os.makedirs(label_path)

# Load the numpy arrays of training and testing image paths
train_image_paths = X_y_train['path'].unique()
test_image_paths = X_y_test['path'].unique()

# Extract category labels from paths
def extract_category_label(image_path):
    category = os.path.basename(os.path.dirname(image_path))
    return category

# Create folders for each category in the train output directory
unique_train_labels = np.unique([extract_category_label(path) for path in train_image_paths])
create_category_folders(train_output_directory, unique_train_labels)

# Copy train images to categorized folders
for image_path in train_image_paths:
    category = extract_category_label(image_path)
    output_folder = os.path.join(train_output_directory, category)
    shutil.copy(image_path, output_folder)

print("Train images copied to categorized folders.")

# Create folders for each category in the test output directory
unique_test_labels = np.unique([extract_category_label(path) for path in test_image_paths])
create_category_folders(test_output_directory, unique_test_labels)

# Copy test images to categorized folders
for image_path in test_image_paths:
    category = extract_category_label(image_path)
    output_folder = os.path.join(test_output_directory, category)
    shutil.copy(image_path, output_folder)

print("Test images copied to categorized folders.")

In [None]:
# Set the paths and image size
train_data_dir = 'images/train'
image_dir = 'images/train'
image_size = (128, 128)

# Data augmentation for training data
train_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2  # Split data into training and validation
)

test_datagen = ImageDataGenerator(rescale=1.0/255.0)


# Load and preprocess training data
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=32,
    class_mode='categorical',
    subset='training'  # Use the training subset of data
)


# Load and preprocess validation data
validation_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=32,
    class_mode='categorical',
    subset='validation'  # Use the validation subset of data
)

# Build the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(image_size[0], image_size[1], 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(6, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
num_epochs = 10
model.fit(train_generator, epochs=num_epochs, validation_data=validation_generator)

# Evaluate the model on test data
test_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=32,
    class_mode='categorical',
    subset='validation'  # Use the validation subset of data for testing
)

eval_result = model.evaluate(test_generator)
print("Test Loss:", eval_result[0])
print("Test Accuracy:", eval_result[1])


# Save the model
model.save('cnn_model.h5')

In [None]:
model = load_model('cnn_model.h5')

# Part 3 - Models

# FE CV

In [69]:
Brand = FunctionTransformer(extract_brand)#,kw_args={'a':5})
# Description1 = FunctionTransformer(desc_indication_columns,kw_args={'b':5})
Description2 = FunctionTransformer(extract_description)
# Ingredients1 = FunctionTransformer(ingre_indication_columns,kw_args={'c':5})
Ingredients2 = FunctionTransformer(extract_ingredients)
# Household1 = FunctionTransformer(household_indication_columns,kw_args={'d':5})
Household2 = FunctionTransformer(extract_household)
Drop = FunctionTransformer(drop_columns)

In [None]:
numerical_transformer = StandardScaler()

categorical_features = ["brand", "description", "ingredients","household_serving_fulltext"]

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ]
)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X.select_dtypes(include=['int64', 'float64']).columns),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [None]:
pipeline = Pipeline([
    ('brand',Brand),
    ('description2',Description2),
    ('ingredients2',Ingredients2),
    ('household2',Household2),
    ('drop',Drop),
    ("preprocessor", preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

In [None]:
y_fe_train['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
 'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                        [0, 1, 2, 3, 4, 5], inplace=True)
# Run 5-fold cross-validation
scores = cross_val_score(pipeline, X_fe_train, y_fe_train.values.ravel(), cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", scores)

### Let's fit on all X_fe_train and check the test acc


In [70]:
pipeline.fit(X_fe_train, y_fe_train.values.ravel())

NameError: name 'pipeline' is not defined

In [None]:
y_fe_test['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
 'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                        [0, 1, 2, 3, 4, 5], inplace=True)


test_score = pipeline.score(X_fe_test, y_fe_test.values.ravel())
print("Test score:", test_score)

predicted_labels = pipeline.predict(X_fe_test)

accuracy = accuracy_score(y_fe_test, predicted_labels)
print("Accuracy:", accuracy)

report = classification_report(y_fe_test, predicted_labels)
print("Classification Report:\n", report)

### Maybe add this to gridsearch(choose the best words from the dicts)[Edit]

In [None]:
# param_grid = {
#     'brand__kw_args':[{'a':5},{'a':10},{'a':15},{'a':20}]
# }

In [None]:
# y_train['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
#  'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
#                         [0, 1, 2, 3, 4, 5], inplace=True)
# results = {}
# for name, clf in classifiers:
#     pipeline.set_params(classifier=clf)
    
#     grid_search = GridSearchCV(
#         pipeline,
#         param_grid=param_grid,
#         cv=5,
#         n_jobs=-1,
#         verbose = 3,
#         scoring = 'accuracy'
#     )
    
#     grid_search.fit(X_train, y_train.values.ravel())
#     results[name] = grid_search

# for name, grid_search in results.items():
#     print(f"Best parameters for {name}: {grid_search.best_params_}")
#     print(f"Best score for {name}: {grid_search.best_score_}")

In [None]:
# y_test['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
#  'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
#                         [0, 1, 2, 3, 4, 5], inplace=True)
# predicted_labels = grid_search.best_estimator_.predict(X_test)

# accuracy = accuracy_score(y_test, predicted_labels)
# print("Accuracy:", accuracy)

# report = classification_report(y_test, predicted_labels)
# print("Classification Report:\n", report)

### So we got a different parameter for brand for each model.
### We will take that into consideration in the model tuning

## SMOTE test

In [None]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split data
X = X_mt
y = y_mt

y['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
 'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                        [0, 1, 2, 3, 4, 5], inplace=True)

# Define model
model = XGBClassifier(random_state=42)

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline([
    ('brand',Brand),
    ('description2',Description2),
    ('ingredients2',Ingredients2),
    ('household2',Household2),
    ('drop',Drop),
    ("preprocessor", preprocessor),
    ('classifier', model)
])


# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_preprocessed = preprocessor.fit_transform(X_train)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_preprocessed, y_train)

# Train a model
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_resampled, y_resampled.values.ravel())


X_test_preprocessed = preprocessor.transform(X_test)
# Predict on the test set
y_pred = classifier.predict(X_test_preprocessed)

# Evaluate the model
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


## MT CV

In [None]:
y_mt_train['category'].replace(['cakes_cupcakes_snack_cakes', 'candy', 'chips_pretzels_snacks', 'chocolate',
 'cookies_biscuits', 'popcorn_peanuts_seeds_related_snacks'],
                        [0, 1, 2, 3, 4, 5], inplace=True)
# Define the classifiers
classifiers = [
    ('XGB', XGBClassifier()),
    ('HistGradientBoosting', HistGradientBoostingClassifier()),
    ('RandomForest', RandomForestClassifier())
]

pipeline = Pipeline([
    ('brand',Brand),
    ('description2',Description2),
    ('ingredients2',Ingredients2),
    ('household2',Household2),
    ('drop',Drop),
    ("preprocessor", preprocessor),
    ('classifier', None)
])

# Define hyperparameters to search for each classifier
param_grids = {
    'XGB': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7]
    },
    'HistGradientBoosting': {
        'classifier__max_iter': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7]
    },
    'RandomForest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20]
    }
}

# Perform GridSearchCV for each classifier
results = {}
for name, clf in classifiers:
    pipeline.set_params(classifier=clf)
    
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=5,  # Number of cross-validation folds
        n_jobs=-1,
        verbose = 3,
        scoring = 'accuracy'
    )
    
    grid_search.fit(X_mt_train, y_mt_train.values.ravel())  # Replace with your data
    results[name] = grid_search

# Print the best parameters and best score for each classifier
for name, grid_search in results.items():
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best score for {name}: {grid_search.best_score_}")

# Train the models on all the data