<h1> Machine Learning Notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
filepath = '/Users/efkanturedi/Corteze/untitled folder/nutri_clean_prod.csv'
data = pd.read_csv(filepath)

In [3]:
data

Unnamed: 0,code,product_name,brands,nutriscore_grade,pnns_groups_1,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,nutrition-score-fr_100g
0,0000000001281,Tarte noix de coco,"Crous Resto',Crous",d,Sugary snacks,381.0,22.00,15.5,27.30,21.90,4.4,4.60,0.1000,14.0
1,0000000001885,Compote de poire,Crous,a,Fruits and vegetables,157.0,0.00,0.0,36.00,27.00,3.6,0.60,0.0000,-2.0
2,0000000005470,BAguette bressan,Crousresto',a,Cereals and potatoes,160.0,2.20,0.5,25.20,0.60,1.6,9.50,0.3580,-4.0
3,0000000043595,Cranberries,Torn & Glasser,c,Fruits and vegetables,300.0,0.00,0.0,83.33,66.67,10.0,0.00,0.0000,3.0
4,0000000290616,Salade Cesar,Kirkland Signature,c,Fruits and vegetables,290.0,12.00,7.0,23.00,0.00,2.0,22.00,2.1600,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374513,9948240886,Lemon raspberry italian sparkling mineral water,"Whole foods, Whole Foods Market",b,Beverages,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0250,0.0
374514,99482467111,"Almondmilk beverage, chocolate",,b,Beverages,38.0,1.05,0.0,6.33,5.91,0.4,0.42,0.1575,1.0
374515,9950014911001,Oignons jaunes 40/60,Ferme De L'artois,a,Fruits and vegetables,0.0,5.00,5.0,2.00,0.50,25.0,32.00,0.2000,-11.0
374516,9999091865142,Paprikás Kukorica csemege,Spar,d,unknown,496.0,24.00,1.9,61.00,1.50,0.0,6.90,0.9600,11.0


In [4]:
# Revove comment if you want to use only a sample of the dataset

#data = data.sample(n=100000,random_state=42)

In [5]:
# We add the code to use it as primary key
X_cols = [
  'energy-kcal_100g',
  #'fat_100g',
  'saturated-fat_100g',
  #'carbohydrates_100g',
  'sugars_100g',
  'fiber_100g',
  'proteins_100g',
  'salt_100g',
]

In [6]:
X = data[X_cols].reset_index(drop=True)
y = data[[
  'nutriscore_grade'
]].reset_index(drop=True)

In [7]:
len(y)

374518

<h3> Standardising & Normalizing the data

In [8]:
# Creating Training and Test set
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)

# Creating Validation set
# X_train, y_train, X_val, y_val = train_test_split(X_train, y_train, train_size=0.9)

In [9]:
X_test

Unnamed: 0,energy-kcal_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g
57881,24.0,0.00,4.88,0.8,0.81,0.5075
316388,363.0,1.00,16.50,10.3,10.50,0.0100
70233,368.0,1.32,28.95,7.9,10.53,0.6575
254884,41.0,0.70,4.80,0.0,3.20,0.1000
306798,257.0,2.00,9.30,0.0,12.00,1.9000
...,...,...,...,...,...,...
202150,257.0,3.57,24.29,0.0,14.29,0.9825
319247,46.0,0.00,10.20,0.8,0.20,0.0000
119772,350.0,0.00,67.50,2.5,0.00,0.3450
90184,536.0,3.57,3.57,3.6,7.14,1.8750


In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

<h3> Running the models

In [11]:
import math
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
from xgboost import XGBClassifier

<h3> Logistic Regression

In [12]:
log_reg = OneVsRestClassifier(LogisticRegression())

In [13]:
log_reg.fit(X_train,y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [14]:
cf_matrix = confusion_matrix(log_reg.predict(X_test),y_test)

In [15]:
print(classification_report(log_reg.predict(X_test),y_test))

              precision    recall  f1-score   support

           a       0.92      0.64      0.75     22517
           b       0.12      0.54      0.19      2113
           c       0.37      0.54      0.44     10843
           d       0.73      0.53      0.62     29355
           e       0.52      0.64      0.57     10076

    accuracy                           0.58     74904
   macro avg       0.53      0.58      0.51     74904
weighted avg       0.69      0.58      0.61     74904



<h3> Random Forest Classifier

In [16]:
rfc = OneVsRestClassifier(RandomForestClassifier(
  n_jobs=-1,
  class_weight='balanced',
  n_estimators=100
  ))

In [17]:
rfc.fit(X_train,y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(class_weight='balanced',
                                                     n_jobs=-1))

In [18]:
print(classification_report(rfc.predict(X_test),y_test))

              precision    recall  f1-score   support

           a       0.95      0.94      0.95     15800
           b       0.86      0.89      0.87      9508
           c       0.91      0.90      0.90     15742
           d       0.95      0.94      0.95     21601
           e       0.93      0.95      0.94     12253

    accuracy                           0.93     74904
   macro avg       0.92      0.92      0.92     74904
weighted avg       0.93      0.93      0.93     74904



<h3>XGBoost

In [19]:
xgb = XGBClassifier()

In [20]:
xgb.fit(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [21]:
print(classification_report(xgb.predict(X_test),y_test))

              precision    recall  f1-score   support

           a       0.94      0.93      0.93     15946
           b       0.84      0.86      0.85      9566
           c       0.89      0.89      0.89     15534
           d       0.95      0.94      0.94     21583
           e       0.93      0.94      0.93     12275

    accuracy                           0.92     74904
   macro avg       0.91      0.91      0.91     74904
weighted avg       0.92      0.92      0.92     74904



<h3>GridSearchCV on Random Forest and XGBoost:

In [22]:
#xgb_parameters = {
#  'n_estimators':[2000,3000],
#  'learning_rate': [0.2,0.3]
#}

In [23]:
#xgb_grid = GridSearchCV(estimator = XGBRegressor(), 
#                      scoring='neg_root_mean_squared_error',
#                      param_grid = xgb_parameters,
#                      cv=3,
#                      verbose=False
#)
#
#xgb_grid.fit(X_train, y_train['nutrition-score-fr_100g'])

In [24]:
#xgb_best_params_ = xgb_grid.best_params_
#xgb_best_params_

In [25]:
#xgb_opt = xgb_grid.best_estimator_

In [26]:
#xgb_opt_rmse = math.sqrt(mean_squared_error(xgb_opt.predict(X_test), y_test['nutrition-score-fr_100g']))
#xgb_opt_r2 = r2_score(xgb_opt.predict(X_test), y_test['nutrition-score-fr_100g'])

#results = results.append({
#  'Model':'XGBoost Hypt',
#  'RMSE':xgb_opt_rmse,
#  'R2':xgb_opt_r2,
#  'Adj R2':1-((1-xgb_opt_r2)*(n-1)/(n-p-1))
#},ignore_index=True)

#results

In [27]:
#rfr_parameters = {
#  'n_estimators': [1000],
#  #'n_estimators': [200,500,1000],
#}

In [28]:
#rfr_grid = GridSearchCV(estimator = RandomForestRegressor(n_jobs=-1), 
#                      scoring='neg_root_mean_squared_error',
#                      param_grid = rfr_parameters,
#                      cv=3,
#                      verbose=False
#                     )
#
#rfr_grid.fit(X_train, y_train['nutrition-score-fr_100g'])

In [29]:
#rfr_best_params_ = rfr_grid.best_params_
#rfr_best_params_

In [30]:
#rfr_opt = rfr_grid.best_estimator_

In [31]:
#rfr_opt_rmse = math.sqrt(mean_squared_error(rfr_opt.predict(X_test), y_test['nutrition-score-fr_100g']))
#rfr_opt_r2 = r2_score(rfr_opt.predict(X_test), y_test['nutrition-score-fr_100g'])

#results = results.append({
#  'Model':'Random Forest Hypt',
#  'RMSE':rfr_opt_rmse,
#  'R2':rfr_opt_r2,
#  'Adj R2':1-((1-rfr_opt_r2)*(n-1)/(n-p-1))
#},ignore_index=True)

#results

<h3> Exporting the pipeline of model

In [32]:
from sklearn.pipeline import Pipeline
import pickle

In [33]:
pipeline = Pipeline([
    ('standard_scaler', scaler), 
    ('model', rfc)
])

In [34]:
with open('/Users/efkanturedi/Corteze/untitled folder/model.pickle', 'wb') as model_file:
  pickle.dump(pipeline, model_file)