### Importing Python libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import log_loss
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectPercentile, chi2
from statistics import mean
from google.colab import drive
import glob

import warnings
warnings.filterwarnings('ignore')

In [None]:
drive.mount("/content/drive")

Mounted at /content/drive


### Classification models - Base models

Several base (level-1) models are trained so that their out-of-fold predictions can be used as inputs to the stacking model:
Random Forest, LightGBM, Logistic Regression, XGBoost, Gaussian Naive Bayes (GNB) and Extra Trees.

In [None]:
def run_training(fold: int, model: str) -> "tuple[pd.DataFrame, float, float]":
    """Train one base (level-1) classifier with `fold` held out and score it on that fold.

    The data is read from "drive/MyDrive/train_folds.csv", which must contain the
    columns 'id', 'Category' (the target) and 'kfold' (the fold assignment).

    Args:
        fold: Value of `kfold` to hold out for validation.
        model: One of 'Random_Forest', 'LightGBM', 'Logistic_Regression',
            'XGBoost' or 'GNB'. Any other value selects an ExtraTreesClassifier.

    Returns:
        A tuple `(preds, train_loss, test_loss)` where `preds` holds the columns
        'id', 'Category', 'kfold' and the model's predicted class for the
        held-out rows, and the two losses are log losses on the training and
        held-out rows respectively.
    """
    df = pd.read_csv("drive/MyDrive/train_folds.csv")
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # 'id' is only a merge key and 'kfold' is the split assignment (constant in
    # the held-out fold), so neither may be used as a predictive feature.
    non_features = ['Category', 'id', 'kfold']
    xtrain = df_train.drop(columns=non_features)
    xvalid = df_valid.drop(columns=non_features)
    ytrain = df_train['Category']
    yvalid = df_valid['Category']

    # model name -> (estimator factory, name of the output prediction column)
    registry = {
        'Random_Forest': (lambda: RandomForestClassifier(max_depth=8, n_estimators=100), 'rf_pred'),
        'LightGBM': (LGBMClassifier, 'lgbm_pred'),
        'Logistic_Regression': (LogisticRegression, 'lr_pred'),
        'XGBoost': (lambda: XGBClassifier(verbosity=0), 'xgb_pred'),
        'GNB': (GaussianNB, 'gnb_pred'),
    }
    # Unknown names keep falling back to Extra Trees, as before.
    make_clf, column_name = registry.get(model, (ExtraTreesClassifier, 'ext_pred'))
    clf = make_clf()

    # Scaling and feature selection live inside the pipeline so they are fitted
    # on the training folds only and merely applied to the held-out fold.
    # (Previously the held-out fold was scaled once on its own statistics and
    # then a second time by the pipeline.)
    clf_pipe = make_pipeline(MaxAbsScaler(), SelectPercentile(chi2, percentile=50), clf)
    clf_pipe.fit(xtrain, ytrain)
    pred_train = clf_pipe.predict_proba(xtrain)
    pred_test = clf_pipe.predict_proba(xvalid)

    # Passing the fitted class list keeps log_loss aligned with the probability
    # columns even if a class happens to be absent from the held-out fold.
    train_loss = log_loss(ytrain, pred_train, labels=clf.classes_)
    test_loss = log_loss(yvalid, pred_test, labels=clf.classes_)
    print(f"fold={fold}, train_loss={train_loss} - test_loss={test_loss}")

    # Most probable class per row, mapped from column position to class label.
    df_valid[column_name] = clf.classes_[np.argmax(pred_test, axis=1)]

    return df_valid[['id', 'Category', 'kfold', column_name]], train_loss, test_loss

In [None]:
base_models = ['Random_Forest', 'XGBoost', 'Logistic_Regression', 'Extra_Trees_Classifier', 'GNB']

# Train every base model on all 5 folds and persist its out-of-fold predictions,
# one CSV per model, for the stacking stage.
for n, model in enumerate(base_models, start=1):
    print("-"*80)
    print(f"Base model {n} - " + model)
    dfs = []
    train_loss = []
    test_loss = []

    for j in range(5):
        temp_df, train_, test_ = run_training(j, model)
        dfs.append(temp_df)
        train_loss.append(train_)
        test_loss.append(test_)

    fin_valid_df = pd.concat(dfs)
    print(fin_valid_df.shape)
    # The per-fold losses were collected but never summarised; report their mean
    # in the same format as the neural-network cell.
    print(f"Train loss mean: {mean(train_loss)} - Test loss mean: {mean(test_loss)}")
    fin_valid_df.to_csv("drive/MyDrive/model_preds/" + model + ".csv", index=False)
print("-"*80)

--------------------------------------------------------------------------------
Base model 1 - Random_Forest
fold=0, train_loss=2.541369617026053 - test_loss=2.6219590180071726
fold=1, train_loss=2.543113449807293 - test_loss=2.6194181468083717
fold=2, train_loss=2.542666503651043 - test_loss=2.619097629244715
fold=3, train_loss=2.543054858750583 - test_loss=2.620417616286956
fold=4, train_loss=2.5429817045922265 - test_loss=2.6204241908129515
(878049, 4)
--------------------------------------------------------------------------------
Base model 2 - XGBoost
fold=0, train_loss=2.530676807569018 - test_loss=2.6716134783232053
fold=1, train_loss=2.531775544554962 - test_loss=2.670121415041145
fold=2, train_loss=2.53154569981889 - test_loss=2.6681256893175176
fold=3, train_loss=2.5313877291716236 - test_loss=2.670184492660298
fold=4, train_loss=2.5313150242660876 - test_loss=2.668180791586567
(878049, 4)
--------------------------------------------------------------------------------
Base

In [None]:
from tensorflow import keras
from keras.layers import Dense,Activation
from keras.models import Sequential

### Neural network

In [None]:
def run_neural_network(fold: int) -> "tuple[pd.DataFrame, float, float]":
    """Train a dense softmax network with `fold` held out and score it on that fold.

    The data is read from "drive/MyDrive/train_folds.csv", which must contain the
    columns 'id', 'Category' (the target) and 'kfold' (the fold assignment).
    'Category' is assumed to be integer-encoded as 0..38 so that it matches the
    39-unit softmax output layer.

    Args:
        fold: Value of `kfold` to hold out for validation.

    Returns:
        A tuple `(preds, train_loss, test_loss)` where `preds` holds the columns
        'id', 'Category', 'kfold' and 'nn_pred' (predicted class) for the
        held-out rows, and the two losses are log losses on the training and
        held-out rows respectively.
    """
    df = pd.read_csv("drive/MyDrive/train_folds.csv")
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # 'id' is only a merge key and 'kfold' is the split assignment, so neither
    # may be used as a predictive feature.
    non_features = ['Category', 'id', 'kfold']
    xtrain = df_train.drop(columns=non_features)
    xvalid = df_valid.drop(columns=non_features)
    ytrain = df_train['Category']
    yvalid = df_valid['Category']

    # Fit the scaler on the training folds only and reuse it for the held-out
    # fold, so both sets share one scale and no validation statistics leak in.
    scaler = MaxAbsScaler()
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)

    n_classes = 39

    model = Sequential()
    model.add(Dense(100, input_shape=(xtrain.shape[1],)))
    model.add(Activation('relu'))
    for units in (100, 80, 60):
        model.add(Dense(units))
        model.add(Activation('relu'))
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(xtrain, ytrain,
              batch_size=32,
              epochs=20,
              verbose=0,
              validation_data=(xvalid, yvalid))

    pred_train = model.predict(xtrain)
    pred_test = model.predict(xvalid)

    # Explicit labels keep log_loss aligned with the softmax columns even if a
    # class is missing from one of the splits.
    class_labels = list(range(n_classes))
    train_loss = log_loss(ytrain, pred_train, labels=class_labels)
    test_loss = log_loss(yvalid, pred_test, labels=class_labels)
    print(f"fold={fold}, train_loss={train_loss} - test_loss={test_loss}")

    # Index of the most probable softmax output per row.
    df_valid['nn_pred'] = np.argmax(pred_test, axis=1)

    return df_valid[['id', 'Category', 'kfold', 'nn_pred']], train_loss, test_loss


In [None]:
print("-" * 80)
print("Neural Network")

# Run the folds in order, then regroup the (frame, train loss, test loss) triples.
fold_outputs = [run_neural_network(fold_id) for fold_id in range(5)]
dfs = [out[0] for out in fold_outputs]
train_loss = [out[1] for out in fold_outputs]
test_loss = [out[2] for out in fold_outputs]

# Stack the out-of-fold predictions and persist them for the stacking stage.
fin_valid_df = pd.concat(dfs)
print(fin_valid_df.shape)
print(f"Train loss mean: {mean(train_loss)} - Test loss mean: {mean(test_loss)}")
fin_valid_df.to_csv("drive/MyDrive/model_preds/nn_pred.csv", index=False)

--------------------------------------------------------------------------------
Neural Network
fold=0, train_loss=2.520209292880304 - test_loss=2.5351018119789015
fold=1, train_loss=2.520707105994488 - test_loss=2.529958650641439
fold=2, train_loss=2.519396584924139 - test_loss=2.5306858517655098
fold=3, train_loss=2.522297162887339 - test_loss=2.534251552782488
fold=4, train_loss=2.5212766518861836 - test_loss=2.5315826537911397
(878049, 4)
Train loss mean: 2.520777359714491 - Test loss mean: 2.532316104191896


In [None]:
pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 3.4 MB/s 
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [None]:
from skopt import gp_minimize

### Fine tuning - Random Forest (Stacking model)

In [None]:
def model_training_rf(params):
    """Objective for `gp_minimize`: mean 5-fold log loss of a level-2 Random Forest.

    The level-1 prediction files in "drive/MyDrive/model_preds/" are merged on
    'id'; for each fold a RandomForestClassifier is trained on the other folds'
    base-model predictions and scored on the held-out fold.

    Args:
        params: Sequence `[n_estimators, max_depth, max_features]`.

    Returns:
        The log loss averaged over the 5 folds. (The previous version returned
        from inside the loop and therefore only ever scored fold 0.)

    Raises:
        FileNotFoundError: If no prediction CSV is found.
    """
    n_estimators, max_depth, max_features = params[0], params[1], params[2]
    print(params, '\n')

    files = glob.glob("drive/MyDrive/model_preds/*.csv")
    if not files:
        raise FileNotFoundError("no prediction files found in drive/MyDrive/model_preds/")

    df = None
    for f in files:
        temp_df = pd.read_csv(f)
        if df is None:
            # The first file supplies 'Category' and 'kfold' for every row.
            df = temp_df
        else:
            df = df.merge(temp_df.drop(columns=['Category', 'kfold']), on="id", how="left")

    feature_cols = ['ext_pred', 'rf_pred', 'gnb_pred', 'xgb_pred', 'lr_pred', 'nn_pred']
    fold_losses = []

    for fold in range(5):
        train_df = df[df.kfold != fold].reset_index(drop=True)
        valid_df = df[df.kfold == fold].reset_index(drop=True)

        ytrain = train_df['Category']
        yvalid = valid_df['Category']

        # One scaler, fitted on the training folds and applied to both sets.
        # Previously only the validation fold was scaled (on its own
        # statistics) while the model was trained on raw values.
        scaler = StandardScaler()
        xtrain = scaler.fit_transform(train_df[feature_cols].values)
        xvalid = scaler.transform(valid_df[feature_cols].values)

        mdl = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                     max_features=max_features, random_state=0)
        mdl.fit(xtrain, ytrain)

        p = mdl.predict_proba(xvalid)
        fold_losses.append(log_loss(yvalid, p, labels=mdl.classes_))

    return float(np.mean(fold_losses))

# Search space for the level-2 Random Forest, listed in the order in which
# model_training_rf reads params[0], params[1] and params[2].
n_estimators_bounds = (100, 200)          # number of estimators
max_depth_bounds = (5, 6)                 # max depth
max_features_choices = ('auto', 'sqrt')   # max features

space = [n_estimators_bounds, max_depth_bounds, max_features_choices]

In [None]:
# Bayesian optimisation of the Random Forest meta-model: 10 random initial
# points, 30 evaluations in total.
resultados_gp = gp_minimize(
    model_training_rf,
    space,
    n_calls=30,
    n_initial_points=10,  # replaces `n_random_starts`, deprecated since scikit-optimize 0.8
    random_state=1,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[200, 6, 'auto'] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 103.3020
Function value obtained: 2.7294
Current minimum: 2.7294
Iteration No: 2 started. Evaluating function at random point.
[200, 5, 'auto'] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 72.3022
Function value obtained: 2.7057
Current minimum: 2.7057
Iteration No: 3 started. Evaluating function at random point.
[139, 6, 'sqrt'] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 56.4989
Function value obtained: 2.7327
Current minimum: 2.7057
Iteration No: 4 started. Evaluating function at random point.
[185, 5, 'sqrt'] 

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 70.3191
Function value obtained: 2.7033
Current minimum: 2.7033
Iteration No: 5 started. Evaluating function at random point.
[144, 5, 'sqrt'] 

Iteration No: 5 ended. Evaluation done at random point.
Time taken

___

### Fine tuning - LightGBM (Stacking model)

In [None]:
def model_training_light(params):
    """Objective for `gp_minimize`: mean 5-fold log loss of a level-2 LightGBM model.

    The level-1 prediction files in "drive/MyDrive/model_preds/" are merged on
    'id'; for each fold an LGBMClassifier is trained on the other folds'
    base-model predictions and scored on the held-out fold.

    Args:
        params: Sequence `[learning_rate, num_leaves, min_child_samples,
            subsample, colsample_bytree]`.

    Returns:
        The log loss averaged over the 5 folds. (The previous version returned
        from inside the loop and therefore only ever scored fold 0.)

    Raises:
        FileNotFoundError: If no prediction CSV is found.
    """
    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    print(params, '\n')

    files = glob.glob("drive/MyDrive/model_preds/*.csv")
    if not files:
        raise FileNotFoundError("no prediction files found in drive/MyDrive/model_preds/")

    df = None
    for f in files:
        temp_df = pd.read_csv(f)
        if df is None:
            # The first file supplies 'Category' and 'kfold' for every row.
            df = temp_df
        else:
            df = df.merge(temp_df.drop(columns=['Category', 'kfold']), on="id", how="left")

    feature_cols = ['ext_pred', 'rf_pred', 'gnb_pred', 'xgb_pred', 'lr_pred', 'nn_pred']
    fold_losses = []

    for fold in range(5):
        train_df = df[df.kfold != fold].reset_index(drop=True)
        valid_df = df[df.kfold == fold].reset_index(drop=True)

        ytrain = train_df['Category']
        yvalid = valid_df['Category']

        # One scaler, fitted on the training folds and applied to both sets.
        # Previously only the validation fold was scaled (on its own
        # statistics) while the model was trained on raw values.
        scaler = StandardScaler()
        xtrain = scaler.fit_transform(train_df[feature_cols].values)
        xvalid = scaler.transform(valid_df[feature_cols].values)

        mdl = LGBMClassifier(learning_rate=learning_rate, num_leaves=num_leaves,
                             min_child_samples=min_child_samples, subsample=subsample,
                             colsample_bytree=colsample_bytree, random_state=0,
                             subsample_freq=1, n_estimators=100)
        mdl.fit(xtrain, ytrain)

        p = mdl.predict_proba(xvalid)
        fold_losses.append(log_loss(yvalid, p, labels=mdl.classes_))

    return float(np.mean(fold_losses))

# Search space for the level-2 LightGBM model, listed in the order in which
# model_training_light reads params[0] .. params[4].
learning_rate_dim = (1e-3, 1e-1, 'log-uniform')   # learning rate
num_leaves_dim = (2, 128)                         # num_leaves
min_child_samples_dim = (1, 100)                  # min_child_samples
subsample_dim = (0.05, 1.0)                       # subsample
colsample_bytree_dim = (0.1, 1.0)                 # colsample bytree

space = [
    learning_rate_dim,
    num_leaves_dim,
    min_child_samples_dim,
    subsample_dim,
    colsample_bytree_dim,
]

In [None]:
# Bayesian optimisation of the LightGBM meta-model: 10 random initial points,
# 30 evaluations in total.
resultados_light = gp_minimize(
    model_training_light,
    space,
    n_calls=30,
    n_initial_points=10,  # replaces `n_random_starts`, deprecated since scikit-optimize 0.8
    random_state=1,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[0.09871192514273254, 120, 14, 0.9990884895579377, 0.3124800792567785] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 169.0380
Function value obtained: 2.7106
Current minimum: 2.7106
Iteration No: 2 started. Evaluating function at random point.
[0.006210998932353835, 51, 67, 0.9387621172657304, 0.8616798250174156] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 279.5853
Function value obtained: 2.6819
Current minimum: 2.6819
Iteration No: 3 started. Evaluating function at random point.
[0.004232013397179603, 68, 45, 0.2680983530433343, 0.5809725180523154] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 322.7758
Function value obtained: 2.6805
Current minimum: 2.6805
Iteration No: 4 started. Evaluating function at random point.
[0.0672858974212934, 60, 44, 0.9421713999524447, 0.8005503127028804] 

Iteration No: 4 ended. Evaluation done at random point.
Time ta

___

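Hyperparameters used for the level-2 LightGBM model, in the order `learning_rate`, `num_leaves`, `min_child_samples`, `subsample`, `colsample_bytree`: `Light_results = [0.001, 2, 100, 1.0, 0.1]`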

### Stacking model

### LightGBM - Level 2

In [None]:
def meta_model_training_light(pred_df: pd.DataFrame, fold: int, params: list) -> "tuple[pd.DataFrame, float, float]":
    """Train the level-2 LightGBM model with `fold` held out and score it on that fold.

    Args:
        pred_df: Merged level-1 predictions. Must contain 'id', 'Category',
            'kfold' and the columns 'ext_pred', 'rf_pred', 'gnb_pred',
            'xgb_pred', 'lr_pred' and 'nn_pred'.
        fold: Value of `kfold` to hold out for validation.
        params: `[learning_rate, num_leaves, min_child_samples, subsample,
            colsample_bytree]`.

    Returns:
        A tuple `(preds, train_loss, test_loss)` where `preds` holds 'id',
        'Category', 'kfold' and 'light_pred_level2' (predicted class) for the
        held-out rows, and the two losses are log losses on the training and
        held-out rows respectively.
    """
    feature_cols = ['ext_pred', 'rf_pred', 'gnb_pred', 'xgb_pred', 'lr_pred', 'nn_pred']
    column_name = 'light_pred_level2'

    train_df = pred_df[pred_df.kfold != fold].reset_index(drop=True)
    valid_df = pred_df[pred_df.kfold == fold].reset_index(drop=True)

    ytrain = train_df['Category']
    yvalid = valid_df['Category']

    # Fit the scaler on the training folds only and reuse it for the held-out
    # fold; fitting it separately on each set put them on different scales.
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(train_df[feature_cols].values)
    xvalid = scaler.transform(valid_df[feature_cols].values)

    clf = LGBMClassifier(learning_rate=params[0], num_leaves=params[1], min_child_samples=params[2],
                         subsample=params[3], colsample_bytree=params[4], random_state=0,
                         subsample_freq=1, n_estimators=100)
    clf.fit(xtrain, ytrain)

    pred_train = clf.predict_proba(xtrain)
    pred_test = clf.predict_proba(xvalid)

    # Explicit labels keep log_loss aligned with the probability columns even if
    # a class is absent from the held-out fold.
    train_loss = log_loss(ytrain, pred_train, labels=clf.classes_)
    test_loss = log_loss(yvalid, pred_test, labels=clf.classes_)
    print(f"fold={fold}, train_loss={train_loss} - test_loss={test_loss}")

    # Most probable class per row, mapped from column position to class label.
    valid_df[column_name] = clf.classes_[np.argmax(pred_test, axis=1)]

    return valid_df[['id', 'Category', 'kfold', column_name]], train_loss, test_loss

In [None]:
# Merge every saved prediction file into one frame keyed on "id".
# The first file read keeps its 'Category' and 'kfold' columns; later files
# contribute only their remaining columns.
df = None
for csv_path in glob.glob("drive/MyDrive/model_preds/*.csv"):
    preds = pd.read_csv(csv_path)
    if df is None:
        df = preds
        continue
    df = df.merge(preds.drop(['Category', 'kfold'], axis=1), on="id", how="left")

In [None]:
# Show the best LightGBM parameters; `.x` holds them when this is the
# OptimizeResult returned by gp_minimize, otherwise the value is shown as-is.
getattr(resultados_light, "x", resultados_light)

[0.001, 2, 100, 1.0, 0.1]

In [None]:
print("-"*80)
print("Meta model")

# gp_minimize returns an OptimizeResult whose best point is stored in `.x`
# (the Random Forest cell already passes `resultados_gp.x`). Fall back to the
# object itself so a plain parameter list keeps working.
best_light_params = getattr(resultados_light, "x", resultados_light)

dfs = []
train_loss = []
test_loss = []

for j in range(5):
    temp_df, train_, test_ = meta_model_training_light(df, j, best_light_params)
    dfs.append(temp_df)
    train_loss.append(train_)
    test_loss.append(test_)

fin_valid_df = pd.concat(dfs)
print(fin_valid_df.shape)
print(f"Train loss: {mean(train_loss)} - Test loss: {mean(test_loss)}")
fin_valid_df.to_csv("drive/MyDrive/model_preds/LightGBM_Level2.csv", index=False)
print("-"*80)

--------------------------------------------------------------------------------
Meta model
fold=0, train_loss=2.6803260358443297 - test_loss=2.6802891946778957
fold=1, train_loss=2.680317423840633 - test_loss=2.680324592549758
fold=2, train_loss=2.680315157425217 - test_loss=2.680334596279305
fold=3, train_loss=2.680307976471803 - test_loss=2.6803627334338076
fold=4, train_loss=2.680319773951876 - test_loss=2.680312669061236
(878049, 4)
Train loss: 2.6803172735067715 - Test loss: 2.6803247572004003
--------------------------------------------------------------------------------


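Hyperparameters used for the level-2 Random Forest model, in the order `n_estimators`, `max_depth`, `max_features`: `GP_results = [180, 5, 'sqrt']`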

### Random Forest - Level 2

In [None]:
def meta_model_training_rf2(pred_df: pd.DataFrame, fold: int, params: list) -> "tuple[pd.DataFrame, float, float]":
    """Train the level-2 Random Forest with `fold` held out and score it on that fold.

    Args:
        pred_df: Merged level-1 predictions. Must contain 'id', 'Category',
            'kfold' and the columns 'ext_pred', 'rf_pred', 'gnb_pred',
            'xgb_pred', 'lr_pred' and 'nn_pred'.
        fold: Value of `kfold` to hold out for validation.
        params: `[n_estimators, max_depth, max_features]`.

    Returns:
        A tuple `(preds, train_loss, test_loss)` where `preds` holds 'id',
        'Category', 'kfold' and 'rf_pred_level2' (predicted class) for the
        held-out rows, and the two losses are log losses on the training and
        held-out rows respectively.
    """
    feature_cols = ['ext_pred', 'rf_pred', 'gnb_pred', 'xgb_pred', 'lr_pred', 'nn_pred']
    column_name = 'rf_pred_level2'

    train_df = pred_df[pred_df.kfold != fold].reset_index(drop=True)
    valid_df = pred_df[pred_df.kfold == fold].reset_index(drop=True)

    ytrain = train_df['Category']
    yvalid = valid_df['Category']

    # Fit the scaler on the training folds only and reuse it for the held-out
    # fold; fitting it separately on each set put them on different scales.
    scaler = MinMaxScaler()
    xtrain = scaler.fit_transform(train_df[feature_cols].values)
    xvalid = scaler.transform(valid_df[feature_cols].values)

    clf = RandomForestClassifier(n_estimators=params[0], max_depth=params[1],
                                 max_features=params[2], random_state=0)
    clf.fit(xtrain, ytrain)

    pred_train = clf.predict_proba(xtrain)
    pred_test = clf.predict_proba(xvalid)

    # Explicit labels keep log_loss aligned with the probability columns even if
    # a class is absent from the held-out fold.
    train_loss = log_loss(ytrain, pred_train, labels=clf.classes_)
    test_loss = log_loss(yvalid, pred_test, labels=clf.classes_)
    print(f"fold={fold}, train_loss={train_loss} - test_loss={test_loss}")

    # Most probable class per row, mapped from column position to class label.
    valid_df[column_name] = clf.classes_[np.argmax(pred_test, axis=1)]

    return valid_df[['id', 'Category', 'kfold', column_name]], train_loss, test_loss

In [None]:
# Re-read the prediction directory and rebuild the merged frame keyed on "id".
prediction_paths = glob.glob("drive/MyDrive/model_preds/*.csv")
df = None
for position, path in enumerate(prediction_paths):
    current = pd.read_csv(path)
    if position == 0:
        # The first file keeps 'Category' and 'kfold' for the merged frame.
        df = current
    else:
        extra_columns = current.drop(['Category', 'kfold'], axis=1)
        df = df.merge(extra_columns, on="id", how="left")

In [None]:
print("-" * 80)
print("Meta model")

# Accumulate each fold's (frame, train loss, test loss) into three parallel lists.
dfs, train_loss, test_loss = [], [], []
for fold_id in range(5):
    fold_result = meta_model_training_rf2(df, fold_id, resultados_gp.x)
    for bucket, item in zip((dfs, train_loss, test_loss), fold_result):
        bucket.append(item)

# Stack the out-of-fold predictions, report the mean losses and persist them.
fin_valid_df = pd.concat(dfs)
print(fin_valid_df.shape)
print(f"Train loss: {mean(train_loss)} - Test loss: {mean(test_loss)}")
fin_valid_df.to_csv("drive/MyDrive/model_preds/rf_Level2.csv", index=False)
print("-" * 80)

--------------------------------------------------------------------------------
Meta model
fold=0, train_loss=2.679056107834159 - test_loss=2.681194517020028
fold=1, train_loss=2.679036707965257 - test_loss=2.682020009750325
fold=2, train_loss=2.6790299072017496 - test_loss=2.6807806600093813
fold=3, train_loss=2.6790347490391033 - test_loss=2.682145021547842
fold=4, train_loss=2.6790362698665104 - test_loss=2.681319678807353
(878049, 4)
Train loss: 2.679038748381356 - Test loss: 2.681491977426986
--------------------------------------------------------------------------------


### The best results were obtained with the neural network, evaluated using the log-loss metric.