# Find best model

In [1]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score
import plotly.express as px
import pickle
import os
from my_utils.visualization import get_3d_ensemble_fig



# Data

In [8]:
train_df = pd.read_feather('./data/train_processed.ftr')
val_df = pd.read_feather('./data/val_processed.ftr')
train_ensemble_df = pd.read_feather('./data/ensemble_train_df.ftr')
val_ensemble_df = pd.read_feather('./data/ensemble_val_df.ftr')

cols = list(train_df.columns)

## Add ensemble features

In [9]:
ensemble_cols = list(train_ensemble_df.columns)[:5]
train_df[ensemble_cols] = train_ensemble_df[ensemble_cols]
val_df[ensemble_cols] = val_ensemble_df[ensemble_cols]

In [12]:
train_x = train_df[cols[:-1] + ensemble_cols].to_numpy()
val_x = val_df[cols[:-1] + ensemble_cols].to_numpy()
train_y = train_df[cols[-1]].astype(int).to_numpy()
val_y = val_df[cols[-1]].astype(int).to_numpy()

# Baseline models comparison

## Train data

### AI models

In [16]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(train_x, train_x, train_y, train_y)

100%|██████████| 29/29 [00:13<00:00,  2.11it/s]


In [17]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.93,0.93,0.93,0.93,0.52
DecisionTreeClassifier,0.93,0.93,0.93,0.93,0.03
ExtraTreeClassifier,0.93,0.93,0.93,0.93,0.02
ExtraTreesClassifier,0.93,0.93,0.93,0.93,0.56
XGBClassifier,0.93,0.93,0.93,0.93,0.27
LGBMClassifier,0.93,0.93,0.93,0.93,0.13
BaggingClassifier,0.93,0.93,0.93,0.93,0.15
LabelPropagation,0.93,0.93,0.93,0.93,3.4
LabelSpreading,0.93,0.93,0.93,0.93,4.21
AdaBoostClassifier,0.92,0.92,0.92,0.92,0.34


### Mean proba

In [20]:
train_ensemble_df['mean_proba'] = train_ensemble_df[['neural', 'ada_boost', 'svc', 'lgbm']].mean(axis=1)
train_ensemble_df['mean_pred'] = train_ensemble_df['mean_proba'].apply(lambda x: 1 if x >= 0.6 else 0)

In [21]:
accuracy_score(train_ensemble_df['y'], train_ensemble_df['mean_pred'])

0.8173118125982003

## Validation data

### AI models

In [22]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(train_x, val_x, train_y, val_y)

100%|██████████| 29/29 [00:10<00:00,  2.83it/s]


In [23]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.79,0.79,0.79,0.79,0.01
NuSVC,0.79,0.79,0.79,0.79,1.22
GaussianNB,0.79,0.79,0.79,0.79,0.02
BernoulliNB,0.79,0.79,0.79,0.79,0.02
QuadraticDiscriminantAnalysis,0.77,0.77,0.77,0.77,0.03
RidgeClassifierCV,0.77,0.77,0.77,0.77,0.03
RidgeClassifier,0.77,0.77,0.77,0.77,0.02
LinearDiscriminantAnalysis,0.77,0.77,0.77,0.77,0.03
LogisticRegression,0.75,0.75,0.75,0.75,0.06
SVC,0.75,0.75,0.75,0.75,0.5


### Mean proba

In [24]:
val_ensemble_df['mean_proba'] = val_ensemble_df[['ada_boost', 'svc', 'lgbm']].mean(axis=1)
val_ensemble_df['mean_pred'] = val_ensemble_df['mean_proba'].apply(lambda x: 1 if x >= 0.5 else 0)

In [25]:
accuracy_score(val_ensemble_df['y'], val_ensemble_df['mean_pred'])

0.7990543735224587

* AI models with ensemble cols seems to be overfitted (train accuracy 93%, val accuracy 78%)  
* The mean from ensemble features predicts target better (val accuracy 80%)