In [1]:
import pandas as pd
import umap.umap_ as umap
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss

# ML
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
# Read the train and test tables; the 'id' column is removed from both.
data_train = pd.read_csv('data/train.csv').drop(columns='id')
data_test = pd.read_csv('data/test.csv').drop(columns='id')

In [3]:
# `data_train_num` is another name for the same training frame (no copy is made).
data_train_num = data_train

# Separate the 'target' label column from the remaining feature columns.
y = data_train_num['target']
X = data_train_num.drop(columns='target')

In [4]:
# Standardise the training features; the result gets default integer column labels.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X))
# data_test is left unscaled in this notebook; if it is ever scaled, use
# scaler.transform(data_test) so the statistics fitted on X are reused.

<h3> XGBoost </h3>

In [5]:
# Zero-filled probability buffers with 4 columns: one row per training sample
# (out-of-fold predictions) and one row per test sample (summed fold predictions).
log_pred = np.zeros(shape=(X.shape[0], 4))
test_pred = np.zeros(shape=(data_test.shape[0], 4))

In [6]:
# XGBoost classifier with library-default hyperparameters; it is re-fitted on
# every fold by the cross-validation cell that follows.
xgb_model = XGBClassifier()

In [7]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)
    model = xgb_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        eval_metric = 'mlogloss',
        early_stopping_rounds = 50, 
        verbose = 0,
    )

    temp_pred = model.predict_proba(X.iloc[val_index])
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    temp_test = model.predict_proba(data_test)
    test_pred += temp_test

test_pred1 = test_pred/5

print(f'Overall Log Loss: {log_loss(y, log_pred)}')

Fold:  0
Log Loss: 1.1004433787360788
Fold:  1
Log Loss: 1.0986367761597038
Fold:  2
Log Loss: 1.0980183875322342
Fold:  3
Log Loss: 1.0989431204527615
Fold:  4
Log Loss: 1.098668839904666
Overall Log Loss: 1.0989421132776935
Wall time: 2min 51s


<h3> LightGBM (Light Gradient Boosting Machine) </h3>

In [8]:
# Fresh zero-filled buffers (4 columns each) for the LightGBM run: out-of-fold
# predictions per training row and summed fold predictions per test row.
log_pred = np.zeros(shape=(X.shape[0], 4))
test_pred = np.zeros(shape=(data_test.shape[0], 4))

In [9]:
# LightGBM classifier with library-default hyperparameters; it is re-fitted on
# every fold by the cross-validation cell that follows.
lg_model = LGBMClassifier()

In [19]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)
    model = lg_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        eval_metric = 'multi_logloss',
        early_stopping_rounds = 50,
        verbose = 0
    )

    temp_pred = model.predict_proba(X.iloc[val_index])
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    temp_test = model.predict_proba(data_test)
    test_pred += temp_test

test_pred2 = test_pred/5

print(f'Overall Log Loss: {log_loss(y, log_pred)}')

Fold:  0
Log Loss: 1.0983001986518488
Fold:  1
Log Loss: 1.0969746236504785
Fold:  2
Log Loss: 1.094319908787031
Fold:  3
Log Loss: 1.0954988855585286
Fold:  4
Log Loss: 1.0957800561347766
Overall Log Loss: 1.0961747345565327
Wall time: 19.8 s


<h3> CatBoost </h3>

In [11]:
# Fresh zero-filled buffers (4 columns each) for the CatBoost run: out-of-fold
# predictions per training row and summed fold predictions per test row.
log_pred = np.zeros(shape=(X.shape[0], 4))
test_pred = np.zeros(shape=(data_test.shape[0], 4))

In [12]:
# CatBoost classifier with library-default hyperparameters; it is re-fitted on
# every fold by the cross-validation cell that follows.
cat_model = CatBoostClassifier()

In [20]:
%%time

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_index, val_index) in enumerate(skf.split(X_scaled, y)):
    print('Fold: ', fold_)
    model = cat_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        early_stopping_rounds = 50,
        verbose = 0
    )

    temp_pred = model.predict_proba(X.iloc[val_index])
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    temp_test = model.predict_proba(data_test)
    test_pred += temp_test

test_pred3 = test_pred/10

print(f'Overall Log Loss: {log_loss(y, log_pred)}');

Fold:  0
Log Loss: 1.0945770827473644
Fold:  1
Log Loss: 1.0969776023595654
Fold:  2
Log Loss: 1.088788019787104
Fold:  3
Log Loss: 1.095371793889031
Fold:  4
Log Loss: 1.092453601548907
Fold:  5
Log Loss: 1.0888971785917936
Fold:  6
Log Loss: 1.0938404553107135
Fold:  7
Log Loss: 1.090716145401981
Fold:  8
Log Loss: 1.09185005740237
Fold:  9
Log Loss: 1.0911499762441252
Overall Log Loss: 1.0924621913282957
Wall time: 1min 49s


In [14]:
# Wrap each model's fold-averaged test probabilities in a DataFrame
# (columns get the default integer labels 0..3).
df_pred1, df_pred2, df_pred3 = (
    pd.DataFrame(avg_probs) for avg_probs in (test_pred1, test_pred2, test_pred3)
)

In [15]:
# Submission frame for the XGBoost predictions: start from the sample submission
# without its class columns, then attach the averaged probabilities per class.
data_test1 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred1[0],
        Class_2=df_pred1[1],
        Class_3=df_pred1[2],
        Class_4=df_pred1[3],
    )
)

In [16]:
# Submission frame for the LightGBM predictions: start from the sample submission
# without its class columns, then attach the averaged probabilities per class.
data_test2 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred2[0],
        Class_2=df_pred2[1],
        Class_3=df_pred2[2],
        Class_4=df_pred2[3],
    )
)

In [17]:
# Submission frame for the CatBoost predictions: start from the sample submission
# without its class columns, then attach the averaged probabilities per class.
data_test3 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred3[0],
        Class_2=df_pred3[1],
        Class_3=df_pred3[2],
        Class_4=df_pred3[3],
    )
)

In [18]:
# Write the CatBoost-based submission frame to disk without the index column.
data_test3.to_csv('submission_v4.csv', index=False)