In [1]:
import pandas as pd
import umap.umap_ as umap
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss

# ML
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
# Read the train and test tables (in that order) and discard the 'id' column from each.
data_train, data_test = (
    pd.read_csv(csv_path).drop(columns=['id'])
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# data_train_num is a second name for the same DataFrame object (no copy is made).
data_train_num = data_train

# Separate the 'target' column (y) from every other column (X).
y = data_train_num.loc[:, 'target']
X = data_train_num.drop(columns=['target'])

In [4]:
# Fit a StandardScaler on X and wrap the transformed values in a DataFrame.
# The wrapper is built without labels, so it carries default integer column
# names and a default integer index rather than those of X.
scaler = StandardScaler().fit(X)
X_scaled = pd.DataFrame(data=scaler.transform(X))
# The test features are not scaled here (that step was left disabled).

<h3> XGBoost </h3>

In [5]:
# Zero-filled, 4-column buffers for the XGBoost run:
#   test_pred - one row per test sample, accumulated across folds
#   log_pred  - one row per training sample, filled with out-of-fold probabilities
test_pred = np.zeros(shape=(data_test.shape[0], 4), dtype=float)
log_pred = np.zeros(shape=(X.shape[0], 4), dtype=float)

In [6]:
# XGBoost classifier with no hyperparameters passed, so the library defaults apply.
# This single instance is re-fit on every fold by the cross-validation cell.
xgb_model = XGBClassifier()

In [7]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)
    model = xgb_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        eval_metric = 'mlogloss',
        early_stopping_rounds = 50, 
        verbose = 10
    )

    temp_pred = model.predict_proba(X.iloc[val_index])
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    temp_test = model.predict_proba(data_test)
    test_pred += temp_test

test_pred1 = test_pred/5

print(f'Overall Log Loss: {log_loss(y, log_pred)}')

Fold:  0
[0]	validation_0-mlogloss:1.34678	validation_1-mlogloss:1.34697
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 50 rounds.
[10]	validation_0-mlogloss:1.16438	validation_1-mlogloss:1.16599
[20]	validation_0-mlogloss:1.12018	validation_1-mlogloss:1.12332
[30]	validation_0-mlogloss:1.10709	validation_1-mlogloss:1.11167
[40]	validation_0-mlogloss:1.10158	validation_1-mlogloss:1.10753
[50]	validation_0-mlogloss:1.09814	validation_1-mlogloss:1.10532
[60]	validation_0-mlogloss:1.09548	validation_1-mlogloss:1.10381
[70]	validation_0-mlogloss:1.09317	validation_1-mlogloss:1.10267
[80]	validation_0-mlogloss:1.09118	validation_1-mlogloss:1.10184
[90]	validation_0-mlogloss:1.08936	validation_1-mlogloss:1.10093
[99]	validation_0-mlogloss:1.08783	validation_1-mlogloss:1.10044
Log Loss: 1.1004433787360788
Fold:  1
[0]	validation_0-mlogloss:1.34673	validation_1-mlogloss:1.34695
Multiple

<h3> LightGBM (Light Gradient Boosting Machine) </h3>

In [8]:
# Fresh zero-filled, 4-column buffers for the LightGBM run:
#   test_pred - one row per test sample, accumulated across folds
#   log_pred  - one row per training sample, filled with out-of-fold probabilities
test_pred = np.zeros(shape=(data_test.shape[0], 4), dtype=float)
log_pred = np.zeros(shape=(X.shape[0], 4), dtype=float)

In [9]:
# LightGBM classifier with no hyperparameters passed, so the library defaults apply.
# This single instance is re-fit on every fold by the cross-validation cell.
lg_model = LGBMClassifier()

In [10]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)
    model = lg_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        eval_metric = 'multi_logloss',
        early_stopping_rounds = 50,
        verbose = 10
    )

    temp_pred = model.predict_proba(X.iloc[val_index])
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    temp_test = model.predict_proba(data_test)
    test_pred += temp_test

test_pred2 = test_pred/5

print(f'Overall Log Loss: {log_loss(y, log_pred)}')

Fold:  0
Training until validation scores don't improve for 50 rounds
[10]	valid_0's multi_logloss: 1.09624	valid_1's multi_logloss: 1.10727
[20]	valid_0's multi_logloss: 1.08202	valid_1's multi_logloss: 1.10256
[30]	valid_0's multi_logloss: 1.07038	valid_1's multi_logloss: 1.09965
[40]	valid_0's multi_logloss: 1.06068	valid_1's multi_logloss: 1.09859
[50]	valid_0's multi_logloss: 1.05188	valid_1's multi_logloss: 1.09813
[60]	valid_0's multi_logloss: 1.0442	valid_1's multi_logloss: 1.09777
[70]	valid_0's multi_logloss: 1.03708	valid_1's multi_logloss: 1.09769
[80]	valid_0's multi_logloss: 1.03058	valid_1's multi_logloss: 1.09793
[90]	valid_0's multi_logloss: 1.02422	valid_1's multi_logloss: 1.0981
[100]	valid_0's multi_logloss: 1.01783	valid_1's multi_logloss: 1.0983
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 1.01783	valid_1's multi_logloss: 1.0983
Log Loss: 1.0983001986518488
Fold:  1
Training until validation scores don't improve for 50 rounds
[10]

<h3> Catboost </h3>

In [11]:
# Fresh zero-filled, 4-column buffers for the CatBoost run:
#   test_pred - one row per test sample, accumulated across folds
#   log_pred  - one row per training sample, filled with out-of-fold probabilities
test_pred = np.zeros(shape=(data_test.shape[0], 4), dtype=float)
log_pred = np.zeros(shape=(X.shape[0], 4), dtype=float)

In [12]:
# CatBoost classifier with no hyperparameters passed, so the library defaults apply.
# This single instance is re-fit on every fold by the cross-validation cell.
cat_model = CatBoostClassifier()

In [13]:
%%time

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_index, val_index) in enumerate(skf.split(X_scaled, y)):
    print('Fold: ', fold_)
    model = cat_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        early_stopping_rounds = 50,
        verbose = 10
    )

    temp_pred = model.predict_proba(X.iloc[val_index])
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    temp_test = model.predict_proba(data_test)
    test_pred += temp_test

test_pred3 = test_pred/10

print(f'Overall Log Loss: {log_loss(y, log_pred)}')

: 1.0890761	test1: 1.0951644	best: 1.0951644 (80)	total: 2.39s	remaining: 27.1s
90:	learn: 1.0872008	test: 1.0872008	test1: 1.0942287	best: 1.0942287 (90)	total: 2.69s	remaining: 26.9s
100:	learn: 1.0853168	test: 1.0853168	test1: 1.0935055	best: 1.0935055 (100)	total: 3s	remaining: 26.7s
110:	learn: 1.0834235	test: 1.0834235	test1: 1.0926671	best: 1.0926671 (110)	total: 3.29s	remaining: 26.4s
120:	learn: 1.0815685	test: 1.0815685	test1: 1.0923056	best: 1.0922761 (117)	total: 3.59s	remaining: 26.1s
130:	learn: 1.0799798	test: 1.0799798	test1: 1.0917776	best: 1.0917776 (130)	total: 3.88s	remaining: 25.8s
140:	learn: 1.0783451	test: 1.0783451	test1: 1.0914023	best: 1.0914023 (140)	total: 4.19s	remaining: 25.5s
150:	learn: 1.0768035	test: 1.0768035	test1: 1.0909855	best: 1.0909855 (150)	total: 4.48s	remaining: 25.2s
160:	learn: 1.0753976	test: 1.0753976	test1: 1.0906919	best: 1.0906542 (159)	total: 4.77s	remaining: 24.9s
170:	learn: 1.0741034	test: 1.0741034	test1: 1.0905089	best: 1.090508

In [14]:
# Wrap each model's fold-averaged test probabilities in a DataFrame
# (1 = XGBoost, 2 = LightGBM, 3 = CatBoost); columns are the integers 0..3.
df_pred1, df_pred2, df_pred3 = (
    pd.DataFrame(avg_probs)
    for avg_probs in (test_pred1, test_pred2, test_pred3)
)

In [15]:
# Build the XGBoost frame: take the sample-submission file, remove its four
# Class_* columns and re-add them from the XGBoost probabilities.
data_test1 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred1[0],
        Class_2=df_pred1[1],
        Class_3=df_pred1[2],
        Class_4=df_pred1[3],
    )
)

In [16]:
# Build the LightGBM frame: take the sample-submission file, remove its four
# Class_* columns and re-add them from the LightGBM probabilities.
data_test2 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred2[0],
        Class_2=df_pred2[1],
        Class_3=df_pred2[2],
        Class_4=df_pred2[3],
    )
)

In [17]:
# Build the CatBoost frame: take the sample-submission file, remove its four
# Class_* columns and re-add them from the CatBoost probabilities.
data_test3 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred3[0],
        Class_2=df_pred3[1],
        Class_3=df_pred3[2],
        Class_4=df_pred3[3],
    )
)

In [18]:
# Write the CatBoost-based frame (data_test3) to CSV, omitting the DataFrame index.
# Only this frame is written here; data_test1 and data_test2 are not saved in this cell.
data_test3.to_csv('submission_v4.csv', index=False)