In [1]:
import pandas as pd
import umap.umap_ as umap
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from imblearn.over_sampling import RandomOverSampler

from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss

# ML
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
# Load the train and test tables and discard the `id` column from each.
data_train = pd.read_csv('data/train.csv').drop(columns='id')
data_test = pd.read_csv('data/test.csv').drop(columns='id')

In [3]:
# `data_train_num` is a second name bound to the same DataFrame (plain
# assignment, no copy is made).
data_train_num = data_train

# Separate the label column from the remaining feature columns.
y = data_train_num['target']
X = data_train_num.drop(columns='target')

In [4]:
# Randomly oversample with the 'minority' strategy, then print the resulting
# class counts as a sanity check.
#
# random_state is pinned: without it the set of duplicated rows changes on
# every run, and so does everything that is scaled and trained on X_over /
# y_over afterwards.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_over, y_over = oversample.fit_resample(X, y)
print(y_over.value_counts())

Class_1    57497
Class_2    57497
Class_3    21420
Class_4    12593
Name: target, dtype: int64


In [5]:
# Standardise the features: the scaler is fitted on the oversampled training
# features only and then applied, unchanged, to both the training and the
# test features. Each result is wrapped in a plain pd.DataFrame.
scaler = StandardScaler().fit(X_over)
X_scaled = pd.DataFrame(data=scaler.transform(X_over))
test_scaled = pd.DataFrame(data=scaler.transform(data_test))

<h3> XGBoost </h3>

In [6]:
# Zero-filled probability buffers for the XGBoost run:
#   log_pred  - one row per (oversampled) training row, filled fold by fold
#   test_pred - one row per test row, accumulates the per-fold predictions
# The column count comes from the labels instead of a hard-coded 4, so the
# buffers stay in step with the label set if it ever changes.
n_classes = y_over.nunique()
log_pred = np.zeros((len(X_over), n_classes))
test_pred = np.zeros((len(data_test), n_classes))

In [7]:
# XGBoost classifier built with no explicit hyperparameters; this single
# instance is the one fitted inside the cross-validation loop that follows.
xgb_model = XGBClassifier()

In [8]:
%%time

# 5-fold stratified cross-validation for the XGBoost model.
# Out-of-fold probabilities are written into `log_pred`; the test-set
# probabilities of every fold are summed in `test_pred` and averaged into
# `test_pred1`.
#
# NOTE(review): X_scaled / y_over were oversampled before this split, so copies
# of the same original row can end up in both the training and the validation
# part of a fold, and the scores printed here are measured on the resampled
# class distribution. Consider resampling inside each fold instead.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reset the accumulator so that re-running this cell on its own does not add a
# second round of fold predictions on top of whatever is already stored.
test_pred.fill(0.0)

for fold_, (train_index, val_index) in enumerate(skf.split(X_scaled, y_over)):
    print('Fold: ', fold_)
    X_val = X_scaled.iloc[val_index]
    y_val = y_over.iloc[val_index]

    model = xgb_model.fit(
        X_scaled.iloc[train_index],
        y_over.iloc[train_index],
        # Both the training slice and the validation slice are passed as
        # evaluation sets, in that order.
        eval_set=[(X_scaled.iloc[train_index], y_over.iloc[train_index]), (X_val, y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=0,
    )

    # Out-of-fold probabilities for the rows held out in this fold.
    temp_pred = model.predict_proba(X_val)
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y_val, temp_pred)}')

    # Accumulate this fold's test-set probabilities.
    temp_test = model.predict_proba(test_scaled)
    test_pred += temp_test

# Average over the folds using the splitter's own fold count, so the divisor
# cannot drift away from n_splits.
test_pred1 = test_pred / skf.get_n_splits()

print(f'Overall Log Loss: {log_loss(y_over, log_pred)}')

Fold:  0
Log Loss: 1.2033105453351434
Fold:  1
Log Loss: 1.203603615078436
Fold:  2
Log Loss: 1.203194490923449
Fold:  3
Log Loss: 1.2028007222291572
Fold:  4
Log Loss: 1.2036758302064203
Overall Log Loss: 1.203317052360181
Wall time: 4min 11s


<h3> LightGBM (Light Gradient Boosting Machine) </h3>

In [9]:
# Fresh zero-filled probability buffers for the LightGBM run:
#   log_pred  - one row per (oversampled) training row, filled fold by fold
#   test_pred - one row per test row, accumulates the per-fold predictions
# The column count comes from the labels instead of a hard-coded 4, so the
# buffers stay in step with the label set if it ever changes.
n_classes = y_over.nunique()
log_pred = np.zeros((len(X_over), n_classes))
test_pred = np.zeros((len(data_test), n_classes))

In [10]:
# LightGBM classifier built with no explicit hyperparameters; this single
# instance is the one fitted inside the cross-validation loop that follows.
lg_model = LGBMClassifier()

In [11]:
%%time

# 5-fold stratified cross-validation for the LightGBM model.
# Out-of-fold probabilities are written into `log_pred`; the test-set
# probabilities of every fold are summed in `test_pred` and averaged into
# `test_pred2`.
#
# NOTE(review): X_scaled / y_over were oversampled before this split, so copies
# of the same original row can end up in both the training and the validation
# part of a fold, and the scores printed here are measured on the resampled
# class distribution. Consider resampling inside each fold instead.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reset the accumulator so that re-running this cell on its own does not add a
# second round of fold predictions on top of whatever is already stored.
test_pred.fill(0.0)

for fold_, (train_index, val_index) in enumerate(skf.split(X_scaled, y_over)):
    print('Fold: ', fold_)
    X_val = X_scaled.iloc[val_index]
    y_val = y_over.iloc[val_index]

    model = lg_model.fit(
        X_scaled.iloc[train_index],
        y_over.iloc[train_index],
        # Both the training slice and the validation slice are passed as
        # evaluation sets, in that order. The training slice is deliberately
        # re-sliced here rather than reusing the object given to fit().
        eval_set=[(X_scaled.iloc[train_index], y_over.iloc[train_index]), (X_val, y_val)],
        eval_metric='multi_logloss',
        early_stopping_rounds=50,
        verbose=0
    )

    # Out-of-fold probabilities for the rows held out in this fold.
    temp_pred = model.predict_proba(X_val)
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y_val, temp_pred)}')

    # Accumulate this fold's test-set probabilities.
    temp_test = model.predict_proba(test_scaled)
    test_pred += temp_test

# Average over the folds using the splitter's own fold count, so the divisor
# cannot drift away from n_splits.
test_pred2 = test_pred / skf.get_n_splits()

print(f'Overall Log Loss: {log_loss(y_over, log_pred)}')

Fold:  0
Log Loss: 1.1570779109307452
Fold:  1
Log Loss: 1.157796982798342
Fold:  2
Log Loss: 1.1571630510860496
Fold:  3
Log Loss: 1.1569614299362623
Fold:  4
Log Loss: 1.1591827856299906
Overall Log Loss: 1.1576364294054609
Wall time: 26.7 s


<h3> CatBoost </h3>

In [12]:
# Fresh zero-filled probability buffers for the CatBoost run:
#   log_pred  - one row per (oversampled) training row, filled fold by fold
#   test_pred - one row per test row, accumulates the per-fold predictions
# The column count comes from the labels instead of a hard-coded 4, so the
# buffers stay in step with the label set if it ever changes.
n_classes = y_over.nunique()
log_pred = np.zeros((len(X_over), n_classes))
test_pred = np.zeros((len(data_test), n_classes))

In [13]:
# CatBoost classifier built with no explicit hyperparameters; this single
# instance is the one fitted inside the cross-validation loop that follows.
cat_model = CatBoostClassifier()

In [14]:
%%time

# 10-fold stratified cross-validation for the CatBoost model.
# Out-of-fold probabilities are written into `log_pred`; the test-set
# probabilities of every fold are summed in `test_pred` and averaged into
# `test_pred3`.
#
# NOTE(review): X_scaled / y_over were oversampled before this split, so copies
# of the same original row can end up in both the training and the validation
# part of a fold, and the scores printed here are measured on the resampled
# class distribution. Consider resampling inside each fold instead.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Reset the accumulator so that re-running this cell on its own does not add a
# second round of fold predictions on top of whatever is already stored.
test_pred.fill(0.0)

for fold_, (train_index, val_index) in enumerate(skf.split(X_scaled, y_over)):
    print('Fold: ', fold_)
    X_val = X_scaled.iloc[val_index]
    y_val = y_over.iloc[val_index]

    model = cat_model.fit(
        X_scaled.iloc[train_index],
        y_over.iloc[train_index],
        # Both the training slice and the validation slice are passed as
        # evaluation sets, in that order.
        eval_set=[(X_scaled.iloc[train_index], y_over.iloc[train_index]), (X_val, y_val)],
        early_stopping_rounds=50,
        verbose=0
    )

    # Out-of-fold probabilities for the rows held out in this fold.
    temp_pred = model.predict_proba(X_val)
    log_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y_val, temp_pred)}')

    # Accumulate this fold's test-set probabilities.
    temp_test = model.predict_proba(test_scaled)
    test_pred += temp_test

# Average over the folds using the splitter's own fold count, so the divisor
# cannot drift away from n_splits.
test_pred3 = test_pred / skf.get_n_splits()

print(f'Overall Log Loss: {log_loss(y_over, log_pred)}')

Fold:  0
Log Loss: 1.0986101541984499
Fold:  1
Log Loss: 1.0999919204272166
Fold:  2
Log Loss: 1.1013291459060564
Fold:  3
Log Loss: 1.1002181619976632
Fold:  4
Log Loss: 1.0978265155508768
Fold:  5
Log Loss: 1.0987718025361604
Fold:  6
Log Loss: 1.0983788316624317
Fold:  7
Log Loss: 1.0959364933621065
Fold:  8
Log Loss: 1.09917787963154
Fold:  9
Log Loss: 1.1027067872229077
Overall Log Loss: 1.09929476967333
Wall time: 7min 50s


In [15]:
# Wrap the fold-averaged test probabilities of each run in a DataFrame
# (1 = XGBoost, 2 = LightGBM, 3 = CatBoost).
df_pred1, df_pred2, df_pred3 = (
    pd.DataFrame(averaged) for averaged in (test_pred1, test_pred2, test_pred3)
)

In [16]:
# Submission frame for the XGBoost run: take the sample submission, remove its
# four class columns, then refill them from df_pred1 (column 0 -> Class_1,
# column 1 -> Class_2, and so on).
data_test1 = pd.read_csv('data/sample_submission.csv').drop(
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4']
)

for position, class_name in enumerate(['Class_1', 'Class_2', 'Class_3', 'Class_4']):
    data_test1[class_name] = df_pred1[position]

In [17]:
# Submission frame for the LightGBM run: take the sample submission, remove its
# four class columns, then refill them from df_pred2 (column 0 -> Class_1,
# column 1 -> Class_2, and so on).
data_test2 = pd.read_csv('data/sample_submission.csv').drop(
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4']
)

for position, class_name in enumerate(['Class_1', 'Class_2', 'Class_3', 'Class_4']):
    data_test2[class_name] = df_pred2[position]

In [18]:
# Submission frame for the CatBoost run: take the sample submission, remove its
# four class columns, then refill them from df_pred3 (column 0 -> Class_1,
# column 1 -> Class_2, and so on).
data_test3 = pd.read_csv('data/sample_submission.csv').drop(
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4']
)

for position, class_name in enumerate(['Class_1', 'Class_2', 'Class_3', 'Class_4']):
    data_test3[class_name] = df_pred3[position]

In [19]:
# Save the CatBoost-based submission frame without the DataFrame index.
# data_test1 and data_test2 are built above but are not written by this cell.
data_test3.to_csv(path_or_buf='submission_v4.csv', index=False)