In [18]:
import pandas as pd
import umap.umap_ as umap
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss

# ML
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [2]:
# Read the train and test CSVs and discard the `id` column from each.
data_train, data_test = (
    pd.read_csv(csv_path).drop(columns='id')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# `data_train_num` is another name for the same training frame (no copy is made).
data_train_num = data_train

# Separate the label column from the remaining feature columns.
y = data_train_num['target']
X = data_train_num.drop(columns=['target'])

In [4]:
# Standardise the training features with a scaler fitted on X.
# The CV cells below train on the unscaled `X`, not on `X_scaled`.
scaler = StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X))
# Disabled: scaling of the test set. If re-enabled, prefer `scaler.transform`
# so the test data is scaled with the training statistics rather than re-fitted.
# test_scaled = pd.DataFrame(scaler.fit_transform(data_test))

# Zero-initialised probability buffers with 4 class columns: one row per
# training sample (out-of-fold predictions) and one row per test sample.
log_pred = np.zeros(shape=(X.shape[0], 4))
test_pred = np.zeros(shape=(data_test.shape[0], 4))

<h3> LightGBM and CatBoost </h3>

In [19]:
# LightGBM classifier with library-default hyper-parameters; this single
# instance is re-fitted on every fold of the cross-validation loop below.
lg_model = LGBMClassifier()

In [None]:
%%time

# 5-fold stratified cross-validation for LightGBM.
# Produces:
#   * out-of-fold class probabilities for every training row (scored below),
#   * `test_pred1`: test-set class probabilities averaged over the folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model-specific accumulators. The shared `log_pred` / `test_pred` arrays are
# used only as shape templates, so re-running this cell, or running the
# CatBoost cell afterwards, cannot mix predictions of different runs/models.
lgb_oof_pred = np.zeros_like(log_pred)
lgb_test_sum = np.zeros_like(test_pred)

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)
    # NOTE(review): `early_stopping_rounds` / `verbose` are accepted by fit()
    # only in lightgbm < 4.0; newer releases expect the equivalent callbacks
    # (`lightgbm.early_stopping`, `lightgbm.log_evaluation`) instead.
    model = lg_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        # First entry is the training fold, second the held-out fold.
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        eval_metric = 'multi_logloss',
        early_stopping_rounds = 50,
        verbose = 10
    )

    # Out-of-fold predictions: each training row is predicted exactly once,
    # by the model that did not see it during fitting.
    temp_pred = model.predict_proba(X.iloc[val_index])
    lgb_oof_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    # Sum the test-set probabilities; they are averaged after the loop.
    temp_test = model.predict_proba(data_test)
    lgb_test_sum += temp_test

# Divide by the actual number of folds instead of a hard-coded constant.
test_pred1 = lgb_test_sum / skf.get_n_splits()

print(f'Overall Log Loss: {log_loss(y, lgb_oof_pred)}')

In [5]:
# CatBoost classifier with library-default hyper-parameters; this single
# instance is re-fitted on every fold of the cross-validation loop below.
cat_model = CatBoostClassifier()

In [6]:
%%time

# 10-fold stratified cross-validation for CatBoost.
# Produces:
#   * out-of-fold class probabilities for every training row (scored below),
#   * `test_pred2`: test-set class probabilities averaged over the folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Model-specific accumulators. The shared `log_pred` / `test_pred` arrays are
# used only as shape templates: accumulating into them directly would add the
# CatBoost sums on top of whatever an earlier cell (or an earlier run of this
# cell) left there and silently skew the fold average.
cat_oof_pred = np.zeros_like(log_pred)
cat_test_sum = np.zeros_like(test_pred)

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)
    model = cat_model.fit(
        X.iloc[train_index],
        y.iloc[train_index],
        # First entry is the training fold, second the held-out fold.
        eval_set = [(X.iloc[train_index], y.iloc[train_index]), (X.iloc[val_index], y.iloc[val_index])],
        early_stopping_rounds = 50,
        verbose = 10
    )

    # Out-of-fold predictions: each training row is predicted exactly once,
    # by the model that did not see it during fitting.
    temp_pred = model.predict_proba(X.iloc[val_index])
    cat_oof_pred[val_index] = temp_pred

    print(f'Log Loss: {log_loss(y.iloc[val_index], temp_pred)}')

    # Sum the test-set probabilities; they are averaged after the loop.
    temp_test = model.predict_proba(data_test)
    cat_test_sum += temp_test

# Divide by the actual number of folds instead of a hard-coded constant.
test_pred2 = cat_test_sum / skf.get_n_splits()

print(f'Overall Log Loss: {log_loss(y, cat_oof_pred)}')

 1.0951644	best: 1.0951644 (80)	total: 2.43s	remaining: 27.6s
90:	learn: 1.0872008	test: 1.0872008	test1: 1.0942287	best: 1.0942287 (90)	total: 2.75s	remaining: 27.5s
100:	learn: 1.0853168	test: 1.0853168	test1: 1.0935055	best: 1.0935055 (100)	total: 3.05s	remaining: 27.2s
110:	learn: 1.0834235	test: 1.0834235	test1: 1.0926671	best: 1.0926671 (110)	total: 3.35s	remaining: 26.8s
120:	learn: 1.0815685	test: 1.0815685	test1: 1.0923056	best: 1.0922761 (117)	total: 3.64s	remaining: 26.4s
130:	learn: 1.0799798	test: 1.0799798	test1: 1.0917776	best: 1.0917776 (130)	total: 3.97s	remaining: 26.4s
140:	learn: 1.0783451	test: 1.0783451	test1: 1.0914023	best: 1.0914023 (140)	total: 4.28s	remaining: 26.1s
150:	learn: 1.0768035	test: 1.0768035	test1: 1.0909855	best: 1.0909855 (150)	total: 4.61s	remaining: 25.9s
160:	learn: 1.0753976	test: 1.0753976	test1: 1.0906919	best: 1.0906542 (159)	total: 4.92s	remaining: 25.7s
170:	learn: 1.0741034	test: 1.0741034	test1: 1.0905089	best: 1.0905089 (170)	total: 

In [7]:
# Wrap the two fold-averaged test prediction arrays in DataFrames
# (columns are the positional class indices 0..3).
df_pred1, df_pred2 = map(pd.DataFrame, (test_pred1, test_pred2))

In [12]:
# Inspect `df_pred1`, the DataFrame view of `test_pred1` (LightGBM CV cell).
df_pred1

Unnamed: 0,0,1,2,3
0,0.092975,0.615122,0.169662,0.122241
1,0.085693,0.681441,0.148447,0.084420
2,0.086204,0.622103,0.184109,0.107584
3,0.084022,0.531593,0.282916,0.101468
4,0.072276,0.610124,0.196527,0.121074
...,...,...,...,...
49995,0.086505,0.723997,0.127859,0.061639
49996,0.083712,0.638884,0.138881,0.138523
49997,0.085968,0.523624,0.221560,0.168847
49998,0.077516,0.590777,0.166377,0.165331


In [None]:
# Inspect `df_pred2`, the DataFrame view of `test_pred2` (CatBoost CV cell).
df_pred2

In [13]:
# Submission frame for the first model: take the sample submission, drop its
# placeholder class columns and attach the columns of `df_pred1` in their place
# (aligned on the row index).
data_test1 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred1[0],
        Class_2=df_pred1[1],
        Class_3=df_pred1[2],
        Class_4=df_pred1[3],
    )
)

In [None]:
# Submission frame for the second model: take the sample submission, drop its
# placeholder class columns and attach the columns of `df_pred2` in their place
# (aligned on the row index).
data_test2 = (
    pd.read_csv('data/sample_submission.csv')
    .drop(columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    .assign(
        Class_1=df_pred2[0],
        Class_2=df_pred2[1],
        Class_3=df_pred2[2],
        Class_4=df_pred2[3],
    )
)

In [15]:
# Inspect the submission frame (id + Class_1..Class_4). `data_test` is the raw
# test feature table on a fresh kernel, so the submission built from
# `df_pred1` is shown explicitly instead.
data_test1

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.092975,0.615122,0.169662,0.122241
1,100001,0.085693,0.681441,0.148447,0.084420
2,100002,0.086204,0.622103,0.184109,0.107584
3,100003,0.084022,0.531593,0.282916,0.101468
4,100004,0.072276,0.610124,0.196527,0.121074
...,...,...,...,...,...
49995,149995,0.086505,0.723997,0.127859,0.061639
49996,149996,0.083712,0.638884,0.138881,0.138523
49997,149997,0.085968,0.523624,0.221560,0.168847
49998,149998,0.077516,0.590777,0.166377,0.165331


In [16]:
# Write the submission frame, not `data_test` (the raw test features).
# `data_test1` holds the predictions from `df_pred1`; use `data_test2` here
# to submit the predictions from `df_pred2` instead.
data_test1.to_csv('submission_v3.csv', index=False)