In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Globally hide scikit-learn ConvergenceWarning messages for the rest of the notebook.
# Note: this also hides the signal that a solver stopped before converging.
warnings.simplefilter("ignore", category=ConvergenceWarning)

* Importing data

In [None]:
# Mount Google Drive under /content/drive (Colab-only helper); the CSV files
# read in the next cell live below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training and test tables from Google Drive.
# index_col=0 makes the first CSV column the row index instead of a feature.
import pandas as pd

train_data = pd.read_csv('/content/drive/MyDrive/Тестовое задание на стажировку ML-инженер/trainee_train.csv', index_col=0)
test_data = pd.read_csv('/content/drive/MyDrive/Тестовое задание на стажировку ML-инженер/trainee_test_fish.csv', index_col=0)

* Exploratory analysis

In [None]:
train_data.head()

Unnamed: 0,im,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v437,v438,v439,v440,v441,v442,v443,v444,v445,v446
0,0.0,128.100243,128.157072,129.255102,128.273006,126.532819,129.023232,129.854839,129.828431,129.868526,...,128.800985,128.232694,128.211893,130.512476,130.41116,130.650165,130.904685,131.017268,130.477398,130.49615
1,1.0,128.100243,128.157072,126.636364,128.273006,130.810403,129.023232,128.633333,128.994152,128.994152,...,128.914365,128.232694,128.211893,128.167519,130.41116,130.650165,130.904685,131.017268,130.477398,130.49615
2,1.0,128.100243,128.157072,129.255102,128.273006,130.810403,129.023232,126.769231,126.926295,127.19646,...,128.800985,128.232694,128.211893,128.167519,128.051724,127.709786,127.349282,127.063052,125.169118,125.274194
3,1.0,128.100243,128.157072,127.630027,128.273006,126.532819,129.023232,126.769231,126.926295,127.19646,...,128.800985,130.794007,131.026119,130.512476,130.41116,130.650165,130.904685,131.017268,130.477398,130.49615
4,1.0,128.100243,128.336364,129.255102,128.273006,130.810403,126.04298,129.854839,126.926295,127.19646,...,128.800985,130.794007,131.026119,130.512476,130.41116,130.650165,130.904685,131.017268,130.477398,130.49615


In [None]:
test_data.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v437,v438,v439,v440,v441,v442,v443,v444,v445,v446
0,130.798507,131.174905,129.255102,132.199234,128.746479,130.381148,129.854839,129.828431,129.868526,130.917969,...,128.800985,128.232694,128.211893,128.167519,126.1,127.709786,127.349282,127.063052,127.825,126.611554
1,128.100243,128.157072,127.630027,128.273006,130.810403,129.023232,126.769231,126.926295,127.19646,127.433333,...,128.800985,130.794007,127.75,130.512476,130.41116,130.650165,130.904685,131.017268,130.477398,130.49615
2,128.100243,128.157072,129.255102,128.273006,130.810403,126.04298,129.854839,129.828431,129.868526,128.618063,...,128.800985,130.794007,131.026119,130.512476,130.41116,130.650165,130.904685,131.017268,130.477398,130.49615
3,130.798507,131.174905,129.255102,132.199234,132.860465,130.381148,129.854839,129.828431,129.868526,130.917969,...,126.503759,128.232694,128.211893,128.167519,128.051724,127.709786,127.349282,127.063052,127.825,126.611554
4,130.798507,131.174905,129.255102,132.199234,130.810403,130.381148,129.854839,129.828431,129.868526,130.917969,...,128.914365,128.232694,128.211893,128.167519,128.051724,127.709786,127.349282,127.063052,125.169118,126.611554


In [None]:
# Total number of missing cells in the training frame (per-column counts, then summed)
train_data.isna().sum().sum()

0

In [None]:
# Separate features from the target and replace extreme values column by column.
# The fence limits and the replacement value are derived from the training
# features only and are then applied unchanged to the test features.
x_train = train_data.drop('im', axis=1)
# Work on a copy so the raw `test_data` frame is not modified in place
# (the original code aliased it, silently rewriting the loaded data).
x_test = test_data.copy()

for column in x_train.columns:
    Q1 = x_train[column].quantile(0.25)
    Q3 = x_train[column].quantile(0.75)
    IQR = Q3 - Q1
    # Wide fence: 3 * IQR beyond the quartiles.
    # NOTE(review): when IQR is 0 both bounds equal Q1, so every value that
    # differs from Q1 is replaced -- confirm this is intended for such columns.
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR

    # Build each outlier mask once and reuse it.
    train_outliers = (x_train[column] < lower_bound) | (x_train[column] > upper_bound)
    test_outliers = (x_test[column] < lower_bound) | (x_test[column] > upper_bound)

    # Mean of the training values that lie inside the fence
    mean_value = x_train.loc[~train_outliers, column].mean()

    # Replace outliers in both training and test data with that training mean
    x_train.loc[train_outliers, column] = mean_value
    x_test.loc[test_outliers, column] = mean_value

# Plain NumPy arrays are what the modelling cells consume.
X_train = x_train.to_numpy()
Y_train = train_data['im'].to_numpy()

x_test = x_test.to_numpy()

In [None]:
# Count the samples per class. The classes are imbalanced, so oversampling
# (BorderlineSMOTE) is applied inside each pipeline during cross-validation.
from collections import Counter
Counter(Y_train)

Counter({0.0: 174, 1.0: 442})

In [None]:
!pip install imbalanced-learn



* Standardizing + stratified K-fold (it is better to look at the mean score across all folds than at a single split)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

# Stratified 5-fold splitter with shuffling and a fixed seed; it is passed as
# `cv` to the cross_val_score calls in this notebook.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=52)

* Baseline (Logistic Regression Model)




In [None]:
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import roc_auc_score

# Baseline: oversample the minority class, standardize, then fit logistic regression.
# Resampling sits inside the pipeline, so it is fitted on the training part of each fold.
pipe = Pipeline(
    steps=[
        ('smote', BorderlineSMOTE(kind='borderline-1', random_state=52)),
        ('standardscaler', StandardScaler()),
        ('logisticregression', LogisticRegression(random_state=52)),
    ]
)

# Per-fold ROC AUC followed by its mean, one per line
roc_auc_scores = cross_val_score(pipe, X_train, Y_train, scoring='roc_auc', cv=kfold)
print(roc_auc_scores, roc_auc_scores.mean(), sep='\n')

[0.94574639 0.89415584 0.89220779 0.91266234 0.89160608]
0.9072756886206985


* Approach: SVM

In [None]:
from sklearn.svm import SVC

# Same preprocessing as the baseline, with an RBF-kernel SVM as the final estimator.
pipe = Pipeline(
    steps=[
        ('smote', BorderlineSMOTE(kind='borderline-1', random_state=52)),
        ('standardscaler', StandardScaler()),
        ('svmmodel', SVC(C=1.0, kernel='rbf', random_state=52)),
    ]
)

# Per-fold ROC AUC followed by its mean, one per line
roc_auc_scores = cross_val_score(pipe, X_train, Y_train, scoring='roc_auc', cv=kfold)
print(roc_auc_scores, roc_auc_scores.mean(), sep='\n')

[0.96083467 0.93603896 0.92694805 0.91201299 0.87541309]
0.9222495515059956


In [None]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [None]:
# Trying to pick the best parameters with PCA

import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.decomposition import PCA


def svm_bo(C, kernel='rbf'):
  """Objective for the Bayesian search: mean cross-validated ROC AUC of the SVM pipeline.

  Args:
    C: value assigned to the SVC `C` parameter.
    kernel: value assigned to the SVC `kernel` parameter.

  Returns:
    Mean of the per-fold ROC AUC scores computed on X_train / Y_train with `kfold`.
  """
  # A PCA step was tried between the scaler and the SVM and is currently disabled.
  estimator = Pipeline(steps=[
      ('smote', BorderlineSMOTE(random_state=52, kind='borderline-1')),
      ('standardscaler', StandardScaler()),
      ('svm', SVC(random_state=52)),
  ])
  estimator.set_params(svm__C=C, svm__kernel=kernel)

  fold_auc = cross_val_score(estimator, X_train, Y_train, cv=kfold, scoring='roc_auc')
  return fold_auc.mean()



# Search bounds for the SVM objective: only C is tuned.
params_svm = {
    'C': (1, 20),
    # 'n_components': (200, 440)  # disabled PCA dimension
}

# Run the Bayesian search. The optimizer gets its own name so that the
# `svm_bo` objective function is not overwritten by the optimizer object.
svm_optimizer = BayesianOptimization(svm_bo, params_svm, random_state=52)
svm_optimizer.maximize(init_points=5, n_iter=20)

# Parameters of the best-scoring evaluation
best = svm_optimizer.max['params']
print(best)

|   iter    |  target   |     C     |
-------------------------------------
| [0m1        [0m | [0m0.9251   [0m | [0m16.64    [0m |
| [0m2        [0m | [0m0.9214   [0m | [0m1.496    [0m |
| [0m3        [0m | [0m0.9249   [0m | [0m5.005    [0m |
| [0m4        [0m | [0m0.9251   [0m | [0m12.75    [0m |
| [0m5        [0m | [0m0.9241   [0m | [0m2.867    [0m |
| [0m6        [0m | [0m0.9251   [0m | [0m18.14    [0m |
| [0m7        [0m | [0m0.9251   [0m | [0m14.05    [0m |
| [0m8        [0m | [0m0.9251   [0m | [0m20.0     [0m |
| [0m9        [0m | [0m0.9251   [0m | [0m10.63    [0m |
| [0m10       [0m | [0m0.9251   [0m | [0m8.546    [0m |
| [0m11       [0m | [0m0.9251   [0m | [0m6.87     [0m |
| [0m12       [0m | [0m0.9251   [0m | [0m15.35    [0m |
| [0m13       [0m | [0m0.9251   [0m | [0m9.6      [0m |
| [0m14       [0m | [0m0.9251   [0m | [0m11.71    [0m |
| [0m15       [0m | [0m0.9251   [0m | [0m19.11    

* Approach: CatBoostClassifier

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
# Try to find the best parameters

# Optimal parameters with PCA and SS: {'depth': 6.796939520449791, 'iterations': 853.199215524583, 'learning_rate': 0.2914911435217489}
# Target    | depth     | iter      | lr
# 0.9262    | 6.797     | 853.2     | 0.2915

# In theory PCA + StandardScaler + SMOTE should not improve the CatBoost solution,
# because CatBoost searches through every feature and picks the best split value.
# Features that are not important should not affect convergence.


# TODO: Try to remove PCA and SS in pipeline
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier


def catboost_bo(learning_rate, iterations, depth):
  """Objective for the Bayesian search: mean cross-validated ROC AUC of CatBoost.

  Args:
    learning_rate: CatBoost learning rate, used as given.
    iterations: number of boosting iterations; truncated to an integer.
    depth: tree depth; truncated to an integer.

  Returns:
    Mean of the per-fold ROC AUC scores computed on X_train / Y_train with `kfold`.
  """
  # SMOTE and scaling steps are disabled here; the pipeline holds only the classifier.
  estimator = Pipeline(steps=[
      ('catboost', CatBoostClassifier(logging_level='Silent')),
  ])
  estimator.set_params(
      catboost__learning_rate=learning_rate,
      catboost__iterations=int(iterations),
      catboost__depth=int(depth),
      catboost__loss_function='Logloss',
      catboost__custom_metric='AUC',
  )

  fold_auc = cross_val_score(estimator, X_train, Y_train, cv=kfold, scoring='roc_auc')
  return fold_auc.mean()



# Search bounds for the CatBoost objective.
params_catboost = {
    'learning_rate':  (0.1, 1),
    'iterations': (100, 1000),
    'depth': (2, 9),
}

# Run the Bayesian search. The optimizer gets its own name so that the
# `catboost_bo` objective function is not overwritten by the optimizer object.
catboost_optimizer = BayesianOptimization(catboost_bo, params_catboost, random_state=111)
catboost_optimizer.maximize(init_points=5, n_iter=25)

# Parameters of the best-scoring evaluation
best = catboost_optimizer.max['params']
print(best)

|   iter    |  target   |   depth   | iterat... | learni... |
-------------------------------------------------------------
| [0m1        [0m | [0m0.9184   [0m | [0m6.285    [0m | [0m252.2    [0m | [0m0.4925   [0m |
| [95m2        [0m | [95m0.9237   [0m | [95m7.385    [0m | [95m365.8    [0m | [95m0.2342   [0m |
| [0m3        [0m | [0m0.9152   [0m | [0m2.157    [0m | [0m478.2    [0m | [0m0.3148   [0m |
| [0m4        [0m | [0m0.9216   [0m | [0m4.364    [0m | [0m991.6    [0m | [0m0.314    [0m |
| [0m5        [0m | [0m0.9127   [0m | [0m2.568    [0m | [0m702.6    [0m | [0m0.6591   [0m |
| [95m6        [0m | [95m0.9268   [0m | [95m6.728    [0m | [95m366.7    [0m | [95m0.2943   [0m |
| [0m7        [0m | [0m0.9149   [0m | [0m5.157    [0m | [0m368.3    [0m | [0m0.3317   [0m |
| [0m8        [0m | [0m0.9121   [0m | [0m8.036    [0m | [0m367.1    [0m | [0m0.4215   [0m |
| [0m9        [0m | [0m0.9082   [0m | [0m6.57

* Approach: The best parameters of XGBoost

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBClassifier

# Optimal parameters with PCA and SS:
# |  target   | colsam... |   gamma   | learni... | max_depth | n_esti... | subsample |
# | 0.9317    | 0.6379    | 1.476     | 0.1127    | 14.73     | 703.8     | 0.7479    |
def xgb_bo(max_depth, learning_rate, n_estimators, gamma, subsample, colsample_bytree):
    """Objective for the Bayesian search: mean cross-validated ROC AUC of the XGBoost pipeline.

    `max_depth` and `n_estimators` are truncated to integers; the remaining
    arguments are forwarded to the classifier unchanged.

    Returns:
        Mean of the per-fold ROC AUC scores computed on X_train / Y_train with `kfold`.
    """
    estimator = Pipeline(steps=[
        ('smote', BorderlineSMOTE(random_state=52, kind='borderline-1')),
        ('standardscaler', StandardScaler()),
        ('xgbclassifier', XGBClassifier(random_state=52)),
    ])

    # Configure the final estimator through the pipeline's step-prefixed parameters.
    estimator.set_params(
        xgbclassifier__max_depth=int(max_depth),
        xgbclassifier__learning_rate=learning_rate,
        xgbclassifier__n_estimators=int(n_estimators),
        xgbclassifier__gamma=gamma,
        xgbclassifier__subsample=subsample,
        xgbclassifier__colsample_bytree=colsample_bytree,
        xgbclassifier__use_label_encoder=False,
        xgbclassifier__eval_metric='logloss',
    )

    fold_auc = cross_val_score(estimator, X_train, Y_train, cv=kfold, scoring='roc_auc')
    return fold_auc.mean()

# Hyperparameter search bounds for the XGBoost objective
params_xgb = {
    'max_depth': (3, 20),
    'learning_rate': (0.01, 0.5),
    'n_estimators': (100, 2000),
    'gamma': (0, 10),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
}

# Run the Bayesian search. The optimizer gets its own name so that the
# `xgb_bo` objective function is not overwritten by the optimizer object.
xgb_optimizer = BayesianOptimization(xgb_bo, params_xgb, random_state=52)
xgb_optimizer.maximize(init_points=10, n_iter=50)

# Parameters of the best-scoring evaluation
best = xgb_optimizer.max['params']
print(best)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9181   [0m | [0m0.8061   [0m | [0m1.691    [0m | [0m0.2237   [0m | [0m16.08    [0m | [0m661.1    [0m | [0m0.5746   [0m |
| [95m2        [0m | [95m0.9256   [0m | [95m0.5112   [0m | [95m4.202    [0m | [95m0.127    [0m | [95m8.74     [0m | [95m1.982e+03[0m | [95m0.6189   [0m |
| [0m3        [0m | [0m0.9123   [0m | [0m0.5406   [0m | [0m6.696    [0m | [0m0.3144   [0m | [0m7.662    [0m | [0m985.8    [0m | [0m0.5592   [0m |
| [0m4        [0m | [0m0.9027   [0m | [0m0.537    [0m | [0m9.008    [0m | [0m0.399    [0m | [0m17.29    [0m | [0m1.649e+03[0m | [0m0.9955   [0m |
| [0m5        [0m | [0m0.9152   [0m | [0m0.7886   [0m | [0m8.138    [0m | [0m0.2164   [0m | [0m3.467    [0m | [0m962.9    [0m | [0m0.

In [None]:
# Another model with additional last 3 parameters

# |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
# | 0.9325    | 0.868     | 0.9236    | 0.3456    | 5.327     | 2.809     | 532.5     | 0.331     | 0.1515    | 0.9338   |
from xgboost import XGBClassifier

def xgb_bo(max_depth, learning_rate, n_estimators, gamma, subsample, colsample_bytree, min_child_weight, reg_lambda, reg_alpha):
    """Objective for the extended Bayesian search over the XGBoost pipeline.

    Compared with the six-parameter objective, this one also tunes
    `min_child_weight`, `reg_lambda` and `reg_alpha`. `max_depth` and
    `n_estimators` are truncated to integers; everything else is forwarded
    to the classifier unchanged.

    Returns:
        Mean of the per-fold ROC AUC scores computed on X_train / Y_train with `kfold`.
    """
    estimator = Pipeline(steps=[
        ('smote', BorderlineSMOTE(random_state=52, kind='borderline-1')),
        ('standardscaler', StandardScaler()),
        ('xgbclassifier', XGBClassifier(random_state=52)),
    ])

    # Configure the final estimator through the pipeline's step-prefixed parameters.
    estimator.set_params(
        xgbclassifier__max_depth=int(max_depth),
        xgbclassifier__learning_rate=learning_rate,
        xgbclassifier__n_estimators=int(n_estimators),
        xgbclassifier__gamma=gamma,
        xgbclassifier__subsample=subsample,
        xgbclassifier__colsample_bytree=colsample_bytree,
        xgbclassifier__use_label_encoder=False,
        xgbclassifier__eval_metric='logloss',
        xgbclassifier__min_child_weight=min_child_weight,
        xgbclassifier__reg_lambda=reg_lambda,
        xgbclassifier__reg_alpha=reg_alpha,
    )

    fold_auc = cross_val_score(estimator, X_train, Y_train, cv=kfold, scoring='roc_auc')
    return fold_auc.mean()

# Hyperparameter search bounds for the extended XGBoost objective
params_xgb = {
    'max_depth': (3, 20),
    'learning_rate': (0.01, 0.5),
    'n_estimators': (100, 2000),
    'gamma': (0, 10),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'min_child_weight': (1, 10),
    'reg_lambda': (0, 8),
    'reg_alpha': (0, 8)
}

# Run the Bayesian search. The optimizer gets its own name so that the
# `xgb_bo` objective function is not overwritten by the optimizer object.
xgb_optimizer = BayesianOptimization(xgb_bo, params_xgb, random_state=52)
xgb_optimizer.maximize(init_points=10, n_iter=100)

# Parameters of the best-scoring evaluation
best = xgb_optimizer.max['params']
print(best)

* Approach: Random Forest

In [None]:
# Doesn't make sense
from sklearn.ensemble import RandomForestClassifier


def rf_bo(max_depth, n_estimators, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features):
    """Objective for the Bayesian search: mean cross-validated ROC AUC of the random-forest pipeline.

    All arguments except `max_features` are truncated to integers before
    being set on the forest; `max_features` is forwarded as a float fraction.

    Returns:
        Mean of the per-fold ROC AUC scores computed on X_train / Y_train with `kfold`.
    """
    params = {
        'rf__max_depth': int(max_depth),
        'rf__n_estimators': int(n_estimators),
        'rf__min_samples_split': int(min_samples_split),
        'rf__min_samples_leaf': int(min_samples_leaf),
        'rf__max_leaf_nodes': int(max_leaf_nodes),
        'rf__max_features': max_features
    }

    # Seed both stochastic steps with the same value as the other pipelines in
    # this notebook, so the objective returns the same score for the same
    # parameters (an unseeded forest makes the search target noisy).
    pipe = Pipeline(steps=[
      ('smote', BorderlineSMOTE(random_state=52, kind='borderline-1')),
      ('standardscaler', StandardScaler()),
      ('rf', RandomForestClassifier(random_state=52))
    ])

    # Configure the forest, then score it with cross-validation
    pipe.set_params(**params)
    scores = cross_val_score(pipe, X_train, Y_train, cv=kfold, scoring='roc_auc')

    return scores.mean()

# Hyperparameter search bounds for the random-forest objective
params_rf = {
    'max_depth': (2, 50),
    'n_estimators': (100, 1000),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'max_leaf_nodes': (10, 1000),
    'max_features': (0.1, 1)
}

# Run the Bayesian search. The optimizer gets its own name: the original stored
# it in `xgb_bo` and then read `.max` from the `rf_bo` function, which raises
# AttributeError once the search finishes.
rf_optimizer = BayesianOptimization(rf_bo, params_rf, random_state=111)
rf_optimizer.maximize(init_points=10, n_iter=50)

# Parameters of the best-scoring evaluation
best = rf_optimizer.max['params']
print(best)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9219   [0m | [0m31.38    [0m | [0m0.2522   [0m | [0m441.7    [0m | [0m7.923    [0m | [0m4.363    [0m | [0m234.2    [0m |
| [0m2        [0m | [0m0.8957   [0m | [0m3.079    [0m | [0m0.4782   [0m | [0m246.3    [0m | [0m4.039    [0m | [0m9.926    [0m | [0m314.0    [0m |
| [0m3        [0m | [0m0.9149   [0m | [0m5.897    [0m | [0m0.7026   [0m | [0m625.0    [0m | [0m3.468    [0m | [0m5.73     [0m | [0m206.5    [0m |
| [0m4        [0m | [0m0.9142   [0m | [0m5.55     [0m | [0m0.9107   [0m | [0m796.0    [0m | [0m8.565    [0m | [0m8.522    [0m | [0m991.9    [0m |
| [95m5        [0m | [95m0.9226   [0m | [95m29.71    [0m | [95m0.8324   [0m | [95m427.1    [0m | [95m1.247    [0m | [95m5.633    [0m | [95m19

KeyboardInterrupt: 

* Making final prediction on test set !!! ✊

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Best parameters from the extended XGBoost search log:
# colsample 0.868 | gamma 0.9236 | learning_rate 0.3456 | max_depth 5.327 |
# min_child_weight 2.809 | n_estimators 532.5 | reg_alpha 0.331 |
# reg_lambda 0.1515 | subsample 0.9338
# reg_alpha / reg_lambda were previously swapped relative to that log.
params = {
    'colsample_bytree': 0.868,
    'gamma': 0.9236,
    'learning_rate': 0.3456,
    'max_depth': int(5.327),       # truncated exactly as in the search objective
    'min_child_weight': 2.809,
    'n_estimators': int(532.5),    # truncated exactly as in the search objective
    'reg_alpha': 0.331,
    'reg_lambda': 0.1515,
    'subsample': 0.9338,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 52,            # same seed as the cross-validated pipeline
}

# Defining models (seeded like the cross-validated pipeline so the fit is reproducible)
smote = BorderlineSMOTE(random_state=52, kind='borderline-1')
scaler = StandardScaler()
model = XGBClassifier(**params)

# Prepare data: oversample first, then fit the scaler on the resampled training set
X_train_smote, Y_train_smote = smote.fit_resample(X_train, Y_train)
X_train_scaled = scaler.fit_transform(X_train_smote)

# Train the model
model.fit(X_train_scaled, Y_train_smote)

# Making prediction: the test set is only transformed with the fitted scaler, never resampled
X_test_scaled = scaler.transform(x_test)
predictions = model.predict(X_test_scaled)

In [None]:
# Wrap the predicted labels in a single-column frame, write it out, and preview it.
df = pd.DataFrame(predictions, columns=['im'])

df.to_csv('/content/sample_data/answers.csv')
df.head()

Unnamed: 0,im
0,0
1,1
2,0
3,0
4,0


In [None]:
# Testing

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Same tuned parameters as the final model, with reg_alpha / reg_lambda in the
# order given by the optimizer log (reg_alpha 0.331, reg_lambda 0.1515).
params = {
    'colsample_bytree': 0.868,
    'gamma': 0.9236,
    'learning_rate': 0.3456,
    'max_depth': int(5.327),
    'min_child_weight': 2.809,
    'n_estimators': int(532.5),
    'reg_alpha': 0.331,
    'reg_lambda': 0.1515,
    'subsample': 0.9338,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 52,
}

# Separate objects for this check, so the fitted `smote` / `scaler` / `model`
# and the `predictions` from the final-prediction cell are not overwritten.
val_smote = BorderlineSMOTE(random_state=52, kind='borderline-1')
val_scaler = StandardScaler()
val_model = XGBClassifier(**params)

# Seeded, stratified 90/10 split: reproducible and keeps the class ratio in both parts.
# The held-out rows also took part in the cross-validated parameter search above,
# so this score is an optimistic sanity check rather than an unbiased estimate.
x_train_train, x_train_val, y_train_train, y_train_val = train_test_split(
    X_train, Y_train, test_size=0.1, shuffle=True, stratify=Y_train, random_state=52
)

# Oversample and fit the scaler on the training part only
X_train_smote, Y_train_smote = val_smote.fit_resample(x_train_train, y_train_train)
X_train_scaled = val_scaler.fit_transform(X_train_smote)
X_val_scaled = val_scaler.transform(x_train_val)

# Train the model
val_model.fit(X_train_scaled, Y_train_smote)

# ROC AUC on the hold-out part, using the predicted probability of the positive class
val_proba = val_model.predict_proba(X_val_scaled)
roc_auc_score(y_train_val, val_proba[:, 1])

0.9777777777777776