In [15]:
!pip install lightgbm
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install catboost
!pip install xgboost
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score,precision_score,recall_score,confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

In [2]:
file_path = "data/train.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,target,id,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_409,feature_410,feature_411,feature_412,feature_413,feature_414,feature_415,feature_416,feature_417,feature_418
0,0,0,0.468142,-1.045346,0.0,0.384487,0.435121,-1.178548,0.124543,1.801544,...,-0.361507,-1.026853,0.0,1.4186,-0.929668,1.284014,0.731842,0.801786,-0.728297,-0.412095
1,0,1,-0.760983,0.515132,0.0,-1.673905,-0.393862,-1.584207,-0.439778,0.796104,...,-0.546275,-1.489542,0.0,-0.622007,-0.473156,0.78002,0.648577,0.6461,-0.789362,0.083349
2,0,2,1.658855,0.915052,0.0,-0.581082,0.477199,-0.622226,0.390642,0.753299,...,-0.485999,0.586012,0.0,0.361481,-0.364566,-1.318596,-0.385155,0.140133,0.123245,-0.67003
3,0,3,-0.638854,0.314099,0.0,0.000919,1.102342,-0.807371,0.329158,0.484305,...,0.321985,-0.075827,0.0,-1.629672,0.876864,0.411271,0.43344,0.997364,2.82959,-1.275588
4,0,4,-1.091376,0.859811,0.0,-0.505439,1.665086,-0.912464,-0.332054,0.707705,...,0.828886,0.140387,0.0,-0.624304,-2.197691,-1.479267,-0.465917,-0.014757,-0.320434,-0.511896


In [3]:
X = df.drop(["target"],axis=1)
y = df["target"]

In [4]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}

In [7]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [14]:
# Test data

file_path = "data/test.csv"
df = pd.read_csv(file_path)
df.head()

scaler = StandardScaler()
scaler.fit(df)
output_X = scaler.transform(df)

output_df = pd.read_csv('data/input/baseline_submission_case1.csv', index_col='id')

## BaseLine модель (LightGbm)

In [9]:
# rec 32
# prec 94
# roc-auc 98

import lightgbm as lgb

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': 10,
    'min_child_samples': 10,
    'class_weight': 'balanced',
    'n_estimators': 1000
}

lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X_resampled,y_resampled)

lgb_clf.predict(X_test)
y_pred = lgb_clf.predict(X_test)
y_pred_proba = lgb_clf.predict_proba(X_test)[:, 1]

print("recall:", recall_score(y_test,y_pred))
print("precision:", precision_score(y_test,y_pred))
print("roc-auc:", roc_auc_score(y_test,y_pred_proba))
print("confusion matrix:", confusion_matrix(y_test,y_pred))

[LightGBM] [Info] Number of positive: 37888, number of negative: 37888
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106693
[LightGBM] [Info] Number of data points in the train set: 75776, number of used features: 419
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
recall: 0.32075471698113206
precision: 0.9444444444444444
roc-auc: 0.9839369013636228
confusion matrix: [[7585    3]
 [ 108   51]]


In [10]:
threshold = 0.3
y_pred_custom = (y_pred_proba >= threshold).astype(int)

precision = precision_score(y_test, y_pred_custom)
recall = recall_score(y_test, y_pred_custom)
print(f"Precision: {precision}, Recall: {recall}")

Precision: 0.6910569105691057, Recall: 0.5345911949685535


In [15]:
output = lgb_clf.predict_proba(output_X)[:,1]
output_df['target'] = output
output_df.to_csv('data/output/submission.csv', index=True)

## Модель Catboost

In [15]:
import catboost as ctb
from catboost import CatBoostClassifier

cat_clf = CatBoostClassifier(
    iterations=1000,              
    learning_rate=0.1,             
    depth=6,                       
    class_weights=class_weights,   
    eval_metric='AUC',             
    random_seed=42,               
    verbose=100)                   

cat_clf.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),  
    early_stopping_rounds=50           
)

y_pred = cat_clf.predict(X_test)

y_pred_proba = cat_clf.predict_proba(X_test)[:, 1]

recall_ctb = recall_score(y_test, y_pred)
precision_ctb = precision_score(y_test, y_pred)
roc_auc_ctb = roc_auc_score(y_test, y_pred_proba)
conf_matrix_ctb = confusion_matrix(y_test, y_pred)

print("Recall:", recall_ctb)
print("Precision:", precision_ctb)
print("ROC-AUC:", roc_auc_ctb)
print("Confusion Matrix:\n", conf_matrix_ctb)

0:	test: 0.7108510	best: 0.7108510 (0)	total: 201ms	remaining: 3m 20s
100:	test: 0.7636130	best: 0.7775186 (65)	total: 3.73s	remaining: 33.2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7775186242
bestIteration = 65

Shrink model to first 66 iterations.
Recall: 0.610062893081761
Precision: 0.06229929351316635
ROC-AUC: 0.7775186242428461
Confusion Matrix:
 [[6128 1460]
 [  62   97]]


## Модель RandomForest

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_jobs=-1, verbose=1, class_weight='balanced')
rf_clf.fit(X_train, y_train)

rf_y_pred = rf_clf.predict(X_test)
rf_y_pred_proba = rf_clf.predict_proba(X_test)[:, 1]
recall_rf = recall_score(y_test, rf_y_pred)
precision_rf = precision_score(y_test, rf_y_pred)
roc_auc_rf = roc_auc_score(y_test, rf_y_pred_proba)
conf_matrix_rf = confusion_matrix(y_test,rf_y_pred)

print("Recall:", recall_rf)
print("Precision:", precision_rf)
print("Roc-AUC:", roc_auc_rf)
print("Confusion Matrix:\n", conf_matrix_rf)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.0s


Recall: 0.0
Precision: 0.0
Roc-AUC: 0.7024211515700063
Confusion Matrix:
 [[7588    0]
 [ 159    0]]


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    6.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Стак модель

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier  
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score, confusion_matrix

class EstimatorWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimator, name, **kwargs):
        self.name = name
        self.base_estimator = base_estimator
        self.base_estimator.set_params(**kwargs)
    
    def fit(self, X, y, **fit_params):
        print(f"Начинается обучение модели {self.name}")
        self.base_estimator.fit(X, y, **fit_params)
        self.classes_ = self.base_estimator.classes_  
        print(f"Обучение модели {self.name} завершено")
        return self
    
    def predict(self, X):
        return self.base_estimator.predict(X)
    
    def predict_proba(self, X):
        return self.base_estimator.predict_proba(X)
    
    def get_params(self, deep=True):
        params = {'name': self.name, 'base_estimator': self.base_estimator}
        if deep:
            base_params = self.base_estimator.get_params(deep=deep)
            params.update(base_params)
        return params
    
    def set_params(self, **params):
        if 'name' in params:
            self.name = params.pop('name')
        if 'base_estimator' in params:
            self.base_estimator = params.pop('base_estimator')
        if params:
            self.base_estimator.set_params(**params)
        return self

lgb_clf_base = lgb.LGBMClassifier()
cat_clf_base = CatBoostClassifier(
    class_weights=class_weights,
    random_seed=42,
    verbose=0
)
rf_clf_base = RandomForestClassifier()

lgb_clf_wrapped = EstimatorWrapper(lgb_clf_base, name='LightGBM', n_estimators=100, verbosity=2, n_jobs=-1)
cat_clf_wrapped = EstimatorWrapper(cat_clf_base, name='CatBoost',
                                   iterations=1000,
                                   learning_rate=0.1,
                                   depth=6,
                                   eval_metric='AUC',
                                   verbose=2)
rf_clf_wrapped = EstimatorWrapper(rf_clf_base, name='RandomForest', verbose=2, n_jobs=-1)

estimators = [
    ('lgb', lgb_clf_wrapped),
    ('cat', cat_clf_wrapped),
    ('rf', rf_clf_wrapped)
]

meta_clf_base = XGBClassifier(
    use_label_encoder=False,  
    eval_metric='logloss',   
    random_state=42,
    n_estimators=100,
    verbosity=2
)
meta_clf_wrapped = EstimatorWrapper(meta_clf_base, name='XGBoost')

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_clf_wrapped,
    cv=5,
    verbose=2
)

stacking_clf.fit(X_train, y_train)

y_pred = stacking_clf.predict(X_test)
y_pred_proba = stacking_clf.predict_proba(X_test)[:, 1]

recall_stack = recall_score(y_test, y_pred)
precision_stack = precision_score(y_test, y_pred)
roc_auc_stack = roc_auc_score(y_test, y_pred_proba)
conf_matrix_stack = confusion_matrix(y_test, y_pred)

print("Recall:", recall_stack)
print("Precision:", precision_stack)
print("ROC-AUC:", roc_auc_stack)
print("Confusion Matrix:\n", conf_matrix_stack)

Начинается обучение модели LightGBM
[LightGBM] [Info] Number of positive: 684, number of negative: 30300
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.826924
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.057600
[LightGBM] [Debug] init for col-wise cost 0.001200 seconds, init for row-wise cost 0.017020 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97565
[LightGBM] [Info] Number of data points in the train set: 30984, number of used features: 419
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.022076 -> initscore=-3.790945
[LightGBM] [Info] Start training from score -3.790945
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Deb

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.4s


building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71

KeyboardInterrupt: 