In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, precision_score, confusion_matrix, fbeta_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#!pip install optuna
import optuna
!pip install catboost
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from tqdm import tqdm 
from sklearn.linear_model import SGDClassifier 
#!pip install dill
import dill

import warnings
warnings.filterwarnings("ignore") 


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Single place to repoint the notebook at a different data location.
DATA_DIR = '/content/drive/MyDrive/Colab_Notebooks'

# Two train/test pairs; the cells below read a 'target' column from each frame.
df_train_1 = pd.read_csv(f'{DATA_DIR}/train_1.csv')
df_test_1 = pd.read_csv(f'{DATA_DIR}/test_1.csv')
df_train_2 = pd.read_csv(f'{DATA_DIR}/train_2.csv')
df_test_2 = pd.read_csv(f'{DATA_DIR}/test_2.csv')

# RUSBoostClassifier

Подбор гиперпараметров для RUSBoost

In [None]:
# Search space for the RUSBoost grid search: ensemble size crossed with
# the resampling strategy.
param_grid = dict(
    n_estimators=[10, 50, 100],
    sampling_strategy=['majority', 'not minority', 'not majority', 'all'],
)

#Определяем random_state

In [None]:
# Seed handed to the models below as `random_state`.
rnd = 42

# Labels and features of the first training set: 'target' is the label,
# every other column is a feature.
y = df_train_1['target'].to_numpy()
X = df_train_1.drop(columns=['target'])


In [None]:
rnd = 42

def RUSBoost_CL(train, test):
  """Fit a RUSBoost model on `train` and print hold-out metrics for `test`.

  Both frames must contain a 'target' column; all remaining columns are
  used as features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy,
  recall and precision, each rounded to 3 decimals. Returns None.
  """
  # NOTE(review): RUSBoostClassifier is not imported in the visible cells --
  # presumably `from imblearn.ensemble import RUSBoostClassifier`; confirm.
  # NOTE(review): newer imbalanced-learn releases may expect `estimator`
  # instead of `base_estimator` -- check against the installed version.
  booster = RUSBoostClassifier(
      base_estimator=AdaBoostClassifier(n_estimators=10),
      n_estimators=100,
      sampling_strategy='majority',
      random_state=rnd,
  )
  booster.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  features_test = test.drop(['target'], axis=1)
  labels_test = test['target'].to_numpy()

  # Gini is computed from the probability of class 1 (second column).
  positive_proba = booster.predict_proba(features_test)[:, 1]
  print('gini: ', round(2 * roc_auc_score(labels_test, positive_proba) - 1, 3))

  labels_pred = booster.predict(features_test)
  label_metrics = (
      ('f1_score: ', f1_score),
      ('f2_score: ', lambda truth, guess: fbeta_score(truth, guess, beta=2)),
      ('accuracy_score: ', accuracy_score),
      ('recall_score: ', recall_score),
  )
  for caption, metric in label_metrics:
    print(caption, round(metric(labels_test, labels_pred), 3))
  # The trailing '\n\n' visually separates the reports of consecutive calls.
  print('precision_score: ', round(precision_score(labels_test, labels_pred), 3), '\n\n')

In [None]:
# Report RUSBoost metrics for each train/test pair.
RUSBoost_CL(train=df_train_1, test=df_test_1)
RUSBoost_CL(train=df_train_2, test=df_test_2)



gini:  0.723
f1_score:  0.36
f2_score:  0.5
accuracy_score:  0.845
recall_score:  0.675
precision_score:  0.245 






gini:  0.733
f1_score:  0.323
f2_score:  0.5
accuracy_score:  0.787
recall_score:  0.787
precision_score:  0.203 




In [None]:
# Features / labels of the first training set.
X, y = df_train_1.drop(columns=['target']), df_train_1['target'].to_numpy()

# NOTE(review): RUSBoostClassifier is not imported in the visible cells --
# presumably `from imblearn.ensemble import RUSBoostClassifier`; confirm.
rus_boost = RUSBoostClassifier(random_state=42)

# 5-fold exhaustive search over `param_grid`; no `scoring` is given, so the
# estimator's default score decides the winner.
grid_search = GridSearchCV(estimator=rus_boost, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

# Print the best hyperparameter combination and its cross-validated score.
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'n_estimators': 100, 'sampling_strategy': 'majority'}
Best score:  0.8106807515344476


Гиперпараметры для AdaBoost

In [None]:
# Re-derive features / labels from the first training set for the searches below.
y = df_train_1['target'].to_numpy()
X = df_train_1.drop(columns=['target'])

In [None]:
# Eight candidate learning rates in steps of 0.01 starting at 0.97.
parameters = {'learning_rate': [0.97 + step / 100 for step in range(8)]}

ab_clf = AdaBoostClassifier(random_state=rnd)

# 5-fold grid search using the estimator's default score.
clf = GridSearchCV(ab_clf, parameters, cv=5)
clf.fit(X, y)
print("Best parameters: ", clf.best_params_)

Best parameters:  {'learning_rate': 1.02}


In [None]:
def AdaBoostCl(train, test):
  """Fit AdaBoost on `train` and print hold-out metrics for `test`.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals. Returns None.
  """
  def emit(caption, value, *tail):
    # One report line: label, value rounded to 3 decimals, optional suffix.
    print(caption, round(value, 3), *tail)

  booster = AdaBoostClassifier(n_estimators=100, learning_rate=1.02, random_state=rnd)
  booster.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  holdout_X = test.drop(['target'], axis=1)
  holdout_y = test['target'].to_numpy()

  # Probability of class 1 drives the ranking metric.
  scores = booster.predict_proba(holdout_X)[:, 1]
  emit('gini: ', 2 * roc_auc_score(holdout_y, scores) - 1)

  guesses = booster.predict(holdout_X)
  emit('f1_score: ', f1_score(holdout_y, guesses))
  emit('f2_score: ', fbeta_score(holdout_y, guesses, beta=2))
  emit('accuracy_score: ', accuracy_score(holdout_y, guesses))
  emit('recall_score: ', recall_score(holdout_y, guesses))
  # Trailing blank lines separate consecutive reports.
  emit('precision_score: ', precision_score(holdout_y, guesses), '\n\n')

In [None]:
# Report AdaBoost metrics for each train/test pair.
AdaBoostCl(train=df_train_1, test=df_test_1)
AdaBoostCl(train=df_train_2, test=df_test_2)

gini:  0.716
f1_score:  0.356
f2_score:  0.498
accuracy_score:  0.841
recall_score:  0.68
precision_score:  0.241
gini:  0.725
f1_score:  0.32
f2_score:  0.494
accuracy_score:  0.788
recall_score:  0.772
precision_score:  0.202


In [None]:
def KNN(train, test):
  """Fit a 100-neighbour KNN classifier on `train` and print metrics on `test`.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals. Returns None.
  """
  def emit(caption, value, *tail):
    # One report line: label, value rounded to 3 decimals, optional suffix.
    print(caption, round(value, 3), *tail)

  neighbours = KNeighborsClassifier(n_neighbors=100)
  neighbours.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  holdout_X, holdout_y = test.drop(['target'], axis=1), test['target'].to_numpy()

  # Share of class-1 neighbours acts as the ranking score.
  scores = neighbours.predict_proba(holdout_X)[:, 1]
  emit('gini: ', 2 * roc_auc_score(holdout_y, scores) - 1)

  guesses = neighbours.predict(holdout_X)
  emit('f1_score: ', f1_score(holdout_y, guesses))
  emit('f2_score: ', fbeta_score(holdout_y, guesses, beta=2))
  emit('accuracy_score: ', accuracy_score(holdout_y, guesses))
  emit('recall_score: ', recall_score(holdout_y, guesses))
  # Trailing blank lines separate consecutive reports.
  emit('precision_score: ', precision_score(holdout_y, guesses), '\n\n')

In [None]:
# Report KNN metrics for each train/test pair.
KNN(train=df_train_1, test=df_test_1)
KNN(train=df_train_2, test=df_test_2)

gini:  0.625
f1_score:  0.3
f2_score:  0.434
accuracy_score:  0.813
recall_score:  0.618
precision_score:  0.198
gini:  0.65
f1_score:  0.28
f2_score:  0.444
accuracy_score:  0.756
recall_score:  0.731
precision_score:  0.173


#CatBoostClassifier

Подбор гиперпараметров

In [None]:
# Grid for CatBoost: three learning rates crossed with two tree depths.
# NOTE: this rebinds the global `param_grid` that the RUSBoost search cell
# also reads, so re-running that cell afterwards would pick up this grid.
param_grid = dict(learning_rate=[0.01, 0.1, 1], depth=[3, 7])

catboost = CatBoostClassifier()

# 5-fold search on the notebook-level X / y with the estimator's default score.
grid_search = GridSearchCV(catboost, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")

In [None]:
def CatBoostCl(train, test):
  """Fit CatBoost (100 iterations, depth 7, learning rate 1) and print test metrics.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals. Returns None.
  """
  booster = CatBoostClassifier(learning_rate=1, depth=7, iterations=100, random_state=rnd)
  booster.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  X_eval, y_eval = test.drop(['target'], axis=1), test['target'].to_numpy()

  # Ranking quality from the class-1 probability.
  auc = roc_auc_score(y_eval, booster.predict_proba(X_eval)[:, 1])
  print('gini: ', round(2 * auc - 1, 3))

  y_hat = booster.predict(X_eval)
  label_metrics = (
      ('f1_score: ', f1_score),
      ('f2_score: ', lambda truth, guess: fbeta_score(truth, guess, beta=2)),
      ('accuracy_score: ', accuracy_score),
      ('recall_score: ', recall_score),
      ('precision_score: ', precision_score),
  )
  for caption, metric in label_metrics:
    print(caption, round(metric(y_eval, y_hat), 3))

In [None]:
# Report CatBoost metrics for the first train/test pair.
CatBoostCl(train=df_train_1, test=df_test_1)

#XGB Classifier

Подборка гиперпараметров

In [None]:
# 2 x 2 grid for XGBoost: learning rate by maximum tree depth.
params = dict(learning_rate=[0.05, 0.15], max_depth=[3, 5])

xgb_model = XGBClassifier(random_state=rnd)

# 5-fold search across all cores; verbose=3 logs per-fold progress.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'learning_rate': 0.15, 'max_depth': 5}


In [None]:
def XgbCl(train, test):
  """Fit XGBoost (100 trees, depth 5, learning rate 0.15) and print test metrics.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals. Returns None.
  """
  def emit(caption, value):
    # One report line: label followed by the value rounded to 3 decimals.
    print(caption, round(value, 3))

  booster = XGBClassifier(max_depth=5, learning_rate=0.15, n_estimators=100, random_state=rnd)
  booster.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  holdout_X = test.drop(['target'], axis=1)
  holdout_y = test['target'].to_numpy()

  # Class-1 probability feeds the ranking metric.
  scores = booster.predict_proba(holdout_X)[:, 1]
  emit('gini: ', 2 * roc_auc_score(holdout_y, scores) - 1)

  guesses = booster.predict(holdout_X)
  emit('f1_score: ', f1_score(holdout_y, guesses))
  emit('f2_score: ', fbeta_score(holdout_y, guesses, beta=2))
  emit('accuracy_score: ', accuracy_score(holdout_y, guesses))
  emit('recall_score: ', recall_score(holdout_y, guesses))
  emit('precision_score: ', precision_score(holdout_y, guesses))

In [None]:
# Report XGBoost metrics for the first train/test pair.
XgbCl(train=df_train_1, test=df_test_1)

gini:  0.741
f1_score:  0.366
f2_score:  0.512
accuracy_score:  0.844
recall_score:  0.698
precision_score:  0.248


#LGBM Classifier

Подборка гиперпараметров

In [None]:
def objective(trial):
    """Optuna objective: validation score of an LGBMClassifier for one trial.

    Splits the notebook-level `X` / `y` 80/20 with the fixed seed `rnd`,
    samples six LightGBM hyperparameters from `trial`, fits on the training
    part and returns `LGBMClassifier.score` on the validation part.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rnd)
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 50),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and matches the suggest_float calls used for the other float parameters.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
    }

    # NOTE(review): LightGBM documents that row subsampling is only applied
    # when `subsample_freq` is non-zero; it is left at its default here, so
    # `subsample` may have no effect -- confirm.
    # Same seed as the final LGBM model so tuning and the final fit agree.
    lgbm = LGBMClassifier(random_state=rnd, **params)

    # Train on the 80 % part, score on the held-out 20 %.
    lgbm.fit(X_train, y_train)
    return lgbm.score(X_val, y_val)

# Create Optuna study and optimize hyperparameters
# Create the Optuna study and run 100 trials. The sampler is seeded with
# `rnd` so the sequence of suggested hyperparameters no longer starts from a
# fresh random state on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=rnd),
)
study.optimize(objective, n_trials=100)

# Print the best score and the hyperparameters that produced it.
print('Best trial:')
trial = study.best_trial
print(f'  Score: {trial.value:.3f}')
print('  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

In [None]:
def LGBMCl(train, test):
  """Fit LightGBM with fixed, previously tuned hyperparameters and print test metrics.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals. Returns None.
  """
  booster = LGBMClassifier(
      random_state=rnd,
      learning_rate=0.099,
      num_leaves=45,
      max_depth=8,
      min_child_samples=55,
      subsample=0.23,
      colsample_bytree=0.325,
  )
  booster.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  X_eval, y_eval = test.drop(['target'], axis=1), test['target'].to_numpy()

  # Ranking quality from the class-1 probability.
  auc = roc_auc_score(y_eval, booster.predict_proba(X_eval)[:, 1])
  print('gini: ', round(2 * auc - 1, 3))

  y_hat = booster.predict(X_eval)
  label_metrics = (
      ('f1_score: ', f1_score),
      ('f2_score: ', lambda truth, guess: fbeta_score(truth, guess, beta=2)),
      ('accuracy_score: ', accuracy_score),
      ('recall_score: ', recall_score),
      ('precision_score: ', precision_score),
  )
  for caption, metric in label_metrics:
    print(caption, round(metric(y_eval, y_hat), 3))

In [None]:
 LGBMCl(train=df_train_1, test=df_test_1)

Tune LogReg

In [None]:
def objective(trial):
    """Optuna objective: validation accuracy of a LogisticRegression for one trial.

    Splits the notebook-level `X` / `y` 80/20 with the fixed seed `rnd`,
    samples the penalty type and inverse regularisation strength `C`
    (solver fixed to liblinear), fits on the training part and returns
    `LogisticRegression.score` on the validation part.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rnd)
    params = {
        'solver': trial.suggest_categorical('solver', ['liblinear']),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
    }

    # Same seed as the final LogReg model so tuning and the final fit agree.
    logreg = LogisticRegression(random_state=rnd, **params)

    logreg.fit(X_train, y_train)
    return logreg.score(X_val, y_val)

# Run 100 trials; the sampler is seeded with `rnd` so the sequence of
# suggested hyperparameters no longer starts from a fresh random state.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=rnd),
)
study.optimize(objective, n_trials=100)

# Print the best score and the hyperparameters that produced it.
print('Best trial:')
trial = study.best_trial
print(f'  Score: {trial.value:.3f}')
print('  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

In [None]:
def LogReg(train, test):
  """Fit an L2 liblinear logistic regression (fixed C) and print test metrics.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals. Returns None.
  """
  def emit(caption, value):
    # One report line: label followed by the value rounded to 3 decimals.
    print(caption, round(value, 3))

  classifier = LogisticRegression(
      C=6484.008244618032,
      penalty='l2',
      solver='liblinear',
      random_state=rnd,
  )
  classifier.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  holdout_X = test.drop(['target'], axis=1)
  holdout_y = test['target'].to_numpy()

  # Class-1 probability feeds the ranking metric.
  scores = classifier.predict_proba(holdout_X)[:, 1]
  emit('gini: ', 2 * roc_auc_score(holdout_y, scores) - 1)

  guesses = classifier.predict(holdout_X)
  emit('f1_score: ', f1_score(holdout_y, guesses))
  emit('f2_score: ', fbeta_score(holdout_y, guesses, beta=2))
  emit('accuracy_score: ', accuracy_score(holdout_y, guesses))
  emit('recall_score: ', recall_score(holdout_y, guesses))
  emit('precision_score: ', precision_score(holdout_y, guesses))

In [None]:
# Report logistic-regression metrics for the first train/test pair.
LogReg(train=df_train_1, test=df_test_1)

gini:  0.637
f1_score:  0.312
f2_score:  0.446
accuracy_score:  0.822
recall_score:  0.624
precision_score:  0.208


Tune SGD

In [None]:
def objective(trial):
    """Optuna objective: validation accuracy of a log-loss SGDClassifier for one trial.

    Splits the notebook-level `X` / `y` 80/20 with the fixed seed `rnd`,
    samples `alpha` (log scale) and `l1_ratio`, fits on the training part
    and returns `SGDClassifier.score` on the validation part.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rnd)

    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
    alpha = trial.suggest_float('alpha', 1e-5, 1e0, log=True)
    l1_ratio = trial.suggest_float('l1_ratio', 0, 1)

    # NOTE(review): scikit-learn documents `l1_ratio` as used only with
    # penalty='elasticnet'; the penalty is left at its default here, so this
    # dimension may have no effect -- confirm.
    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # 'log' is kept for compatibility with the version the notebook ran on.
    # random_state fixes the SGD shuffling so equal parameters score equally.
    clf = SGDClassifier(loss='log', alpha=alpha, l1_ratio=l1_ratio, random_state=rnd)

    clf.fit(X_train, y_train)
    return clf.score(X_val, y_val)


# Run 100 trials; the sampler is seeded with `rnd` so the sequence of
# suggested hyperparameters no longer starts from a fresh random state.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=rnd),
)
study.optimize(objective, n_trials=100)

# Print the best score and the hyperparameters that produced it.
print('Best trial:')
trial = study.best_trial
print(f'  Score: {trial.value:.3f}')
print('  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

In [None]:
# Variant 1
def Voting(train, test):
  """Fit a soft-voting ensemble (LogReg + RandomForest + LGBM), report and save it.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals, then serialises the fitted ensemble
  with dill to a fixed path on Google Drive. Returns None.
  """
  members = [
      ('lr', LogisticRegression(random_state=rnd, C=6484.008244618032, penalty='l2', solver='liblinear')),
      ('rf', RandomForestClassifier(random_state=rnd, n_estimators=507, max_depth=43, n_jobs=-1)),
      ('lgbm', LGBMClassifier(
          random_state=rnd,
          learning_rate=0.099,
          num_leaves=45,
          max_depth=8,
          min_child_samples=55,
          subsample=0.23,
          colsample_bytree=0.325,
      )),
  ]
  # voting='soft' combines the members' predicted probabilities.
  model = VotingClassifier(estimators=members, voting='soft')
  model.fit(train.drop(['target'], axis=1), train['target'].to_numpy())

  X_eval, y_eval = test.drop(['target'], axis=1), test['target'].to_numpy()

  auc = roc_auc_score(y_eval, model.predict_proba(X_eval)[:, 1])
  print('gini: ', round(2 * auc - 1, 3))

  y_hat = model.predict(X_eval)
  label_metrics = (
      ('f1_score: ', f1_score),
      ('f2_score: ', lambda truth, guess: fbeta_score(truth, guess, beta=2)),
      ('accuracy_score: ', accuracy_score),
      ('recall_score: ', recall_score),
      ('precision_score: ', precision_score),
  )
  for caption, metric in label_metrics:
    print(caption, round(metric(y_eval, y_hat), 3))

  # Persist the fitted ensemble for later reuse.
  with open('/content/drive/MyDrive/Colab_Notebooks/Voting_LGBM_RF_LR.dat', 'wb') as sink:
    dill.dump(model, sink)


In [None]:
# Train, evaluate and save the first voting ensemble.
Voting(train=df_train_1, test=df_test_1)

gini:  0.728
f1_score:  0.404
f2_score:  0.453
accuracy_score:  0.906
recall_score:  0.494
precision_score:  0.342


#VotingClassifier

In [None]:
# Variant 2
def Voting_2(train, test):
  """Fit a soft-voting ensemble (SGD + RandomForest + LGBM), report and save it.

  Both frames must contain a 'target' column; the other columns are the
  features. Prints Gini (2 * ROC AUC - 1), F1, F2, accuracy, recall and
  precision rounded to 3 decimals, then serialises the fitted ensemble
  with dill to a fixed path on Google Drive. Returns None.
  """
  X = train.drop(['target'],axis=1)
  y = train['target'].to_numpy()

  # random_state added so the SGD member is seeded like the other two
  # members; without it the ensemble and the saved artifact change per run.
  # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
  # 'log' is kept for compatibility with the version the notebook ran on.
  clf1 = SGDClassifier(loss='log', alpha=1.6732202108806564e-05, l1_ratio=0.5386785802038093, random_state=rnd)
  clf2 = RandomForestClassifier(random_state = rnd, n_estimators=507, max_depth=43, n_jobs=-1)
  clf3 = LGBMClassifier(random_state = rnd, learning_rate=0.099, num_leaves = 45, max_depth=8, min_child_samples = 55, subsample = 0.23, colsample_bytree = 0.325)

  # The SGD member keeps the historical label 'lr' so the structure of the
  # saved ensemble stays unchanged. voting='soft' combines the members'
  # predicted probabilities.
  model = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('lgbm', clf3)], voting='soft')
  model.fit(X, y)

  X_t = test.drop(['target'],axis=1)
  y_t = test['target'].to_numpy()

  # Ranking quality from the class-1 probability.
  pred_proba = model.predict_proba(X_t)[:,1]
  print('gini: ', round(2*roc_auc_score(y_t, pred_proba)-1,3))
  pred = model.predict(X_t)
  print('f1_score: ', round(f1_score(y_t, pred),3))
  print('f2_score: ',  round(fbeta_score(y_t, pred,beta =2), 3))
  print('accuracy_score: ', round(accuracy_score(y_t, pred),3))
  print('recall_score: ', round(recall_score(y_t, pred),3))
  print('precision_score: ', round(precision_score(y_t, pred),3))

  # Persist the fitted ensemble for later reuse.
  with open('/content/drive/MyDrive/Colab_Notebooks/Voting_LGBM_RF_SGD.dat', 'wb') as f:
    dill.dump(model,f)


In [None]:
# Train, evaluate and save the second voting ensemble.
Voting_2(train=df_train_1, test=df_test_1)

gini:  0.725
f1_score:  0.401
f2_score:  0.459
accuracy_score:  0.902
recall_score:  0.508
precision_score:  0.331
