In [9]:
import pandas as pd, numpy as np
import dill
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score, precision_score, fbeta_score, confusion_matrix
from tqdm import tqdm
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier


import warnings
warnings.filterwarnings("ignore") 

In [5]:
rnd = 42
train = pd.read_csv('df_train_ADASYN_MinMax.csv')
test =  pd.read_csv('df_test_ADASYN_MinMax.csv')

In [6]:
X = train.drop('target', axis = 1)
y = train['target']
X_test = test.drop('target', axis = 1)
y_test = test['target']

# RandomForestClassifier

In [7]:
params = {
    'n_estimators': [50, 80, 100, 150, 200,300,  400],
    'max_depth': [7, 12, 20,30],
}


rf_model =RandomForestClassifier(random_state=rnd)


grid_search = GridSearchCV(rf_model, param_grid=params, cv=3, n_jobs=-1, verbose=3)
grid_search.fit(X, y)


print("Best parameters:", grid_search.best_params_)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV 3/3] END .....max_depth=7, n_estimators=150;, score=0.883 total time= 4.3min
[CV 3/3] END .....max_depth=20, n_estimators=80;, score=0.962 total time= 5.8min
[CV 2/3] END .....max_depth=30, n_estimators=80;, score=0.966 total time= 6.4min
[CV 3/3] END ......max_depth=7, n_estimators=80;, score=0.883 total time= 2.1min
[CV 1/3] END ....max_depth=12, n_estimators=300;, score=0.885 total time=15.0min
[CV 1/3] END .....max_depth=12, n_estimators=50;, score=0.882 total time= 2.1min
[CV 3/3] END ....max_depth=12, n_estimators=300;, score=0.932 total time=15.1min
[CV 3/3] END .....max_depth=12, n_estimators=50;, score=0.930 total time= 2.1min
[CV 2/3] END ....max_depth=12, n_estimators=300;, score=0.925 total time=15.3min
[CV 2/3] END ......max_depth=7, n_estimators=50;, score=0.872 total time= 1.4min
[CV 1/3] END ....max_depth=12, n_estimators=200;, score=0.884 total time= 9.9min
[CV 3/3] END .....max_depth=30, n_estimators=80;

In [10]:
def RF(train, test):
    X = train.drop(['target'],axis=1)
    y = train['target'].to_numpy()
    model = RandomForestClassifier(random_state = rnd, n_estimators=400, max_depth=30, n_jobs=-1)
    model.fit(X, y)

    X_t = test.drop(['client_id', 'dt','target'],axis=1)
    y_t = test['target'].to_numpy()

    pred_proba = model.predict_proba(X_t)[:,1]
    print('gini: ', round(2*roc_auc_score(y_t, pred_proba)-1,3))
    pred = model.predict(X_t)
    print('f1_score: ', round(f1_score(y_t, pred),3))
    print('f2_score: ',  round(fbeta_score(y_t, pred,beta =2), 3))
    print('accuracy_score: ', round(accuracy_score(y_t, pred),3))
    print('recall_score: ', round(recall_score(y_t, pred),3))
    print('precision_score: ', round(precision_score(y_t, pred),3))


In [11]:
RF(train, test)

gini:  0.356
f1_score:  0.149
f2_score:  0.299
accuracy_score:  0.325
recall_score:  0.913
precision_score:  0.081


Альтернативный способ подбора гиперпараметров с помощью optune

In [23]:
def objective(trial):
    X = train.drop(['target'],axis=1)
    y = train['target'].to_numpy()
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rnd)
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 50),
        'n_estimators': trial.suggest_int('n_estimators', 50, 800),
    }

    # Create a Decision Tree Classifier with the hyperparameters
    dt = RandomForestClassifier(**params, n_jobs=-1, random_state = rnd)

    # Train and evaluate the model
    dt.fit(X_train, y_train)
    score = dt.score(X_val, y_val)

    return score

# Create Optuna study and optimize hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best hyperparameters and score
print('Best trial:')
trial = study.best_trial
print(f'  Score: {trial.value:.3f}')
print('  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[32m[I 2023-04-01 16:01:37,943][0m A new study created in memory with name: no-name-59da9bef-ff92-496c-a967-9c4ac9b3864d[0m
[32m[I 2023-04-01 16:02:31,238][0m Trial 0 finished with value: 0.9661249758173728 and parameters: {'max_depth': 48, 'n_estimators': 387}. Best is trial 0 with value: 0.9661249758173728.[0m
[32m[I 2023-04-01 16:03:01,139][0m Trial 1 finished with value: 0.9552137744244534 and parameters: {'max_depth': 19, 'n_estimators': 260}. Best is trial 0 with value: 0.9661249758173728.[0m
[32m[I 2023-04-01 16:03:56,952][0m Trial 2 finished with value: 0.9661249758173728 and parameters: {'max_depth': 48, 'n_estimators': 416}. Best is trial 0 with value: 0.9661249758173728.[0m
[32m[I 2023-04-01 16:04:08,317][0m Trial 3 finished with value: 0.9218224027858386 and parameters: {'max_depth': 12, 'n_estimators': 105}. Best is trial 0 with value: 0.9661249758173728.[0m
[32m[I 2023-04-01 16:05:07,862][0m Trial 4 finished with value: 0.9656800154768814 and parameters: 

KeyboardInterrupt: 

In [12]:
def RF(train, test):
    X = train.drop(['target'],axis=1)
    y = train['target'].to_numpy()
    model = RandomForestClassifier(random_state = rnd, n_estimators=507, max_depth=43, n_jobs=-1)
    model.fit(X, y)

    X_t = test.drop(['client_id', 'dt','target'],axis=1)
    y_t = test['target'].to_numpy()

    pred_proba = model.predict_proba(X_t)[:,1]
    print('gini: ', round(2*roc_auc_score(y_t, pred_proba)-1,3))
    pred = model.predict(X_t)
    print('f1_score: ', round(f1_score(y_t, pred),3))
    print('f2_score: ',  round(fbeta_score(y_t, pred,beta =2), 3))
    print('accuracy_score: ', round(accuracy_score(y_t, pred),3))
    print('recall_score: ', round(recall_score(y_t, pred),3))
    print('precision_score: ', round(precision_score(y_t, pred),3))


In [13]:
RF(train, test)

gini:  0.341
f1_score:  0.149
f2_score:  0.299
accuracy_score:  0.328
recall_score:  0.91
precision_score:  0.081


# DecisionTreeClassifier

Определение гиперпараметров для DecisionTreeCL
* что засовывать в параметры подсказывал мне GPT ^_^

In [None]:
def objective(trial):
    X = train.drop(['target'],axis=1)
    y = train['target'].to_numpy()
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rnd)
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }

    # Create a Decision Tree Classifier with the hyperparameters
    dt = DecisionTreeClassifier(**params, random_state = rnd)

    # Train and evaluate the model
    dt.fit(X_train, y_train)
    score = dt.score(X_val, y_val)

    return score

# Create Optuna study and optimize hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Print the best hyperparameters and score
print('Best trial:')
trial = study.best_trial
print(f'  Score: {trial.value:.3f}')
print('  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

In [26]:
def DTCL(train, test):
    X = train.drop(['target'],axis=1)
    y = train['target'].to_numpy()
    model = DecisionTreeClassifier(random_state = rnd, max_depth=10, min_samples_split= 7, min_samples_leaf = 1, criterion= 'gini')
    model.fit(X, y)

    X_t = test.drop(['target'],axis=1)
    y_t = test['target'].to_numpy()

    pred_proba = model.predict_proba(X_t)[:,1]
    print('gini: ', round(2*roc_auc_score(y_t, pred_proba)-1,3))
    pred = model.predict(X_t)
    print('f1_score: ', round(f1_score(y_t, pred),3))
    print('f2_score: ',  round(fbeta_score(y_t, pred,beta =2), 3))
    print('accuracy_score: ', round(accuracy_score(y_t, pred),3))
    print('recall_score: ', round(recall_score(y_t, pred),3))
    print('precision_score: ', round(precision_score(y_t, pred),3))


In [None]:
DTCL(train, test)

gini:  0.614
f1_score:  0.325
f2_score:  0.466
accuracy_score:  0.825
recall_score:  0.654
precision_score:  0.216


# ExtraTreesClassifier

In [29]:
def objective(trial):
    X = train.drop(['target'],axis=1)
    y = train['target'].to_numpy() 
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rnd)  
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    # Create an ExtraTrees Classifier with the hyperparameters
    et = ExtraTreesClassifier(**params, random_state = rnd, n_jobs=15)

    # Train and evaluate the model
    et.fit(X_train, y_train)
    score = et.score(X_val, y_val)

    return score

# Create Optuna study and optimize hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, n_jobs=15)

# Print the best hyperparameters and score
print('Best trial:')
trial = study.best_trial
print(f'  Score: {trial.value:.3f}')
print('  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[32m[I 2023-04-01 16:36:02,729][0m A new study created in memory with name: no-name-2314fe2c-43ff-4b73-bfc1-a70003187e73[0m
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
  'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
[32m[I 2023-04-01 16:37:32,735][0m Trial 13 finished with value: 0.8592377635906365 and parameters: {'n_estimators': 10, 'max_depth': 

In [16]:
def ETCL(train, test):
    X = train.drop(['target'],axis=1)
    y = train['target'].to_numpy()
    model = ExtraTreesClassifier(random_state = rnd, n_jobs=-1, n_estimators =108, max_depth=10, min_samples_split= 2, min_samples_leaf = 7, bootstrap = False, max_features = 0.8484163210879152 )
    model.fit(X, y)

    X_t = test.drop(['client_id', 'dt','target'],axis=1)
    y_t = test['target'].to_numpy()

    pred_proba = model.predict_proba(X_t)[:,1]
    print('gini: ', round(2*roc_auc_score(y_t, pred_proba)-1,3))
    pred = model.predict(X_t)
    print('f1_score: ', round(f1_score(y_t, pred),3))
    print('f2_score: ',  round(fbeta_score(y_t, pred,beta =2), 3))
    print('accuracy_score: ', round(accuracy_score(y_t, pred),3))
    print('recall_score: ', round(recall_score(y_t, pred),3))
    print('precision_score: ', round(precision_score(y_t, pred),3))

In [17]:
ETCL(train, test)

gini:  0.297
f1_score:  0.121
f2_score:  0.256
accuracy_score:  0.069
recall_score:  0.99
precision_score:  0.064
