In [28]:
import xgboost as xgb

from sklearn.metrics import roc_auc_score

from hpsklearn import HyperoptEstimator, svc, xgboost_classification, any_classifier, any_preprocessing

In [29]:

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


In [30]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error
from sklearn.model_selection import train_test_split 
from sklearn.neural_network import MLPClassifier


In [31]:
merged_df = pd.read_csv('processed_data/prepared_featured_balanced_sample_5.csv').drop('Unnamed: 0',axis=1)

In [32]:
x = merged_df.drop(['event_target'],axis=1)
y = merged_df['event_target']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [55]:
def fit_and_result(clf):
    clf.fit(X_train, y_train)

    clf_train_pred = clf.predict(X_train)
    clf_test_pred = clf.predict(X_test)

    print('Точность на тренировочной выборке')
    print(accuracy_score(y_train, clf_train_pred))
    print('Точность на тестовой выборке')
    print(accuracy_score(y_test, clf_test_pred))
    
    
    score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    print(f"\n\nROC AUC TEST: {score:.4f}")

In [33]:
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter("ignore")

In [57]:
from hyperopt import hp, STATUS_OK, Trials, fmin, tpe
from sklearn.model_selection import cross_val_score


def objective(space):

    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    classifier = xgb.XGBClassifier(n_estimators = space['n_estimators'],
                            max_depth = int(space['max_depth']),
                            learning_rate = space['learning_rate'],
                            gamma = space['gamma'],
                            min_child_weight = space['min_child_weight'],
                            subsample = space['subsample'],
                            colsample_bytree = space['colsample_bytree'],
                            booster = 'gbtree'
                            )
    
    classifier.fit(X_train, y_train)

    # Applying k-Fold Cross Validation
    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator = classifier, scoring='roc_auc', X = X_test, y = y_test, cv = 10)
    CrossValMean = accuracies.mean()
    
    print('---------')
    print("CrossValMean AUC SCORE:", CrossValMean)


    return{'loss':1-CrossValMean, 'status': STATUS_OK }

space = {
     'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    
    'max_depth' : hp.choice('max_depth', range(5, 30, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators' : hp.choice('n_estimators', range(20, 800, 50)),
    'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'eta' : hp.quniform('eta', 0.1, 1 , 0.1),
    'tree_method' : hp.choice('tree_method', ('auto', 'exact', 'approx', 'hist'))
}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=200,
            trials=trials)

print("Best: ", best)


# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators = best['n_estimators'],
                            max_depth = best['max_depth'],
                            learning_rate = best['learning_rate'],
                            gamma = best['gamma'],
                            min_child_weight = best['min_child_weight'],
                            subsample = best['subsample'],
                            colsample_bytree = best['colsample_bytree'],
                            eta = best['eta'],
                            tree_method = best['tree_method']
                            )

classifier.fit(X_train, y_train)


---------                                                                                                              
CrossValMean:                                                                                                          
0.6359019635275165                                                                                                     
---------                                                                                                              
CrossValMean:                                                                                                          
0.6379085481088158                                                                                                     
---------                                                                                                              
CrossValMean:                                                                                                          
0.6442913766631285                      


KeyboardInterrupt



In [46]:
best_params = {'colsample_bytree': 0.5, 'eta': 0.8, 'gamma': 0.24, 'learning_rate': 0.04, 'max_depth': 5, 'min_child_weight': 4.0, 'n_estimators': 27, 'subsample': 0.75}

In [47]:
xgb_final_clf = XGBClassifier(**best_params)

In [56]:
fit_and_result(xgb_final_clf)

Точность на тренировочной выборке
0.601772899957788
Точность на тестовой выборке
0.5995812791247384


ROC AUC TEST: 0.6589


In [52]:
cross_val_score(estimator = xgb_final_clf, scoring='roc_auc', X = X_test, y = y_test, cv = 10)

array([0.6386161 , 0.65757365])