In [232]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from tqdm.auto import tqdm


In [199]:
# Загрузка тренировочных данных
train_data = pd.read_csv('P02/training_data.csv')
# Загрузка тестовых данных
test_data = pd.read_csv('P02/test_data.csv')

In [200]:
train_data

Unnamed: 0.1,Unnamed: 0,ID,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,78830894-cdd8-43d1-9655-03db74141b7a,Female,80.0,0,1,never,25.19,6.6,140,0
1,1,32262c0e-903a-46b8-9061-b1a2f5a3e9b2,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,2,c4868b73-ca90-48ed-af14-3d1f78a5f030,Male,28.0,0,0,never,27.32,5.7,158,0
3,3,bec2ef91-5aff-48df-ac16-cb210b5f29fa,Female,36.0,0,0,current,23.45,5.0,155,0
4,4,1a6852a8-ee80-4d93-bea0-f0cdd941dc3d,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...,...,...
73713,73713,5c890971-a6d7-44ce-a8a1-3b38cda7ef3b,Female,70.0,1,0,never,55.57,6.2,130,1
73714,73714,13a8f177-1623-4acf-9db8-1e8ac8577c86,Male,60.0,0,0,No Info,27.32,6.6,160,0
73715,73715,c9539d1b-075f-481f-8ae4-ab9bba75af5b,Male,65.0,0,0,former,32.07,5.0,160,0
73716,73716,08eb5a8d-f577-440a-9408-c6d99ca17204,Male,64.0,0,0,current,30.23,6.2,158,0


In [201]:
# Замена категориальных признаков на числовые значения
train_data = pd.get_dummies(train_data, columns=['gender', 'smoking_history'])
test_data = pd.get_dummies(test_data, columns=['gender', 'smoking_history'])
train_data = train_data.drop(['Unnamed: 0', 'ID'], axis = 1)
test_data = test_data.drop(['Unnamed: 0', 'ID'], axis =1)

In [202]:
train_data

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,0,1,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73713,70.0,1,0,55.57,6.2,130,1,1,0,0,0,0,0,0,1,0
73714,60.0,0,0,27.32,6.6,160,0,0,1,0,1,0,0,0,0,0
73715,65.0,0,0,32.07,5.0,160,0,0,1,0,0,0,0,1,0,0
73716,64.0,0,0,30.23,6.2,158,0,0,1,0,0,1,0,0,0,0


In [203]:
diabetes = train_data['diabetes']
train_data = train_data.drop('diabetes', axis = 1)
train_data = train_data.assign(diabetes = diabetes)

In [204]:
train_data

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,diabetes
0,80.0,0,1,25.19,6.6,140,1,0,0,0,0,0,0,1,0,0
1,54.0,0,0,27.32,6.6,80,1,0,0,1,0,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,1,0,0,0,0,0,1,0,0
3,36.0,0,0,23.45,5.0,155,1,0,0,0,1,0,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73713,70.0,1,0,55.57,6.2,130,1,0,0,0,0,0,0,1,0,1
73714,60.0,0,0,27.32,6.6,160,0,1,0,1,0,0,0,0,0,0
73715,65.0,0,0,32.07,5.0,160,0,1,0,0,0,0,1,0,0,0
73716,64.0,0,0,30.23,6.2,158,0,1,0,0,1,0,0,0,0,0


In [205]:
train, valid, test = np.split(train_data.sample(frac = 1), [int(0.6*len(train_data)), int(0.8*len(train_data))])

In [206]:
def scale_dataset(dataframe, oversample = False):
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    data = np.hstack((X, np.reshape(y, (-1,1))))
    
    if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)
    
    return data, X, y


In [207]:
train, X_train, y_train = scale_dataset(train, oversample = True)
valid, X_valid, y_valid = scale_dataset(valid, oversample = False)
test, X_test, y_test = scale_dataset(test, oversample = False)

In [208]:
X = X_train
y = y_train

In [209]:
from time import time

import xgboost as xgb

In [210]:
results = list()

def score(params):
    params['max_depth'] = int(params['max_depth'])
    print ("Training with params:")
    print (params)
    
    dtrain = xgb.DMatrix(X, label=y)
    r = open('hyperopt.txt','a')
    t = time()

    cvresult = xgb.cv(params, dtrain, num_boost_round=1_000, nfold=3, stratified=True,
                      metrics='auc', early_stopping_rounds=50, verbose_eval=100, seed=12345, shuffle=True)
    delta_t = time()-t
    print ("Time: {0}\n".format(delta_t))
    # print ("Score:\n{0}".format(avg_result.iloc[-1]))
    r.write(str(params) + ';' + \
            str(cvresult['test-auc-mean'].iloc[-1]) + ';' + \
            str(cvresult['test-auc-std'].iloc[-1]) + ';' + \
            str(cvresult['train-auc-mean'].iloc[-1]) + ';' + \
            str(cvresult['train-auc-std'].iloc[-1]) + ';' + \
            str(cvresult.shape[0]) + ';' + \
            str(delta_t) + '\n')
    r.close()
        
    result = params.copy()
    result['test-auc-mean'] = cvresult['test-auc-mean'].iloc[-1]
    result['test-auc-std'] = cvresult['test-auc-std'].iloc[-1]
    result['train-auc-mean'] = cvresult['train-auc-mean'].iloc[-1]    
    result['train-auc-std'] = cvresult['train-auc-std'].iloc[-1]
    results.append(result)
    
    return {'loss': -cvresult['test-auc-mean'].iloc[-1], 'status': STATUS_OK}

In [211]:
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK

In [212]:
def optimize():
    # Задаем сетку
    # Размер сетки 10*10*10*10 = 10_000
    space = {
#         'n_estimators' : 10000,
        'eta' : 0.1,

        # Model complexity
        'max_depth' : hp.quniform('max_depth', 1, 10, 1),
#         'max_depth' : hp.choice([1,2,3,4,5,6,7,8,9,10]),
#         'min_child_weight': hp.quniform('min_child_weight', 10, 100, 1),

        # Robust to noise
        'subsample' : hp.quniform('subsample', 0.1, 1.0, 0.1),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.1),
        'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1.0, 0.1),

        # 'gamma' : hp.uniform('gamma', 0.0, 1000.0),
        # 'alpha' : hp.uniform('alpha', 0.0, 1000.0),
        # 'lambda' : hp.uniform('lambda', 0.001, 1000.0),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
#         'silent' : 1,
        'seed' : 12345
    }
    trials = Trials()
    best_parameters = fmin(fn=score, space=space, algo=tpe.suggest, trials=trials, max_evals=100)
    print ("Best parameters:")
    print (best_parameters)
    print ("Best result: {0}\n".format(min(trials.losses())))
    return best_parameters

In [213]:
best_params = optimize()

Training with params:                                                                                                  
{'colsample_bylevel': 0.9, 'colsample_bytree': 0.2, 'eta': 0.1, 'eval_metric': 'auc', 'max_depth': 6, 'objective': 'binary:logistic', 'seed': 12345, 'subsample': 0.7000000000000001}
[0]	train-auc:0.84989+0.00014	test-auc:0.84907+0.00141                                                                 
[100]	train-auc:0.98073+0.00013	test-auc:0.97998+0.00025                                                               
[200]	train-auc:0.98394+0.00006	test-auc:0.98264+0.00039                                                               
[300]	train-auc:0.98583+0.00024	test-auc:0.98415+0.00021                                                               
[400]	train-auc:0.98744+0.00041	test-auc:0.98535+0.00026                                                               
[500]	train-auc:0.98895+0.00047	test-auc:0.98654+0.00032                                          

In [214]:
best_params

{'colsample_bylevel': 1.0,
 'colsample_bytree': 1.0,
 'max_depth': 9.0,
 'subsample': 0.9}

In [215]:
# Лучшие параметры по test-auc-mean
best_params['max_depth'] = int(best_params['max_depth'])
best_params

{'colsample_bylevel': 1.0,
 'colsample_bytree': 1.0,
 'max_depth': 9,
 'subsample': 0.9}

In [220]:
clf = xgb.XGBClassifier(**best_params)
clf.fit(X, y)
y_pred = pd.Series(clf.predict_proba(X_test)[:,1])

In [228]:
# Конвертация результатов в pandas dataframe
df_results = pd.DataFrame.from_records(results)

# Сортируем по убыванию метрики на тесте
df_results = df_results.sort_values(by=['test-auc-mean','train-auc-mean'], ascending=False)
df_results = df_results.reset_index(drop=True)

# Создадим доп поля по разнице метрик на train и test данных
df_results['delta_mean'] = df_results['test-auc-mean'] - df_results['train-auc-mean']
df_results['delta_std'] = df_results['test-auc-std'] - df_results['train-auc-std']

df_results = df_results.drop_duplicates(ignore_index=True)
df_results.head(20)

Unnamed: 0,colsample_bylevel,colsample_bytree,eta,eval_metric,max_depth,objective,seed,subsample,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std,delta_mean,delta_std
0,1.0,1.0,0.1,auc,9,binary:logistic,12345,0.9,0.999391,8.9e-05,0.999999,6.707346e-07,-0.000608,8.8e-05
1,0.8,1.0,0.1,auc,10,binary:logistic,12345,0.7,0.999384,0.000102,0.999999,6.510199e-07,-0.000615,0.000101
2,0.8,0.8,0.1,auc,10,binary:logistic,12345,0.8,0.999373,0.000161,0.999999,5.641164e-07,-0.000625,0.00016
3,0.8,1.0,0.1,auc,10,binary:logistic,12345,0.9,0.999369,0.000107,0.999999,6.083716e-07,-0.00063,0.000107
4,0.9,0.9,0.1,auc,9,binary:logistic,12345,0.8,0.999357,0.000117,0.999999,6.515353e-07,-0.000641,0.000116
5,0.6,1.0,0.1,auc,10,binary:logistic,12345,0.7,0.999353,0.000111,0.999999,6.540897e-07,-0.000646,0.00011
6,1.0,0.9,0.1,auc,9,binary:logistic,12345,0.8,0.999334,0.000138,0.999999,6.330751e-07,-0.000664,0.000137
7,0.8,0.8,0.1,auc,10,binary:logistic,12345,0.7,0.999326,0.000159,0.999999,6.413582e-07,-0.000672,0.000158
8,0.7,0.7,0.1,auc,10,binary:logistic,12345,0.8,0.999299,0.000139,0.999999,6.103854e-07,-0.000699,0.000138
9,0.6,1.0,0.1,auc,10,binary:logistic,12345,0.6,0.999292,0.000104,0.999999,6.285764e-07,-0.000706,0.000103


In [229]:
# Создадим дополнительный столбец для хранения среднего значения таргета
df_results.loc[:,'target_mean'] = 0
df_results

Unnamed: 0,colsample_bylevel,colsample_bytree,eta,eval_metric,max_depth,objective,seed,subsample,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std,delta_mean,delta_std,target_mean
0,1.0,1.0,0.1,auc,9,binary:logistic,12345,0.9,0.999391,0.000089,0.999999,6.707346e-07,-0.000608,0.000088,0
1,0.8,1.0,0.1,auc,10,binary:logistic,12345,0.7,0.999384,0.000102,0.999999,6.510199e-07,-0.000615,0.000101,0
2,0.8,0.8,0.1,auc,10,binary:logistic,12345,0.8,0.999373,0.000161,0.999999,5.641164e-07,-0.000625,0.000160,0
3,0.8,1.0,0.1,auc,10,binary:logistic,12345,0.9,0.999369,0.000107,0.999999,6.083716e-07,-0.000630,0.000107,0
4,0.9,0.9,0.1,auc,9,binary:logistic,12345,0.8,0.999357,0.000117,0.999999,6.515353e-07,-0.000641,0.000116,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,0.7,0.2,0.1,auc,2,binary:logistic,12345,0.5,0.981213,0.000294,0.981985,1.709923e-04,-0.000772,0.000123,0
91,0.4,0.3,0.1,auc,2,binary:logistic,12345,0.4,0.979668,0.000331,0.980203,1.466750e-04,-0.000534,0.000184,0
92,1.0,0.8,0.1,auc,1,binary:logistic,12345,0.8,0.979522,0.000257,0.979812,9.746878e-05,-0.000290,0.000160,0
93,0.6,0.7,0.1,auc,1,binary:logistic,12345,0.4,0.979406,0.000229,0.979739,1.069212e-04,-0.000332,0.000123,0


In [230]:
# Строка -> dict
df_results.iloc[0][:-7].to_dict()

{'colsample_bylevel': 1.0,
 'colsample_bytree': 1.0,
 'eta': 0.1,
 'eval_metric': 'auc',
 'max_depth': 9,
 'objective': 'binary:logistic',
 'seed': 12345,
 'subsample': 0.9}

In [235]:
# Рассчитаем среднюю долю таргета для каждого набора параметров
for i in tqdm(range(df_results.shape[0])):
    params = df_results.iloc[i][:-7].to_dict()
    clf = xgb.XGBClassifier(**params)
    clf.fit(X, y)
    df_results.loc[i,'target_mean'] = np.mean(clf.predict(X_test))

  0%|          | 0/95 [00:00<?, ?it/s]

In [242]:
df_results

Unnamed: 0,colsample_bylevel,colsample_bytree,eta,eval_metric,max_depth,objective,seed,subsample,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std,delta_mean,delta_std,target_mean
0,1.0,1.0,0.1,auc,9,binary:logistic,12345,0.9,0.999391,0.000089,0.999999,6.707346e-07,-0.000608,0.000088,0.154707
1,0.8,1.0,0.1,auc,10,binary:logistic,12345,0.7,0.999384,0.000102,0.999999,6.510199e-07,-0.000615,0.000101,0.135445
2,0.8,0.8,0.1,auc,10,binary:logistic,12345,0.8,0.999373,0.000161,0.999999,5.641164e-07,-0.000625,0.000160,0.145279
3,0.8,1.0,0.1,auc,10,binary:logistic,12345,0.9,0.999369,0.000107,0.999999,6.083716e-07,-0.000630,0.000107,0.142092
4,0.9,0.9,0.1,auc,9,binary:logistic,12345,0.8,0.999357,0.000117,0.999999,6.515353e-07,-0.000641,0.000116,0.153554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,0.7,0.2,0.1,auc,2,binary:logistic,12345,0.5,0.981213,0.000294,0.981985,1.709923e-04,-0.000772,0.000123,0.170985
91,0.4,0.3,0.1,auc,2,binary:logistic,12345,0.4,0.979668,0.000331,0.980203,1.466750e-04,-0.000534,0.000184,0.176953
92,1.0,0.8,0.1,auc,1,binary:logistic,12345,0.8,0.979522,0.000257,0.979812,9.746878e-05,-0.000290,0.000160,0.202591
93,0.6,0.7,0.1,auc,1,binary:logistic,12345,0.4,0.979406,0.000229,0.979739,1.069212e-04,-0.000332,0.000123,0.187059


In [237]:
y_test.mean()

0.08471242539338036

In [222]:
top_n = int(y_test.mean()*len(y_pred))
top_n

1249

In [224]:
# Новый порог
thd = y_pred.sort_values(ascending=False).head(top_n).iloc[-1]
thd

0.60259074

In [225]:
# Конвертируем в ответы с учетом нового порога
y_pred[y_pred >= thd] = 1
y_pred[y_pred < thd] = 0
y_pred = y_pred.astype(int)
y_pred

0        0
1        1
2        0
3        0
4        0
        ..
14739    0
14740    0
14741    0
14742    0
14743    0
Length: 14744, dtype: int32

In [226]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     13495
           1       0.76      0.76      0.76      1249

    accuracy                           0.96     14744
   macro avg       0.87      0.87      0.87     14744
weighted avg       0.96      0.96      0.96     14744

