In [1]:
import time

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import log_loss, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold, ParameterGrid

In [2]:
RANDOM_SEED = 42
pd.options.mode.chained_assignment = None

In [3]:
data = pd.read_csv('../data/4/grbs_finished.csv', encoding='utf-8')
print(data.shape)
data.head()

(53979, 47)


Unnamed: 0,valID,cntrID,supID,orgID,okpdID,sup_cntr_num,sup_running_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,...,sup_mun_cntr_share,sup_okpd_exp,org_good_cntr_share,org_fed_cntr_share,org_sub_cntr_share,org_mun_cntr_share,okpd_good_cntr_share,quarter,cntr_length,one_day_price
0,57692890,32352291,10135334,467566,153861,6,3,165150,0.0,1.0,...,0.0,0.166667,0.99006,1.0,0.0,0.0,1.011099,2,233,1001
1,211683097,34748597,10490076,480718,151943,1,0,255590,0.0,1.0,...,0.0,1.0,0.985011,1.0,0.0,0.0,1.010274,3,129,1981
2,58994828,32503663,10147390,520420,151705,217,162,60633,0.0,1.0,...,0.0,0.603687,0.968401,1.0,0.0,0.0,1.036036,3,84,1017
3,208185735,40043960,10197821,207848,150662,3,0,187902,0.0,1.0,...,0.333333,0.666667,0.993682,1.0,0.0,0.0,1.07904,1,71,5752
4,59431305,33698468,10326174,90470,157012,3,0,180288,0.0,1.0,...,0.0,0.666667,0.74,1.0,0.0,0.0,1.070969,4,67,788


In [4]:
def process_numerical(df, train=True):
    """Обработка количественных переменных"""
    
    if train:
        process_numerical.cache = {}
        scaler = StandardScaler()
    else:
        scaler = load_scaler()
    
    for nv in df[num_var]:
        if train:
            ulimit = np.percentile(df[nv].values, 99)
            dlimit = np.percentile(df[nv].values, 1)
            process_numerical.cache[nv] = (dlimit, ulimit)
        else:
            ulimit = process_numerical.cache[nv][1]
            dlimit = process_numerical.cache[nv][0]
            
        df.loc[df[nv] > ulimit, nv] = ulimit
        df.loc[df[nv] < dlimit, nv] = dlimit
    
    for nv in df[num_var]:
        df.loc[df[nv] < 1, nv] = 1
        df.loc[:,nv] = np.log(df[nv])
    
    if train:
        df.loc[:,num_var] = scaler.fit_transform(df[num_var])
        # save_scaler(scaler)
    else:
        df.loc[:,num_var] = scaler.transform(df[num_var])
        
    return df

def process_nominal(df, train=True):
    """Обработка номинальных переменных"""
    
    if train:
        group_converters = {}
        woe_converters = {}

        for cv in cat_var:
            cnt = df[cv].value_counts()
            for val, count in zip(cnt.index, cnt.values):
                if count / df.shape[0] <= 0.005:
                    df.loc[df[cv] == val, cv] = 'NEW'

            conv = {}
            for val in set(df[cv]):
                if val != 'NEW': conv[val] = val

            group_converters[cv] = conv

        for cv in cat_var:
            cnt = df[cv].value_counts()
            conv = {}
            for val, count in zip(cnt.index, cnt.values):
                good_with_val = df.loc[(df.cntr_result == 1) & (df[cv] == val)].shape[0]
                bad_with_val = df.loc[(df.cntr_result == 0) & (df[cv] == val)].shape[0]

                p = good_with_val / df.loc[df.cntr_result == 1].shape[0]
                q = bad_with_val / df.loc[df.cntr_result == 0].shape[0]
                woe = round(np.log(p / q), 3)

                conv[val] = woe
                df.loc[df[cv]==val, cv] = round(np.log(p / q), 3)

            woe_converters[cv] = conv
       
        process_nominal.gp_convs = group_converters
        process_nominal.woe_convs = woe_converters
    else:
        for cv in cat_var:
            df.loc[:,cv] = df[cv].map(process_nominal.gp_convs[cv])
            df.loc[:,cv] = df[cv].fillna('NEW')
            df.loc[:,cv] = df[cv].map(process_nominal.woe_convs[cv])

    return df

In [5]:
# Список количественных переменных с нефиксированной областью значений
num_var = [
    'sup_cntr_num', 'sup_running_cntr_num', 'sup_cntr_avg_price', 'org_cntr_num', 
    'org_cntr_avg_price', 'org_running_cntr_num','price', 'pmp',
    'cntr_num_together', 'cntr_length', 'one_day_price'
]

# Список количественных переменных с областью значений от 0 до 1
# Без учета 'sup_okpd_exp'
num_var01 = [
        'sup_good_cntr_share', 'sup_fed_cntr_share', 'sup_sub_cntr_share', 
        'sup_mun_cntr_share', 'sup_cntr_avg_penalty_share', 'sup_1s_sev', 'sup_1s_org_sev',  
        'sup_no_pnl_share', 'sup_sim_price_share', 'org_good_cntr_share', 'org_fed_cntr_share', 
        'org_sub_cntr_share', 'org_mun_cntr_share', 'org_1s_sev', 'org_1s_sup_sev', 'org_sim_price_share', 
        'okpd_good_cntr_share'
    ]

# Список категориальных переменных
cat_var = ['org_type', 'okpd2', 'purch_type', 'quarter']

# Список бинарных переменных
cat_bin_var = ['price_higher_pmp', 'price_too_low']

In [6]:
def initial_preprocess(df):
    """Начальная предобработка"""
    
    # Список бинарных переменных
    cat_bin_var = ['price_higher_pmp', 'price_too_low']
    
    # Удаление неважных количественных переменных
    f0or nv in ('cntr_num_together', 'price', 'pmp'):
        num_var.remove(nv)

    for nv01 in ('sup_cntr_avg_penalty_share', 'sup_1s_sev', 'sup_1s_org_sev', 
        'sup_no_pnl_share', 'org_fed_cntr_share', 'org_sub_cntr_share', 
        'org_mun_cntr_share', 'org_1s_sev', 'org_1s_sup_sev'):
        num_var01.remove(nv01)

    # Удаление бинарных переменных
    cat_bin_var.clear()
    
    # Перемена местами обозначений целевой переменной: "0" соответствует хорошему контракту, "1" - плохому 
    df.loc[df.cntr_result == 0, 'cntr_result'] = 2
    df.loc[df.cntr_result == 1, 'cntr_result'] = 0
    df.loc[df.cntr_result == 2, 'cntr_result'] = 1
    
    # Предобработка
    df = process_numerical(df)
    df = process_nominal(df)
    
    # Формирование финального набора переменных
    var = ['cntrID']
    var.extend(num_var)
    var.extend(num_var01)
    var.extend(cat_var)
    
    return df[var + ['cntr_result']]

In [7]:
def get_classifier():
    """Набор классификаторов с параметрами"""
    
    classifiers = {
        "LogReg": (LogisticRegression, ParameterGrid({
            'random_state': [RANDOM_SEED],
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'solver': ['liblinear', 'saga'],
            'penalty': ['l1', 'l2']
            })),
        "RandForest": (RandomForestClassifier, ParameterGrid({
            'random_state': [RANDOM_SEED],
            'n_estimators': [10, 100, 200, 400, 600, 800, 1000],
            'max_depth': [3, 4, 6, 8]
        })),
        "XGBoost": (XGBClassifier, ParameterGrid({
            'random_state': [RANDOM_SEED],
            "eta": [0.03, 0.1, 0.3],
            "n_estimators": [100, 200, 400, 800, 1000],
            "max_depth": [3, 4, 6, 8],
            "logging_level": ['Silent'],
            'subsample': [0.7, 0.85, 1]
        }))
    }
    
    for clf_name in classifiers:
        for params in classifiers[clf_name][1]:
            yield clf_name, params, classifiers[clf_name][0](**params)

In [8]:
def choose_best_classifier(X, y, kfolds=10, valid=0.2):
    """Выбор лучшего классификатора"""
    
    # TODO: Переделать процесс оценки качества модели
    # TODO: Добавить дисперсию для каждой метрики на основе значений метрики в каждом фолде
    
    start_time = time.time()
    LOG_FILE = 'logs.log'
    
    stats = np.array([])
    row_names = []
    column_names = [
        'train_acc', 'train_auc', 'train_ll', 'test_acc', 'test_auc', 'test_ll',
        'valid_acc', 'valid_auc', 'valid_ll', 'time'
    ]
    
    best_clf_name = None
    best_params = None
    
    best_test_log_loss = None
    best_y_test_pred = None
    best_y_test_pred_proba = None
    best_y_test_real = None
    
    best_y_valid_pred = None
    best_y_valid_pred_proba = np.zeros((0, 2))
    best_y_valid_real = None
    best_valid_log_loss = 100
    
    # Формирование обучающей и валидационной выборки
    val_ind = int(X.shape[0] * valid)
    X_valid, y_valid = X[:val_ind,:], y[:val_ind]
    X, y = X[val_ind:,:], y[val_ind:]
    
    for clf_name, params, clf in get_classifier():
        y_train_pred = np.array([])
        y_train_real = np.array([])
        y_train_pred_proba = np.zeros((0, 2))
        
        y_test_pred = np.array([])
        y_test_pred_proba = np.zeros((0, 2))
        y_test_real = np.array([])
        
        row_names.append(clf_name + ' ' + '_'.join(str(param) for param in params.values()))
        
        # Обучение модели на 10-кратной кросс-валидации
        start_learning_time = time.time()
        kfold_generator = StratifiedKFold(n_splits=kfolds)
        for train_index, test_index in kfold_generator.split(X, y):
            X_train = X[train_index]
            y_train = y[train_index]

            X_test = X[test_index]
            y_test = y[test_index]
            
            clf.fit(X_train, y_train)
            
            y_train_real = np.concatenate((y_train_real, y_train))
            y_train_pred = np.concatenate((y_train_pred, clf.predict(X_train)))
            y_train_pred_proba = np.concatenate((y_train_pred_proba, clf.predict_proba(X_train)))
            
            y_test_real = np.concatenate((y_test_real, y_test))
            y_test_pred = np.concatenate((y_test_pred, clf.predict(X_test)))
            y_test_pred_proba = np.concatenate((y_test_pred_proba, clf.predict_proba(X_test)))
            
        
        learning_time = time.time() - start_learning_time
        
        # Выбор лучшего алгоритма
        if log_loss(y_valid, clf.predict_proba(X_valid)) < best_valid_log_loss:
            best_clf_name = clf_name
            best_params = params
            
            best_y_test_pred = y_test_pred
            best_y_test_pred_proba = y_test_pred_proba
            best_y_test_real = y_test_real
            best_test_log_loss = log_loss(y_test_real, y_test_pred_proba)
            
            best_y_valid_pred = clf.predict(X_valid)
            best_y_valid_pred_proba = clf.predict_proba(X_valid)
            best_y_valid_real = y_valid
            best_valid_log_loss = log_loss(y_valid, best_y_valid_pred_proba)
        
        # Расчет метрик для тренировочной, тестовой, валидационной выборки
        train_acc = accuracy_score(y_train_real, y_train_pred)
        train_auc = roc_auc_score(y_train_real, y_train_pred)
        train_ll = log_loss(y_train_real, y_train_pred_proba)

        test_acc = accuracy_score(y_test_real, y_test_pred)
        test_auc = roc_auc_score(y_test_real, y_test_pred)
        test_ll = log_loss(y_test_real, y_test_pred_proba)

        valid_acc = accuracy_score(y_valid, clf.predict(X_valid))
        valid_auc = roc_auc_score(y_valid, clf.predict(X_valid))
        valid_ll = log_loss(y_valid, clf.predict_proba(X_valid))
        
        metrics = [
            train_acc, train_auc, train_ll,
            test_acc, test_auc, test_ll,valid_acc, valid_auc, valid_ll,
            int(learning_time)
        ]
        
        key_metrics_str = (
            '{} {}\ntrain: ({:.3f}, {:.3f}, {:.3f}) '
            'test: ({:.3f}, {:.3f}, {:.3f}) '
            'valid: ({:.3f}, {:.3f}, {:.3f}) - {}\n').format(
                clf_name, params, train_acc, train_auc, train_ll,
                test_acc, test_auc, test_ll, valid_acc, valid_auc, valid_ll,
                int(learning_time)
        )
        print(key_metrics_str)
        
        # Добавление расчетов в массив
        if stats.shape[0]:
            stats = np.vstack((stats, np.array(metrics)))
        else:
            stats = np.array(metrics)
            
        # Запись логов
        with open(LOG_FILE, 'a', encoding='utf-8') as file:
            file.write(key_metrics_str)
        
    best_algorithm_str = 'Лучший алгоритм {} с параметрами {} с valid_acc: {:.3f}, valid_ll: {:.3f}\n'.format(
        best_clf_name,
        best_params,
        accuracy_score(best_y_valid_real, best_y_valid_pred),
        log_loss(best_y_valid_real, best_y_valid_pred_proba)
    )
    print(best_algorithm_str)
    with open(LOG_FILE, 'a', encoding='utf-8') as file:
            file.write(best_algorithm_str)

    print(classification_report(
        best_y_valid_real,
        best_y_valid_pred,
        target_names=tuple(('Хороший', 'Плохой'))
    ))
    
    time_consumed_str ='Выбор лучшего классификатора занял %s секунд' % int((time.time() - start_time))
    print(time_consumed_str)
    with open(LOG_FILE, 'a', encoding='utf-8') as file:
            file.write(time_consumed_str)
    
    result = pd.DataFrame(data=stats, columns=column_names, index=row_names)
    return result

In [9]:
%%time
data = initial_preprocess(data.copy())

Wall time: 1.8 s


In [10]:
# Балансировка выборки
bad_cntr = data.loc[data.cntr_result == 1]
good_cntr = data.loc[data.cntr_result == 0].sample(bad_cntr.shape[0], random_state=RANDOM_SEED)
data = bad_cntr.append(good_cntr)

print(data.shape)
data.head()

(35468, 22)


Unnamed: 0,cntrID,sup_cntr_num,sup_running_cntr_num,sup_cntr_avg_price,org_cntr_num,org_cntr_avg_price,org_running_cntr_num,cntr_length,one_day_price,sup_good_cntr_share,...,sup_mun_cntr_share,sup_sim_price_share,org_good_cntr_share,org_sim_price_share,okpd_good_cntr_share,org_type,okpd2,purch_type,quarter,cntr_result
4,33698468,-0.580939,-0.699322,-0.425817,-0.213088,-0.528096,0.428038,-0.366531,-0.705934,0.666667,...,0.0,0.333,0.74,0.111,1.070969,0.218,1.62,-0.081,-0.039,1.0
5,27661203,-0.415206,-0.699322,0.387776,-1.048033,0.07287,-0.473172,1.073282,-0.215795,0.75,...,0.0,0.25,0.619403,0.142,1.108748,-0.356,0.435,-0.081,0.5,1.0
7,33682440,-0.580939,-0.699322,-0.151879,1.010943,2.17848,0.724813,1.400113,-0.554295,0.666667,...,0.0,0.333,0.874825,0.088,1.049705,0.041,1.244,0.238,-0.039,1.0
8,34009214,-0.580939,-0.699322,2.854341,0.428582,1.230921,1.145543,-0.710917,1.605741,0.666667,...,0.0,0.333,0.819672,0.081,1.108748,0.218,0.435,-0.081,-0.039,1.0
9,31700286,-0.181618,-0.243157,0.240404,-0.815883,0.624808,-1.532427,0.29948,0.613241,0.833333,...,0.0,0.167,0.994286,0.074,1.013832,0.041,-0.677,-0.081,-0.172,1.0


In [11]:
data = data.sample(frac=1, random_state=RANDOM_SEED)

In [12]:
X = data.drop(['cntr_result', 'cntrID'], axis=1).values
y = data.cntr_result.values

In [13]:
result = choose_best_classifier(X, y)

LogRer {'C': 0.001, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
train: (0.801, 0.801, 0.497) test: (0.800, 0.800, 0.497) valid: (0.801, 0.801, 0.495) - 1

LogRer {'C': 0.001, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.835, 0.835, 0.426) test: (0.834, 0.834, 0.426) valid: (0.836, 0.836, 0.426) - 2

LogRer {'C': 0.001, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
train: (0.800, 0.800, 0.477) test: (0.800, 0.800, 0.477) valid: (0.802, 0.802, 0.472) - 1

LogRer {'C': 0.001, 'penalty': 'l2', 'random_state': 42, 'solver': 'saga'}
train: (0.811, 0.811, 0.467) test: (0.811, 0.811, 0.467) valid: (0.816, 0.816, 0.462) - 2

LogRer {'C': 0.01, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
train: (0.885, 0.885, 0.283) test: (0.885, 0.885, 0.283) valid: (0.886, 0.886, 0.281) - 17

LogRer {'C': 0.01, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.893, 0.893, 0.266) test: (0.892, 0.892, 0.267) valid: (0.891, 0.891, 0.



LogRer {'C': 1, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.901, 0.901, 0.249) test: (0.901, 0.901, 0.250) valid: (0.903, 0.903, 0.245) - 16

LogRer {'C': 1, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
train: (0.900, 0.900, 0.251) test: (0.900, 0.900, 0.252) valid: (0.899, 0.899, 0.247) - 3

LogRer {'C': 1, 'penalty': 'l2', 'random_state': 42, 'solver': 'saga'}
train: (0.900, 0.900, 0.250) test: (0.900, 0.900, 0.251) valid: (0.901, 0.901, 0.246) - 13

LogRer {'C': 10, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
train: (0.901, 0.901, 0.249) test: (0.901, 0.901, 0.250) valid: (0.903, 0.903, 0.245) - 40

LogRer {'C': 10, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.901, 0.901, 0.249) test: (0.901, 0.901, 0.250) valid: (0.903, 0.903, 0.245) - 16

LogRer {'C': 10, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
train: (0.901, 0.901, 0.249) test: (0.901, 0.901, 0.250) valid: (0.903, 0.903, 0.245) - 3

LogRer

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 800, 'random_state': 42, 'subsample': 0.7}
train: (0.958, 0.958, 0.110) test: (0.924, 0.924, 0.179) valid: (0.929, 0.929, 0.168) - 161

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 800, 'random_state': 42, 'subsample': 0.85}
train: (0.958, 0.958, 0.110) test: (0.924, 0.924, 0.178) valid: (0.930, 0.931, 0.167) - 153

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 800, 'random_state': 42, 'subsample': 1}
train: (0.955, 0.955, 0.116) test: (0.923, 0.923, 0.178) valid: (0.929, 0.929, 0.166) - 137

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42, 'subsample': 0.7}
train: (0.964, 0.964, 0.100) test: (0.923, 0.923, 0.180) valid: (0.929, 0.929, 0.168) - 202

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42, 'subsample': 0.85}
train:

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 100, 'random_state': 42, 'subsample': 1}
train: (0.964, 0.964, 0.103) test: (0.923, 0.923, 0.176) valid: (0.932, 0.932, 0.165) - 49

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 200, 'random_state': 42, 'subsample': 0.7}
train: (0.993, 0.993, 0.049) test: (0.922, 0.922, 0.185) valid: (0.928, 0.928, 0.175) - 112

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 200, 'random_state': 42, 'subsample': 0.85}
train: (0.993, 0.993, 0.052) test: (0.923, 0.923, 0.183) valid: (0.929, 0.929, 0.171) - 107

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 200, 'random_state': 42, 'subsample': 1}
train: (0.988, 0.988, 0.060) test: (0.923, 0.923, 0.181) valid: (0.931, 0.931, 0.168) - 98

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 400, 'random_state': 42, 'subsample': 0.7}
train: (1.000

XGBoost {'eta': 0.1, 'logging_level': 'Silent', 'max_depth': 4, 'n_estimators': 800, 'random_state': 42, 'subsample': 0.85}
train: (0.981, 0.981, 0.071) test: (0.923, 0.923, 0.184) valid: (0.929, 0.929, 0.170) - 203

XGBoost {'eta': 0.1, 'logging_level': 'Silent', 'max_depth': 4, 'n_estimators': 800, 'random_state': 42, 'subsample': 1}
train: (0.977, 0.977, 0.079) test: (0.923, 0.923, 0.183) valid: (0.927, 0.927, 0.169) - 182

XGBoost {'eta': 0.1, 'logging_level': 'Silent', 'max_depth': 4, 'n_estimators': 1000, 'random_state': 42, 'subsample': 0.7}
train: (0.988, 0.988, 0.058) test: (0.921, 0.921, 0.191) valid: (0.927, 0.927, 0.179) - 269

XGBoost {'eta': 0.1, 'logging_level': 'Silent', 'max_depth': 4, 'n_estimators': 1000, 'random_state': 42, 'subsample': 0.85}
train: (0.988, 0.988, 0.059) test: (0.922, 0.922, 0.189) valid: (0.926, 0.926, 0.174) - 253

XGBoost {'eta': 0.1, 'logging_level': 'Silent', 'max_depth': 4, 'n_estimators': 1000, 'random_state': 42, 'subsample': 1}
train: (0.98

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 200, 'random_state': 42, 'subsample': 0.85}
train: (0.933, 0.933, 0.156) test: (0.922, 0.922, 0.176) valid: (0.931, 0.931, 0.165) - 38

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 200, 'random_state': 42, 'subsample': 1}
train: (0.930, 0.930, 0.159) test: (0.921, 0.921, 0.177) valid: (0.931, 0.931, 0.165) - 35

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 400, 'random_state': 42, 'subsample': 0.7}
train: (0.943, 0.943, 0.136) test: (0.922, 0.922, 0.176) valid: (0.931, 0.931, 0.166) - 81

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 400, 'random_state': 42, 'subsample': 0.85}
train: (0.943, 0.943, 0.137) test: (0.923, 0.923, 0.175) valid: (0.930, 0.930, 0.164) - 77

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 400, 'random_state': 42, 'subsample': 1}
train: (0.940, 0.94

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 6, 'n_estimators': 1000, 'random_state': 42, 'subsample': 0.85}
train: (1.000, 1.000, 0.013) test: (0.921, 0.921, 0.222) valid: (0.929, 0.929, 0.203) - 402

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 6, 'n_estimators': 1000, 'random_state': 42, 'subsample': 1}
train: (1.000, 1.000, 0.017) test: (0.921, 0.921, 0.216) valid: (0.929, 0.929, 0.195) - 360

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 100, 'random_state': 42, 'subsample': 0.7}
train: (0.968, 0.968, 0.095) test: (0.923, 0.923, 0.177) valid: (0.929, 0.929, 0.167) - 52

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 100, 'random_state': 42, 'subsample': 0.85}
train: (0.968, 0.968, 0.097) test: (0.923, 0.923, 0.176) valid: (0.932, 0.932, 0.166) - 52

XGBoost {'eta': 0.3, 'logging_level': 'Silent', 'max_depth': 8, 'n_estimators': 100, 'random_state': 42, 'subsample': 1}
train: (0.964, 

In [14]:
# Экспорт результатов
result.to_csv(path_or_buf='parameter_tuning_v4.csv')