In [1]:
import time
import datetime

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import log_loss, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold, ParameterGrid

In [2]:
RANDOM_SEED=42

In [3]:
def get_classifier():
    """Набор классификаторов с параметрами"""
    
    classifiers = {
        "LogRer": (LogisticRegression, ParameterGrid({
            'random_state': [RANDOM_SEED],
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'solver': ['liblinear', 'saga'],
            'penalty': ['l1', 'l2']
            })),
        "RandForest": (RandomForestClassifier, ParameterGrid({
            'random_state': [RANDOM_SEED],
            'n_estimators': [10, 100, 200, 400, 600, 800, 1000],
            'max_depth': [3, 4, 6, 8]
        })),
        "XGBoost": (XGBClassifier, ParameterGrid({
            'random_state': [RANDOM_SEED],
            "eta": [0.03, 0.1, 0.3],
            "n_estimators": [100, 200, 400, 800, 1000],
            "max_depth": [3, 4, 6, 8],
            "logging_level": ['Silent'],
            'subsample': [0.7, 0.85, 1]
        }))
    }
    
    for clf_name in classifiers:
        for params in classifiers[clf_name][1]:
            yield clf_name, params, classifiers[clf_name][0](**params)

In [4]:
def choose_best_classifier(X, y, kfolds=10, valid=0.2):
    """Выбор лучшего классификатора"""
    
    start_time = time.time()
    LOG_FILE = 'logs.log'
    
    stats = np.array([])
    row_names = []
    column_names = [
        'train_acc', 'train_auc', 'train_ll', 'test_acc', 'test_auc', 'test_ll',
        'valid_acc', 'valid_auc', 'valid_ll', 'time'
    ]
    
    best_clf_name = None
    best_params = None
    
    best_test_log_loss = None
    best_y_test_pred = None
    best_y_test_pred_proba = None
    best_y_test_real = None
    
    best_y_valid_pred = None
    best_y_valid_pred_proba = np.zeros((0, 2))
    best_y_valid_real = None
    best_valid_log_loss = 100
    
    # Формирование обучающей и валидационной выборки
    val_ind = int(X.shape[0] * valid)
    X_valid, y_valid = X[:val_ind,:], y[:val_ind]
    X, y = X[val_ind:,:], y[val_ind:]
    
    for clf_name, params, clf in get_classifier():
        y_train_pred = np.array([])
        y_train_real = np.array([])
        y_train_pred_proba = np.zeros((0, 2))
        
        y_test_pred = np.array([])
        y_test_pred_proba = np.zeros((0, 2))
        y_test_real = np.array([])
        
        row_names.append(clf_name + ' ' + '_'.join(str(param) for param in params.values()))
        
        # Обучение модели на 10-кратной кросс-валидации
        start_learning_time = time.time()
        kfold_generator = StratifiedKFold(n_splits=kfolds)
        for train_index, test_index in kfold_generator.split(X, y):
            X_train = X[train_index]
            y_train = y[train_index]

            X_test = X[test_index]
            y_test = y[test_index]
            
            clf.fit(X_train, y_train)
            
            y_train_real = np.concatenate((y_train_real, y_train))
            y_train_pred = np.concatenate((y_train_pred, clf.predict(X_train)))
            y_train_pred_proba = np.concatenate((y_train_pred_proba, clf.predict_proba(X_train)))
            
            y_test_real = np.concatenate((y_test_real, y_test))
            y_test_pred = np.concatenate((y_test_pred, clf.predict(X_test)))
            y_test_pred_proba = np.concatenate((y_test_pred_proba, clf.predict_proba(X_test)))
            
        
        learning_time = time.time() - start_learning_time
        
        # Выбор лучшего алгоритма
        if log_loss(y_valid, clf.predict_proba(X_valid)) < best_valid_log_loss:
            best_clf_name = clf_name
            best_params = params
            
            best_y_test_pred = y_test_pred
            best_y_test_pred_proba = y_test_pred_proba
            best_y_test_real = y_test_real
            best_test_log_loss = log_loss(y_test_real, y_test_pred_proba)
            
            best_y_valid_pred = clf.predict(X_valid)
            best_y_valid_pred_proba = clf.predict_proba(X_valid)
            best_y_valid_real = y_valid
            best_valid_log_loss = log_loss(y_valid, best_y_valid_pred_proba)
        
        # Расчет метрик для тренировочной, тестовой, валидационной выборки
        train_acc = accuracy_score(y_train_real, y_train_pred)
        train_auc = roc_auc_score(y_train_real, y_train_pred)
        train_ll = log_loss(y_train_real, y_train_pred_proba)

        test_acc = accuracy_score(y_test_real, y_test_pred)
        test_auc = roc_auc_score(y_test_real, y_test_pred)
        test_ll = log_loss(y_test_real, y_test_pred_proba)

        valid_acc = accuracy_score(y_valid, clf.predict(X_valid))
        valid_auc = roc_auc_score(y_valid, clf.predict(X_valid))
        valid_ll = log_loss(y_valid, clf.predict_proba(X_valid))
        
        metrics = [
            train_acc, train_auc, train_ll,
            test_acc, test_auc, test_ll,valid_acc, valid_auc, valid_ll,
            int(learning_time)
        ]
        
        key_metrics_str = (
            '{} {}\ntrain: ({:.3f}, {:.3f}, {:.3f}) '
            'test: ({:.3f}, {:.3f}, {:.3f}) '
            'valid: ({:.3f}, {:.3f}, {:.3f}) - {}\n').format(
                clf_name, params, train_acc, train_auc, train_ll,
                test_acc, test_auc, test_ll, valid_acc, valid_auc, valid_ll,
                int(learning_time)
        )
        print(key_metrics_str)
        
        # Добавление расчетов в массив
        if stats.shape[0]:
            stats = np.vstack((stats, np.array(metrics)))
        else:
            stats = np.array(metrics)
            
        # Запись логов
        with open(LOG_FILE, 'a', encoding='utf-8') as file:
            file.write(key_metrics_str)
        
    best_algorithm_str = 'Лучший алгоритм {} с параметрами {} с valid_acc: {:.3f}, valid_ll: {:.3f}\n'.format(
        best_clf_name,
        best_params,
        accuracy_score(best_y_valid_real, best_y_valid_pred),
        log_loss(best_y_valid_real, best_y_valid_pred_proba)
    )
    print(best_algorithm_str)
    with open(LOG_FILE, 'a', encoding='utf-8') as file:
            file.write(best_algorithm_str)

    print(classification_report(
        best_y_valid_real,
        best_y_valid_pred,
        target_names=tuple(('Хороший', 'Плохой'))
    ))
    
    time_consumed_str ='Выбор лучшего классификатора занял %s секунд' % int((time.time() - start_time))
    print(time_consumed_str)
    with open(LOG_FILE, 'a', encoding='utf-8') as file:
            file.write(time_consumed_str)
    
    result = pd.DataFrame(data=stats, columns=column_names, index=row_names)
    return result

In [5]:
def preprocess_initial_dataset(data):
    """Предобработка данных"""
    
    # Удаление строк с пропущенными значениями
    data = data.dropna()

    # Балансировка выборки
    bad_cntr = data.loc[data.cntr_result == 0]
    good_cntr = data.loc[data.cntr_result == 1].sample(bad_cntr.shape[0], random_state=RANDOM_SEED)
    data = bad_cntr.append(good_cntr)
    
    # Перемена местами обозначений целевой переменной: "0" соответствует хорошему контракту, "1" - плохому 
    data.loc[data.cntr_result == 0, 'cntr_result'] = 2
    data.loc[data.cntr_result == 1, 'cntr_result'] = 0
    data.loc[data.cntr_result == 2, 'cntr_result'] = 1
    
     # Удаление неинформативных переменных (доля хороших контрактов по территории и ID контракта)
    data.drop(['ter_good_cntr_share', 'cntrID'], inplace=True, axis=1)
    # Удаление данных с пропущенной датой окончания контракта
    data.drop(data[data.exec_date == -1].index, inplace=True)
    
    num_var = ['sup_cntr_num', 'sup_cntr_avg_price', 'org_cntr_num', 'org_cntr_avg_price', 'cntr_length']
    num_var01 = [
        'sup_good_cntr_share', 'sup_fed_cntr_share', 'sup_sub_cntr_share', 
        'sup_nun_cntr_share', 'sup_sim_price_share', 'org_good_cntr_share', 'org_fed_cntr_share', 
        'org_sub_cntr_share', 'org_nun_cntr_share', 'okpd_good_cntr_share']
    cat_var = ['org_form', 'org_type', 'cntr_lvl', 'purch_type', 'okpd_2', 'okpd_3', 'okpd_4', 'quarter']
    
    # Создание дополнительных переменных, использующих первые 2, 3 и 4 символа ОКПД
    data['okpd_2'] = data['okpd'].apply(lambda a: pd.Series(a[:2]))
    data['okpd_3'] = data['okpd'].apply(lambda a: pd.Series(a[:3]))
    data['okpd_4'] = data['okpd'].apply(lambda a: pd.Series(a[:4]))
    
    # Добавление переменной, отражающей квартал заключения контракта
    sign_month = data['sign_date'].apply(lambda a: pd.Series(int(str(a)[4:6])))
    data['quarter'] = sign_month.apply(lambda a: pd.Series((a - 1) // 3 + 1))
    
    # Рассчет длительности контракта
    cntr_start = data['sign_date'].apply(lambda a: pd.Series(datetime.datetime.strptime(str(a), "%Y%m%d").date()))
    cntr_end = data['exec_date'].apply(lambda a: pd.Series(datetime.datetime.strptime(str(a), "%Y%m%d").date()))
    data['cntr_length'] = (cntr_end - cntr_start)[0].apply(lambda a: pd.Series(int(str(a).split()[0])))
    
    # Обработка выбросов для количественных переменных с нефиксированной областью значений
    for nv in data[num_var]:
        ulimit = np.percentile(data[nv].values, 99)
        dlimit = np.percentile(data[nv].values, 1)
        data.loc[data[nv] > ulimit, nv] = ulimit
        data.loc[data[nv] < dlimit, nv] = dlimit
    
    for idx, nv in enumerate(('sup_cntr_avg_price', 'org_cntr_avg_price', 'cntr_length')):
        if idx != 2:
            ulimit = np.percentile(data[nv].values, 95)
            data.loc[data[nv] > ulimit, nv] = ulimit  
        else:   
            dlimit = np.percentile(data[nv].values, 5)
            data.loc[data[nv] < dlimit, nv] = dlimit
    
    # Логарифмирование количественных переменных       
    for nv in data[num_var]:
        data.loc[data[nv] < 1, nv] = 1
        data[nv] = np.log(data[nv])
    
    # Шкалирование и центрирование количественных переменных
    scaler = StandardScaler()
    data[num_var] = scaler.fit_transform(data[num_var])
    
    # Обработка выбросов для количественных переменных с областью значений [0, 1]
    for nv01 in num_var01:
        ulimit = np.percentile(data[nv01].values, 99)
        dlimit = np.percentile(data[nv01].values, 1)
        data.loc[data[nv01] > ulimit, nv01] = ulimit
        data.loc[data[nv01] < dlimit, nv01] = dlimit
    
    # Замена пропущенных значений для номинальных переменных
    data.loc[(data.cntr_lvl == -1) | (data.cntr_lvl == 0), 'cntr_lvl'] = data.cntr_lvl.value_counts().index[0]
    data.loc[(data.org_type == -1) | (data.org_type == 0), 'org_type'] = data.org_type.value_counts().index[0]
    data.loc[(data.org_form == -1) | (data.org_form == 0), 'org_form'] = data.org_form.value_counts().index[0]
    data.loc[(data.purch_type == -1) | (data.purch_type == 0), 'purch_type'] = data.purch_type.value_counts().index[0]
    
    # Группировка редки значений для номинальных переменных
    for cv in cat_var:
        cnt = data[cv].value_counts()
        for val, count in zip(cnt.index, cnt.values):
            if count / data.shape[0] <= 0.005:
                data.loc[data[cv] == val, cv] = 'NEW'    
    
    # WoE кодирование номинальных переменных          
    for cv in cat_var:
        cnt = data[cv].value_counts()
        for val, count in zip(cnt.index, cnt.values):
            good_with_val = data.loc[(data.cntr_result == 1) & (data[cv] == val)].shape[0]
            bad_with_val = data.loc[(data.cntr_result == 0) & (data[cv] == val)].shape[0]

            p = good_with_val / data.loc[data.cntr_result == 1].shape[0]
            q = bad_with_val / data.loc[data.cntr_result == 0].shape[0]
            data.loc[data[cv] == val, cv] = round(np.log(p / q), 3)
    
    sample = data[num_var  + cat_var + num_var01 + ['cntr_result']]
    return sample

In [6]:
%%time
data = pd.read_csv('../data/tula_yarobl_contracts.csv', converters={'okpd': str})
data = preprocess_initial_dataset(data)

CPU times: user 57.6 s, sys: 1.3 s, total: 58.9 s
Wall time: 1min 1s


In [7]:
data = data.sample(frac=1, random_state=RANDOM_SEED)
X = data.drop(['cntr_result'], axis=1).values
y = data.cntr_result.values

In [8]:
result = choose_best_classifier(X, y)

LogRer {'C': 0.001, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
train: (0.746, 0.746, 0.533) test: (0.746, 0.746, 0.533) valid: (0.747, 0.747, 0.531) - 1

LogRer {'C': 0.001, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.746, 0.746, 0.533) test: (0.746, 0.746, 0.533) valid: (0.747, 0.748, 0.531) - 2

LogRer {'C': 0.001, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
train: (0.756, 0.756, 0.500) test: (0.756, 0.756, 0.500) valid: (0.754, 0.754, 0.497) - 2

LogRer {'C': 0.001, 'penalty': 'l2', 'random_state': 42, 'solver': 'saga'}
train: (0.764, 0.764, 0.491) test: (0.764, 0.764, 0.492) valid: (0.761, 0.761, 0.489) - 2

LogRer {'C': 0.01, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
train: (0.830, 0.830, 0.392) test: (0.829, 0.829, 0.392) valid: (0.831, 0.831, 0.391) - 9

LogRer {'C': 0.01, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.841, 0.841, 0.369) test: (0.841, 0.841, 0.369) valid: (0.843, 0.842, 0.3



LogRer {'C': 1, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.853, 0.853, 0.346) test: (0.853, 0.853, 0.347) valid: (0.853, 0.853, 0.348) - 16

LogRer {'C': 1, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
train: (0.851, 0.851, 0.347) test: (0.851, 0.851, 0.348) valid: (0.852, 0.852, 0.348) - 6

LogRer {'C': 1, 'penalty': 'l2', 'random_state': 42, 'solver': 'saga'}
train: (0.852, 0.852, 0.346) test: (0.852, 0.852, 0.347) valid: (0.852, 0.852, 0.348) - 13

LogRer {'C': 10, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
train: (0.853, 0.853, 0.346) test: (0.853, 0.853, 0.347) valid: (0.853, 0.853, 0.348) - 45

LogRer {'C': 10, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga'}
train: (0.853, 0.853, 0.346) test: (0.853, 0.853, 0.347) valid: (0.853, 0.853, 0.348) - 15

LogRer {'C': 10, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
train: (0.853, 0.853, 0.346) test: (0.853, 0.853, 0.347) valid: (0.853, 0.853, 0.348) - 7

LogRer

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 800, 'random_state': 42, 'subsample': 0.7}
train: (0.924, 0.924, 0.188) test: (0.900, 0.900, 0.238) valid: (0.896, 0.897, 0.243) - 156

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 800, 'random_state': 42, 'subsample': 0.85}
train: (0.924, 0.924, 0.188) test: (0.900, 0.900, 0.238) valid: (0.899, 0.900, 0.242) - 158

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 800, 'random_state': 42, 'subsample': 1}
train: (0.921, 0.921, 0.194) test: (0.898, 0.898, 0.239) valid: (0.898, 0.898, 0.245) - 144

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42, 'subsample': 0.7}
train: (0.931, 0.931, 0.176) test: (0.903, 0.903, 0.235) valid: (0.899, 0.899, 0.240) - 209

XGBoost {'eta': 0.03, 'logging_level': 'Silent', 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42, 'subsample': 0.85}
train:

In [12]:
# Экспорт результатов
result.to_csv(path_or_buf='parameter_tuning.csv')