In [1]:
import time
import datetime

import pandas as pd
import numpy as np

import pickle

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score, log_loss, f1_score
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

random_seed = 42
version = 2

In [2]:
def preprocess_v2(data, use_old_scaler=False):
    num_var = ['org_cntr_num', 'sup_cntr_num', 'sup_cntr_avg_price', 'price', 'pmp']
    num_var01 = ['org_1s_sev', 'org_1s_sup_sev', 'sup_cntr_avg_penalty', 'sup_no_pnl_share', 'sup_okpd_exp', 'sup_1s_org_sev', 'sup_sim_price']
    cat_var = ['sup_type', 'sup_status', 'org_form', 'org_type', 'okpd', 'type_prod', 'cntr_lvl']
    cat_bin_var = ['price_higher_pmp', 'price_too_low']

    # Удаление строк с датой окончания контракта равной -1
    data.drop(data[data.exec_date == -1].index, inplace=True)
    # Исправление ошибки на уровне сбора данных
    data.loc[data['sup_okpd_exp'] > 1, 'sup_okpd_exp'] = 1
  
    for nv in num_var:
        data.loc[data[nv]<=1, nv] = 1
        data[nv] = np.log(data[nv])
    
    if use_old_scaler:
        # Нормализация
        scaler = load_scaler()
        data[num_var] = scaler.transform(data[num_var])
    else:
        scaler = StandardScaler()
        data[num_var] = scaler.fit_transform(data[num_var])
        save_scaler(scaler)
       
    # Удаление выбросов
    for nv in num_var:
        data['{}_rare'.format(nv)] = (np.abs(data[nv]) > 3) * 1
        data.loc[np.abs(data[nv]) > 3, nv] = 3
    
    # Исправление значений
    data.loc[data.sup_type == 0, 'sup_type'] = 3
    data.loc[(data.org_form == 45) | (data.org_form == 0), 'org_form'] = 13
    data.loc[(data.org_type == 1) | (data.org_type == -1), 'org_type'] = 12
    data.loc[data.cntr_lvl == -1, 'cntr_lvl'] = 3
  
    data['okpd_class'] = data['okpd'].apply(lambda a: pd.Series(a[:2]))
  
    cntr_start = data['sign_date'].apply(lambda a: pd.Series(datetime.datetime.strptime(str(a), "%Y%m%d").date()))
    cntr_end = data['exec_date'].apply(lambda a: pd.Series(datetime.datetime.strptime(str(a), "%Y%m%d").date()))
    data['cntr_length'] = (cntr_end - cntr_start)[0].apply(lambda a: pd.Series(int(str(a).split()[0])))

    # Удаление столбца
    nv01_columns_to_drop = set(num_var01) - set(['sup_okpd_exp', 'sup_sim_price'])
    columns_to_drop = list(nv01_columns_to_drop) + cat_bin_var + ['Unnamed: 0', 'sup_status', 'sign_date', 'exec_date', 'okpd']
  
    data.drop(columns_to_drop, inplace=True, axis=1)
    data.drop(data[data.cntr_length == 0].index, inplace=True)
    
    return data, scaler

In [3]:
def load_scaler(version=version):
    with open('model/skaler{}.pkl'.format(version), 'rb') as file:
        return pickle.load(file)

### Считывание данных

In [4]:
finished = pd.read_csv('../data/2/tula_yarobl_grbs_finished.csv', converters={'okpd': str})
print(finished.shape)

(24912, 27)


In [5]:
unfinished = pd.read_csv('../data/2/tula_yarobl_grbs_unfinished.csv', converters={'okpd': str})
print(unfinished.shape)

(172, 27)


In [6]:
# Предобработка данных
prep_finished, scaler = preprocess_v2(finished.copy())
prep_unfinished, _ = preprocess_v2(unfinished.copy(), use_old_scaler=True)
prep_real_unfinished, _ = preprocess_v2(unfinished.loc[unfinished.exec_date > 20180400].copy(), use_old_scaler=True)

# X, y для обучающей выборки 
Xf = prep_finished.drop(['cntr_result'], axis=1).values
yf = prep_finished.cntr_result.values

# X, y для выборки, на которой необходимо сделать предсказания 
Xunf = prep_unfinished.drop(['cntr_result'], axis=1).values
yunf = prep_unfinished.cntr_result.values

# X, y для выборки, на которой необходимо сделать предсказания, и контракты в которой на самом деле еще не завершены
X_r_unf = prep_real_unfinished.drop(['cntr_result'], axis=1).values
y_r_unf = prep_real_unfinished.cntr_result.values

### Применение различных приемов over/under sampling

In [7]:
%%time
class DummySampler(object):
    def sample(self, X, y):
        return X, y

    def fit(self, X, y):
        return self

    def fit_sample(self, X, y):
        return self.sample(X, y)

classifier = [
    ['GBC', GradientBoostingClassifier()],
    ['XGB', XGBClassifier()],
]

samplers = [
    ['Standard', DummySampler()],
    ['RUS', RandomUnderSampler(random_state=random_seed)],
    ['ADASYN', ADASYN(random_state=random_seed)],
    ['ROS', RandomOverSampler(random_state=random_seed)],
    ['SMOTE', SMOTE(random_state=random_seed)],
]

pipelines = [
    ['{}-{}'.format(sampler[0], classifier[0][0]),
     make_pipeline(sampler[1], classifier[0][1])]
    for sampler in samplers
] + [
    ['{}-{}'.format(sampler[0], classifier[1][0]),
     make_pipeline(sampler[1], classifier[1][1])]
    for sampler in samplers
]

result = {
    'name': [],
    'train_acc': [],
    'train_auc': [],
    'test_acc': [],
    'test_auc': [],
    'test_log_loss': [],
    'test_f1': [],
    'test_f1_w': []
}

preds = {}

KFOLD = 5

for name, pipeline in pipelines:
    print('Обучение и тестирование {}'.format(name))
    train_acc, train_auc, test_acc, test_auc, test_log_loss, test_f1, test_f1_w = [], [], [], [], [], [], []
    
    for idx, (train, test) in enumerate(StratifiedKFold(n_splits=KFOLD).split(Xf, yf)):
        pipeline.fit(Xf[train], yf[train])
        
        pred_train = pipeline.predict(Xf[train])
        pred_test = pipeline.predict(Xf[test])
        pred_test_proba = pipeline.predict_proba(Xf[test])
        
        train_acc.append(accuracy_score(yf[train], pred_train))
        test_acc.append(accuracy_score(yf[test], pred_test))
        test_log_loss.append(log_loss(yf[test], pred_test_proba))
        test_f1.append(f1_score(yf[test], pred_test))
        test_f1_w.append(f1_score(yf[test], pred_test, average='weighted'))
        
        fpr, tpr, _ = roc_curve(yf[train], pred_train)
        train_auc.append(auc(fpr, tpr))
        
        fpr, tpr, _ = roc_curve(yf[test], pred_test)
        test_auc.append(auc(fpr, tpr))
        
        preds[name + str(idx)] = pipeline.predict(X_r_unf)
        
    result['name'].append(name)
    result['train_acc'].append(np.mean(train_acc))
    result['train_auc'].append(np.mean(train_auc))
    result['test_acc'].append(np.mean(test_acc))
    result['test_auc'].append(np.mean(test_auc))
    result['test_log_loss'].append(np.mean(test_log_loss))
    result['test_f1'].append(np.mean(test_f1))
    result['test_f1_w'].append(np.mean(test_f1_w))

Обучение и тестирование Standard-GBC
Обучение и тестирование RUS-GBC
Обучение и тестирование ADASYN-GBC
Обучение и тестирование ROS-GBC


  'precision', 'predicted', average, warn_for)


Обучение и тестирование SMOTE-GBC
Обучение и тестирование Standard-XGB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Обучение и тестирование RUS-XGB
Обучение и тестирование ADASYN-XGB
Обучение и тестирование ROS-XGB


  'precision', 'predicted', average, warn_for)


Обучение и тестирование SMOTE-XGB
CPU times: user 2min 3s, sys: 857 ms, total: 2min 3s
Wall time: 2min 4s


In [8]:
pd.DataFrame(result)

Unnamed: 0,name,test_acc,test_auc,test_f1,test_f1_w,test_log_loss,train_acc,train_auc
0,Standard-GBC,0.876063,0.523114,0.91948,0.900521,0.454546,0.984955,0.676481
1,RUS-GBC,0.642846,0.611742,0.759688,0.74368,0.783325,0.738998,0.820352
2,ADASYN-GBC,0.875538,0.521147,0.926872,0.906316,0.305596,0.934307,0.593676
3,ROS-GBC,0.728856,0.61077,0.827418,0.809843,0.65143,0.843783,0.85361
4,SMOTE-GBC,0.834947,0.598003,0.895045,0.876541,0.417952,0.943268,0.710539
5,Standard-XGB,0.874012,0.498292,0.918516,0.899215,0.383161,0.983345,0.640637
6,RUS-XGB,0.667466,0.625192,0.782487,0.766079,0.790402,0.735378,0.812344
7,ADASYN-XGB,0.878716,0.520227,0.929454,0.908718,0.300057,0.930475,0.604237
8,ROS-XGB,0.747805,0.614526,0.842263,0.824435,0.736358,0.833937,0.843901
9,SMOTE-XGB,0.8442,0.557743,0.901832,0.882684,0.460691,0.941055,0.708558


In [9]:
pd.DataFrame(preds)

Unnamed: 0,ADASYN-GBC0,ADASYN-GBC1,ADASYN-GBC2,ADASYN-GBC3,ADASYN-GBC4,ADASYN-XGB0,ADASYN-XGB1,ADASYN-XGB2,ADASYN-XGB3,ADASYN-XGB4,...,Standard-GBC0,Standard-GBC1,Standard-GBC2,Standard-GBC3,Standard-GBC4,Standard-XGB0,Standard-XGB1,Standard-XGB2,Standard-XGB3,Standard-XGB4
0,1,1,1,1,1,1,1,1,1,1,...,0,1,0,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,0,0,0,1,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,0,0,1,1,1,1,1,1,1,1


In [10]:
pd.DataFrame(preds).sum(axis=1)

0    48
1    50
2    12
3    32
4    38
dtype: int64

In [11]:
unfinished.loc[unfinished.exec_date > 20180400]

Unnamed: 0.1,Unnamed: 0,cntrID,org_cntr_num,org_1s_sev,org_1s_sup_sev,sup_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty,sup_no_pnl_share,sup_okpd_exp,...,price_higher_pmp,price_too_low,price,pmp,okpd,type_prod,cntr_lvl,sign_date,exec_date,cntr_result
71,103,1594833,613,0.0,0,1,114303.0,0,1.0,0.0,...,0,0,2187961.0,2205737.67,265112190,19,1,20170505,20190525,1
127,184,1602932,125,0.0,0,1,41467.0,0,1.0,1.0,...,0,0,31282.29,31500.0,651221000,14,1,20170713,20181115,1
168,31,847944,553,0.00181,0,40,235168.0,0,1.0,0.575,...,0,0,82060.68,82060.68,531011000,28,1,20171121,20180720,1
170,33,863761,140,0.0,0,3,193914.0,0,1.0,0.33333,...,0,0,443916.0,448848.0,432212120,4,1,20171229,20181231,1
171,34,865402,140,0.0,0,5,113913.0,0,1.0,0.0,...,0,0,269000.0,299232.0,331218000,28,1,20171229,20181231,1


**Вывод**. 2ой контракт (cntrID = 847944) алгоритмы оценивают как самый рискованный из всех присутствующих.