In [411]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import sklearn
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import os

def read_dfs(files: list) -> pd.DataFrame:
    df_res = None
    for x in files:
        df: pd.DataFrame = pd.read_pickle(x)
        if df_res is None:
            df_res = df
        else:
            df_res = pd.concat([df_res, df], axis=0, sort=False, ignore_index=True)
    return df_res


mypath = '/mnt/hit4/hit4user/PycharmProjects/mysql_connector'
X_files = sorted([os.path.join(mypath, f) for f in os.listdir(mypath) if 'final_features_X' in f])
Y_files = sorted([os.path.join(mypath, f) for f in os.listdir(mypath) if 'final_features_Y' in f])
X = read_dfs(X_files)
Y = read_dfs(Y_files)
print(X.shape)
for c in X:
    if 'deal_created_date' in c:
        X.drop(c, axis=1, inplace=True)
X = pd.get_dummies(X, dummy_na=True)  # categorical
X.fillna(0, inplace=True)  # numerical if NaN > 50%
print(X.shape)
print(Y.shape)
Y['under'].replace(2,0, inplace=True)

(11020, 200)
(11020, 309)
(11020, 2)


## Статистика

In [412]:
print("X строк, столбцов", X.shape)
print("Y", Y.shape)
print("system:")
p = len(Y['system'][Y['system'] == 1])
n = len(Y['system'][Y['system'] == 0])
print("Ys OK", p)
print("Ys FAIL", n)
print("Ys FAIL/OK", "%.2f" % (n / p))
print()
print("under:")
p = len(Y['under'][Y['under'] == 1])
n = len(Y['under'][Y['under'] == 0])
print("Yu OK", p)
print("Yu FAIL", n)
print("Yu FAIL/OK", "%.2f" % (n / p))
print()
print("Nan exist?", X.isna().values.any(), X.isnull().values.any())

X строк, столбцов (11020, 309)
Y (11020, 2)
system:
Ys OK 4077
Ys FAIL 6943
Ys FAIL/OK 1.70

under:
Yu OK 435
Yu FAIL 10585
Yu FAIL/OK 24.33

Nan exist? False False


# XGBoost ручной подбор параметров (СПР)

### СПР

In [425]:
dtrain = xgb.DMatrix(X, Y['system'])
param = {'booster': 'gbtree', 'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 
         'scale_pos_weight': 1.6, 'max_depth': 3, 'eta': 0.2,
         'gamma': 1}
num_round = 40
res = xgb.cv(param, dtrain, num_round, metrics=['error', 'auc'], nfold=5)
print("cross-train accuracy train\t", 1 - np.mean(res['train-error-mean']))
print("cross-train gini\t\t", np.mean(res['train-auc-mean'] * 2 - 1))
print("cross-test accuracy test\t", 1 - np.mean(res['test-error-mean']))
print("cross-test gini\t\t\t", np.mean(res['test-auc-mean'] * 2 - 1))

cross-train accuracy train	 0.855831955
cross-train gini		 0.86441815
cross-test accuracy test	 0.853890705
cross-test gini			 0.85551821


### Андерайтор

In [453]:
dtrain = xgb.DMatrix(X, Y['under'])
param = {'booster': 'gbtree', 'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 
         'scale_pos_weight': 8, 'max_depth': 18, 'eta': 1,
         'gamma': 0.7}
num_round = 60
res = xgb.cv(param, dtrain, num_round, metrics=['error', 'auc'], nfold=6)
print("cross-train accuracy train\t", 1 - np.mean(res['train-error-mean']))
print("cross-train gini\t\t", np.mean(res['train-auc-mean'] * 2 - 1))
print("cross-test accuracy test\t", 1 - np.mean(res['test-error-mean']))
print("cross-test gini\t\t\t", np.mean(res['test-auc-mean'] * 2 - 1))

cross-train accuracy train	 0.9996696361111111
cross-train gini		 0.9999120388888888
cross-test accuracy test	 0.9791594194444444
cross-test gini			 0.979995538888889


## Модель предсказания решения СПР с минимальным числом ложных отклонений заявки (обученная на решениях андерайтора)

In [392]:
# Кросс-валидация для уменьшения FalseNegative
res1 = []
res2 = []
res3 = []
res4 = []
acc1 = []
gini1 = []

res21 = []
res22 = []
res23 = []
res24 = []

acc2 = []
gini2 = []
def run(param, num_round):
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        Y_train, Y_test = Y.iloc[train_index, :], Y.iloc[test_index, :]

        # Обучаем на фолде отклоненных андерайтером
        dtrain = xgb.DMatrix(X_train, Y_train['under']) # under
        bst: Booster = xgb.train(param, dtrain, num_round)

        # Тестируем на отклоненных системой
        dtest = xgb.DMatrix(X_test, Y_test['system']) # system
        ypred2: np.array = bst.predict(dtest)

        cn = []
        cp = []
        for i, x in enumerate(Y_test['system']):
            if x == 0:
                cn.append(ypred2[i])
            if x == 1:
                cp.append(ypred2[i])
        res21.append((np.round(cn) == 0).mean())
        res22.append((np.round(cn) == 1).mean())
        res23.append((np.round(cp) == 1).mean())
        res24.append((np.round(cp) == 0).mean())
        acc1.append((np.round(ypred2) == Y_test['system']).mean())
        auc = sklearn.metrics.roc_auc_score(Y_test['system'], ypred2)
        gini1.append(2 * auc - 1)


        # тестируем на отклоненных андерайтором
        dtest = xgb.DMatrix(X_test, Y_test['under'])
        ypred2: np.array = bst.predict(dtest)

        cn = []
        cp = []
        for i, x in enumerate(Y_test['under']):
            if x == 0:
                cn.append(ypred2[i])
            if x == 1:
                cp.append(ypred2[i])
        res1.append((np.round(cn) == 0).mean())
        res2.append((np.round(cn) == 1).mean())
        res3.append((np.round(cp) == 1).mean())
        res4.append((np.round(cp) == 0).mean())
        acc2.append((np.round(ypred2) == Y_test['under']).mean())
        auc = sklearn.metrics.roc_auc_score(Y_test['under'], ypred2)
        gini2.append(2 * auc - 1)

    print("Результаты кросс-валидации тестирования на отклоненных системой")
    print("Точность:", np.array(acc1).mean())
    print("Коэффициент gini:", np.array(gini1).mean())
    print("TrueNegative/Negative для 0:\t%f" % np.array(res21).mean())
    print("FalsePositive/Negative для 0:\t%f" % np.array(res22).mean())
    print("TruePositive/Positive для 1:\t%f" % np.array(res23).mean())
    print("FalseNegative/Positive для 1:\t%f" % np.array(res24).mean(), "\n")

    print("Результаты кросс-валидации тестирования на отклоненных андерайтором")
    print("Точность:", np.array(acc2).mean())
    print("Коэффициент gini:", np.array(gini2).mean())
    print("TrueNegative/Negative для 0:\t%f" % np.array(res1).mean())
    print("FalsePositive/Negative для 0:\t%f" % np.array(res2).mean())
    print("TruePositive/Positive для 1:\t%f" % np.array(res3).mean())
    print("* FalseNegative/Positive для 1:\t%f" % np.array(res4).mean())

In [393]:
param = {'booster': 'gbtree', 'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 
         'scale_pos_weight': 90, 'max_depth': 3, 'eta': 0.1,
         'gamma': 0.3}
num_round = 3
# print("KFold\n")
# skf = KFold(n_splits=5)
# run()
print("\nStratifiedShuffleSplit\n")
skf = StratifiedShuffleSplit(n_splits=5)
run(param, num_round)


StratifiedShuffleSplit

Результаты кросс-валидации тестирования на отклоненных системой
Точность: 0.7635208711433756
Коэффициент gini: 0.582014606995536
TrueNegative/Negative для 0:	0.990202
FalsePositive/Negative для 0:	0.009798
TruePositive/Positive для 1:	0.377941
FalseNegative/Positive для 1:	0.622059 

Результаты кросс-валидации тестирования на отклоненных андерайтором
Точность: 0.8931034482758621
Коэффициент gini: 0.959890015466575
TrueNegative/Negative для 0:	0.889036
FalsePositive/Negative для 0:	0.110964
TruePositive/Positive для 1:	0.990909
* FalseNegative/Positive для 1:	0.009091


# Важность параметров (СПР)

### 1) Подбор парамтеров

In [454]:
# ручной подбор параметров
kfold = StratifiedKFold(n_splits=5)

max_depth = 12
n_estimators = 25
max_leaf_nodes = 14
min_samples_split = 2
res = []
ci = []
for i in range(7):
    rf = RandomForestClassifier(random_state=i, max_depth=max_depth, n_estimators=n_estimators,
                                max_leaf_nodes=max_leaf_nodes, min_samples_split=min_samples_split)
    results = cross_val_score(rf, X, Y['under'], cv=kfold)
    res.append(results.mean())
    ci.append(results.std() * 2)
print("Accuracy: %f (+/- %0.2f)" % (sum(res)/ len(res), sum(ci)/ len(ci)))

Accuracy: 0.978844 (+/- 0.03)


### 2)

In [409]:
importance_sum = np.zeros(X.shape[1], dtype=np.float)
n = 100
max_depth = np.linspace(7, 20, 100) # 12
n_estimators = np.linspace(5, 40, 100) # 25
max_leaf_nodes = np.linspace(8, 20, 100) # 14
min_samples_split = 2

for i in range(n):
    depth = int(round(max_depth[i]))
    n_est = int(round(n_estimators[i]))
    max_l = int(round(max_leaf_nodes[i]))
    
    model = RandomForestClassifier(random_state=i, max_depth=depth, 
                                   n_estimators=n_est, max_leaf_nodes=max_l, 
                                   min_samples_split=2)
    model.fit(X, Y['under'])
    # FEATURE IMPORTANCE
    importances = model.feature_importances_  # feature importance
    importance_sum += importances

indices = np.argsort(importance_sum)[::-1]  # sort indexes

# Print the feature ranking
print("Feature ranking:")
print(importance_sum.shape)

for f in range(X.shape[1])[:100]: # первые 100
    print("%d. %s (%f)" % (f + 1, X.columns[indices[f]], importance_sum[indices[f]] / 100))


Feature ranking:
(309,)
1. spec_cot_ids.COUNT(autocredit_document) (0.140956)
2. spec_cot_ids.NUM_UNIQUE(autocredit_document.code) (0.128709)
3. scoring.Проверка объема двигателя транспортного средства на допустимое значение_OK (0.082434)
4. scoring.Соответствие категории ТС условиям программы кредитования_OK (0.077294)
5. scoring.Соответствие категории ТС условиям программы кредитования_NoneMy (0.075912)
6. scoring.Проверка объема двигателя транспортного средства на допустимое значение_NoneMy (0.069963)
7. scoring.Проверка, является ли ТС в залоге, по данным ФНП_NoneMy (0.031887)
8. client_ids.SUM(c_p_budget.`sum_confirmed`) (0.031881)
9. client_ids.NUM_UNIQUE(c_p_budget.`kind`) (0.026422)
10. client_ids.NUM_UNIQUE(c_p_budget.`type`) (0.026252)
11. scoring.Проверка, является ли ТС в залоге, по данным ФНП_OK (0.023258)
12. client_ids.STD(c_p_budget.`sum_confirmed`) (0.020225)
13. client_ids.COUNT(c_p_budget) (0.019120)
14. a_car_info.issue_year (0.017871)
15. client_ids.MAX(c_p_budget.

In [410]:
for f in range(X.shape[1])[-100:]: # последние 100
    print("%d. %s (%f)" % (f + 1, X.columns[indices[f]], importance_sum[indices[f]] / 100))

210. partner_point.sale_point_city_Мытищи (0.000002)
211. deal_ids.STD(a_opti.cost) (0.000000)
212. c_p_w_info.c_p_w_info_contact.SKEW(c_p_w_info.experience) (0.000000)
213. c_p_w_info.c_p_w_info_contact.STD(c_p_w_info.experience) (0.000000)
214. deal_ids.SKEW(a_opti.cost) (0.000000)
215. partner_point.sale_point_city_nan (0.000000)
216. c_p_w_info.c_p_w_info_contact.MODE(c_p_w_info.job_type)_nan (0.000000)
217. a_car_info.`condition`_nan (0.000000)
218. scoring.Проверка, является ли ТС в залоге, по данным ФНП_nan (0.000000)
219. scoring.Соответствие размера суммы кредита условиям программы кредитования_OK (0.000000)
220. scoring.Соответствие размера суммы кредита условиям программы кредитования_NoneMy (0.000000)
221. scoring.Соответствие категории ТС условиям программы кредитования_nan (0.000000)
222. scoring.Реестр банкротств_nan (0.000000)
223. scoring.Реестр банкротств_NoneMy (0.000000)
224. scoring.Регион регистрации/фактического проживания клиента_nan (0.000000)
225. scoring.Реги