# этапы решения задачи на реальном примере

для курса "Машинное обучение и анализ данных" https://github.com/Dyakonov/MLDM/
    
2019, Александр Дьяконов https://dyakonov.org/ag/

In [2]:
import pandas as pd
import numpy as np
%pylab inline
# Notebook-wide display and plotting configuration.
# `plt` is already in scope here through the %pylab magic above; the explicit
# matplotlib imports further down only restate that dependency.
# NOTE(review): newer matplotlib releases appear to have renamed this style to
# 'seaborn-v0_8-dark' - confirm against the installed version.
plt.style.use('seaborn-dark')
import warnings
warnings.filterwarnings("ignore") # suppress all warnings in the notebook output
pd.set_option('display.max_columns', None) # do not hide columns when displaying DataFrames; same as pd.options.display.max_columns = None
# pd.set_option('display.max_rows', None) # optional: likewise do not hide rows
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.rc('font', size=14)

Populating the interactive namespace from numpy and matplotlib


# загрузили данные

In [3]:
# Load the training and test tables from CSV files in the local ./data_GMSC folder.
train = pd.read_csv('./data_GMSC/train.csv')
test = pd.read_csv('./data_GMSC/test.csv')

In [4]:
# data dimensions: (rows, columns) of the train and test frames
print(train.shape, test.shape)

(112500, 11) (37500, 10)


# посмотрели

In [5]:
# Show 5 randomly chosen training rows to get a first look at the columns.
train.sample(5)

Unnamed: 0,плохой_клиент,линии,возраст,поведение_30-59_дней,Debt_Ratio,доход,число_кредитов,поведение_90_дней,недвижимость,поведение_60-89_дней,семья
22126,0,0.357596,52,2,0.532058,4600.0,14,0,1,0,4.0
54387,0,0.056145,60,0,56.0,,4,0,0,0,0.0
815,0,0.447224,45,0,0.653607,9009.0,14,0,3,0,3.0
13043,0,0.09881,54,0,0.203736,19166.0,15,0,2,0,4.0
75469,0,0.683554,34,0,0.264168,5416.0,9,0,1,0,2.0


# особенности

Нам повезло: нет категориальных признаков - не надо думать о кодировках
    
Но есть пропуски: пока не будем думать о них (попробуйте придумать что-то умнее) - заменим на -1

In [6]:
# Separate the target from the features: pop() returns the column and
# removes it from `train` in place.
y = train.pop('плохой_клиент') # target vector
# Sanity check: the feature frame and the target must have the same number of rows.
train.shape, y.shape

((112500, 10), (112500,))

In [7]:
# Replace missing values with the sentinel -1 in both frames (modified in place).
train.fillna(-1, inplace=True)
test.fillna(-1, inplace=True)

у нас задача бинарной классификации:

In [8]:
# List the distinct target values to confirm this is a two-class problem.
np.unique(y)

array([0, 1])

перечислим подходящие алгоритмы для бинарной классификации (тут, кстати, не все алгоритмы):

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Candidate classifiers keyed by the display name that is printed next to
# their scores. Every estimator is built with its library-default
# hyperparameters; dict insertion order is the order they get evaluated in.
models = {'лог_регрессия': LogisticRegression(),
          'лин_svm': LinearSVC(),
          'SGD': SGDClassifier(),
          'knn': KNeighborsClassifier(),
          'RF': RandomForestClassifier(),
          'ETC': ExtraTreesClassifier(),
          'GBM': GradientBoostingClassifier()}     

поэкспериментируем со всеми алгоритмами (параметры по умолчанию)

In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# Shared splitter: 5 random train/validation splits, each holding out 10% of
# the rows; the fixed seed makes the splits repeatable.
cv = ShuffleSplit(n_splits=5, test_size=0.1, train_size=None, random_state=1)

# Score every candidate by ROC AUC on the same splits and report the mean and
# the spread of the per-split scores.
for model_name, model in models.items():
    cvs = cross_val_score(model, train, y, cv=cv, scoring='roc_auc')
    auc_mean = np.round(np.mean(cvs), 3)
    auc_std = np.round(np.std(cvs), 3)
    print(model_name, f"auc={auc_mean}", f"std={auc_std}")

лог_регрессия auc=0.697 std=0.011
лин_svm auc=0.565 std=0.029
SGD auc=0.537 std=0.036
knn auc=0.568 std=0.008
RF auc=0.777 std=0.007
ETC auc=0.778 std=0.01
GBM auc=0.866 std=0.002


некоторые алгоритмы долго обучаются

совет: поймите, какие именно и от чего это зависит!

пока самый лучший алгоритм - **градиентный бустинг**

здесь метрика качества - AUC ROC
https://dyakonov.org/2017/07/28/auc-roc-%D0%BF%D0%BB%D0%BE%D1%89%D0%B0%D0%B4%D1%8C-%D0%BF%D0%BE%D0%B4-%D0%BA%D1%80%D0%B8%D0%B2%D0%BE%D0%B9-%D0%BE%D1%88%D0%B8%D0%B1%D0%BE%D0%BA/

Метрик качества очень много! Вот некоторые из них:

In [14]:
# List the names of the scoring metrics that scikit-learn accepts as a
# `scoring=` string.
# `sklearn.metrics.SCORERS` was removed in scikit-learn 1.3; `get_scorer_names()`
# (available since 1.0) is its replacement. Prefer the new helper and fall back
# to the original expression on older installations so the cell runs on both.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = SCORERS.keys()
# Last expression of the cell, so the notebook displays it.
scorer_names

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

настроим параметры бустинга

In [28]:
from sklearn.model_selection import RandomizedSearchCV

# Search space: each sampled candidate takes one value per hyperparameter
# from the lists below.
params = {
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.5, 1.0],
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': [0.5, 0.75, 1.0],
}

model = GradientBoostingClassifier()

# Evaluate 10 random combinations by ROC AUC on the shared `cv` splitter;
# n_jobs=-1 lets the search run candidates in parallel.
rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=10,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
)

# Kept as the last expression so the notebook displays the fitted search object.
rs.fit(train, y)

RandomizedSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=0.1, train_size=None),
          error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'learning_rate': [0.05, 0.1, 0.2], 'subsample': [0.5, 1.0], 'max_depth': [1, 2, 3, 4, 5], 'max_features': [0.5, 0.75, 1.0]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [30]:
# Tabulate the search results: one row per sampled candidate with its timings,
# parameters, per-split scores and rank.
pd.DataFrame(rs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_max_features,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,6.836809,0.28908,0.018913,0.000148,1.0,0.75,4,0.05,"{'subsample': 1.0, 'max_features': 0.75, 'max_...",0.864399,0.868879,0.862285,0.851924,0.846587,0.858815,0.008265,6,0.868508,0.867889,0.8684,0.869092,0.869901,0.868758,0.000688
1,6.139652,1.158666,0.020198,0.007867,1.0,0.75,3,0.05,"{'subsample': 1.0, 'max_features': 0.75, 'max_...",0.863162,0.867577,0.860194,0.850598,0.845386,0.857383,0.008187,9,0.863949,0.863699,0.864472,0.865349,0.865811,0.864656,0.000808
2,7.081066,0.455116,0.019497,0.000924,0.5,0.5,4,0.05,"{'subsample': 0.5, 'max_features': 0.5, 'max_d...",0.864676,0.869023,0.862186,0.852309,0.846378,0.858914,0.008329,5,0.86717,0.866971,0.867352,0.869077,0.869399,0.867994,0.001028
3,4.471708,0.441073,0.01342,0.004068,0.5,0.75,2,0.2,"{'subsample': 0.5, 'max_features': 0.75, 'max_...",0.864668,0.867907,0.861987,0.850504,0.847244,0.858462,0.008116,7,0.864127,0.865011,0.865198,0.866355,0.866536,0.865445,0.000895
4,2.114495,0.046776,0.009247,0.000988,1.0,0.75,1,0.05,"{'subsample': 1.0, 'max_features': 0.75, 'max_...",0.852736,0.855334,0.847909,0.840206,0.836336,0.846504,0.007236,10,0.851555,0.850605,0.850773,0.852862,0.853601,0.851879,0.001173
5,5.227625,0.426865,0.015371,0.000238,0.5,0.75,3,0.05,"{'subsample': 0.5, 'max_features': 0.75, 'max_...",0.862699,0.868307,0.86085,0.849881,0.846355,0.857619,0.008213,8,0.863511,0.863461,0.864245,0.86477,0.866266,0.864451,0.00103
6,8.550311,0.636935,0.023514,0.00062,1.0,0.5,5,0.05,"{'subsample': 1.0, 'max_features': 0.5, 'max_d...",0.865147,0.869298,0.86258,0.854252,0.848193,0.859894,0.007642,4,0.872683,0.87304,0.873156,0.874111,0.874701,0.873538,0.000749
7,5.745884,0.232485,0.017884,0.000216,0.5,0.5,4,0.1,"{'subsample': 0.5, 'max_features': 0.5, 'max_d...",0.863934,0.869459,0.863171,0.854791,0.849844,0.86024,0.007,3,0.870433,0.870615,0.872036,0.872633,0.872147,0.871573,0.000882
8,9.247265,0.295792,0.021539,0.000234,1.0,0.75,5,0.1,"{'subsample': 1.0, 'max_features': 0.75, 'max_...",0.866498,0.869321,0.863294,0.854622,0.84915,0.860577,0.007549,2,0.879376,0.878538,0.879975,0.880051,0.880607,0.879709,0.000704
9,5.208328,0.149721,0.017666,0.000754,1.0,0.5,4,0.2,"{'subsample': 1.0, 'max_features': 0.5, 'max_d...",0.867451,0.870034,0.864594,0.852394,0.849555,0.860806,0.008258,1,0.876857,0.877002,0.877972,0.877981,0.87937,0.877836,0.0009


лучшие параметры

In [33]:
# Parameters of the candidate with the best mean validation score.
# Indexing cv_results_['params'] with [-1] returns the LAST sampled candidate,
# which is the best one only by coincidence; best_params_ is always the winner.
rs.best_params_

{'subsample': 1.0, 'max_features': 0.5, 'max_depth': 4, 'learning_rate': 0.2}

ожидаемое качество

In [36]:
# Mean cross-validated ROC AUC of the best candidate.
# Indexing cv_results_['mean_test_score'] with [-1] reads the LAST sampled
# candidate, which is the best one only by coincidence; best_score_ always
# belongs to the winner.
rs.best_score_

0.8608056678859954

Кстати, это хуже, чем с параметрами по умолчанию (там было auc=0.866) ;)

Мало экспериментов...

# советы по улучшению

раз лучшим оказался градиентный бустинг => смотрим его лучшие реализации

* xgboost https://en.wikipedia.org/wiki/XGBoost
* lightgbm https://github.com/Microsoft/LightGBM
* catboost https://tech.yandex.ru/catboost/