In [2]:
import pandas
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

df: pandas.DataFrame = pandas.read_pickle('/mnt/hit4/hit4user/PycharmProjects/mysql_connector/c')
X = df.drop(['system'], 1)
X = X.drop(['under'], 1)
Y = df['system']
print("X строк, столбцов", X.shape)
print("Y", Y.shape)
p = len(Y[Y == 1])
n = len(Y[Y == 0])
print("Y OK", p)
print("Y FAIL", n)
print("FAIL/OK", n // p)
print("Nan exist?", df.isnull().values.any())

kfold = StratifiedKFold(n_splits=5)

X строк, столбцов (11036, 210)
Y (11036,)
Y OK 4193
Y FAIL 6843
FAIL/OK 1
Nan exist? False


# Линеная модель(СПР)

In [2]:
# SCALE
scaler = StandardScaler().fit(X, Y)
X_s = scaler.transform(X)

# CROSS-VAL
# 1 Linear Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter= 340)

results = cross_val_score(model, X_s, Y, cv=kfold)
print("Accuracy: %f" % results.mean())

Accuracy: 0.846235


# Дерево принятия решений с автоматическим подбором параметров(СПР)

In [7]:
from sklearn.tree import DecisionTreeClassifier
params = {'criterion':['gini','entropy'], 'max_leaf_nodes': list(range(4, 20)),
          'min_samples_split': [2, 3, 4], 'max_depth': list(range(3, 14))}
clf = GridSearchCV(DecisionTreeClassifier(), params, cv=kfold)
results = clf.fit(X,Y)
print(results.best_estimator_)
results = cross_val_score(results.best_estimator_, X, Y, cv=kfold)
print("Accuracy: %f" % results.mean())

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=10, max_features=None, max_leaf_nodes=12,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
Accuracy: 0.847857


# Случайный лес

## 1. СПР

In [8]:
from sklearn.ensemble import RandomForestClassifier
params = {'n_estimators': [5, 50], 'min_samples_split': [2, 3, 4],
          'max_leaf_nodes': list(range(10, 20)), 'max_depth': list(range(3, 13))}
clf = GridSearchCV(RandomForestClassifier(), params, cv=kfold)
results = clf.fit(X_s, Y)
print(results.best_estimator_)
results = cross_val_score(results.best_estimator_, X_s, Y, cv=kfold)
print("Accuracy: %f" % results.mean())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=17, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.844209


## 2. Андерайтор

In [4]:
from sklearn.ensemble import RandomForestClassifier
X = df.drop(['system'], 1)
X = X.drop(['under'], 1)
Y = df['under']
params = {'n_estimators': [5, 50], 'min_samples_split': [2, 3, 4],
          'max_leaf_nodes': list(range(10, 20)), 'max_depth': list(range(3, 13))}
clf = GridSearchCV(RandomForestClassifier(), params, cv=kfold)
results = clf.fit(X_s, Y)
print(results.best_estimator_)
results = cross_val_score(results.best_estimator_, X_s, Y, cv=kfold)
print("Accuracy: %f" % results.mean())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=17, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.990213


# K ближайших соседей(СПР)

In [17]:
# SCALE
X = df.drop(['system'], 1)
X = X.drop(['under'], 1)
Y = df['system']
X_s = scale(X)

from sklearn.neighbors import KNeighborsClassifier
params = {'n_neighbors':list(range(6, 11)), 'leaf_size':list(range(5, 40, 5))}
clf = GridSearchCV(KNeighborsClassifier(), params, cv=kfold)
results = clf.fit(X_s, Y)
print(results.best_estimator_)
results = cross_val_score(results.best_estimator_, X_s, Y, cv=kfold)
print("Accuracy: %f" % results.mean())

KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')
Accuracy: 0.783546


# XGBoost ручной подбор параметров (СПР)

In [3]:
import xgboost as xgb
import numpy as np

dtrain = xgb.DMatrix(X, Y)
param = {'booster': 'gbtree', 'objective': 'binary:logistic', 'scale_pos_weight': 1.6, 'max_depth': 3, 'eta': 0.1,
         'gamma': 1}
num_round = 9
res = xgb.cv(param, dtrain, num_round, metrics=['error', 'auc'], nfold=5)
print("cross-train accuracy train\t", 1 - np.mean(res['train-error-mean']))
print("cross-train gini\t", np.mean(res['train-auc-mean'] * 2 - 1))
print("cross-test accuracy test\t", 1 - np.mean(res['test-error-mean']))
print("cross-test gini\t\t", np.mean(res['test-auc-mean'] * 2 - 1))

cross-train accuracy train	 0.8484882444444445
cross-train gini	 0.7490440888888888
cross-test accuracy test	 0.8479419333333333
cross-test gini		 0.7439737777777776


Как можно видеть, все модели показывают меньший результат по сравнению с XGBoost

# XGBoost Автоматический подбор параметров

## 1. Решение СПР

In [7]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
X = df.drop(['system'], 1)
X = X.drop(['under'], 1)
Y = df['system']
params = {'objective': ['binary:logistic'], 'n_estimators': list(range(3, 20)), 'scale_pos_weight': [32],
          'learning_rate': list(np.arange(0, 1, 0.1)), 'max_depth': list(range(2, 21)), 'gamma': [0, 0.5, 1, 1.2, 1.5, 2]}
clf = RandomizedSearchCV(XGBClassifier(), params, cv=kfold, n_iter=50) # pre_dispatch = 1,
results = clf.fit(X, Y)
print(results.best_estimator_)
results = cross_val_score(results.best_estimator_, X, Y, cv=kfold)
print("Accuracy: %f" % results.mean())

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.9, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=9, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=32, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)
Accuracy: 0.782625


## 2. Решение андерайтора:

In [6]:
Y = df['under']
params = {'objective': ['binary:logistic'], 'n_estimators': list(range(3, 20)), 'scale_pos_weight': [45],
          'learning_rate': list(np.arange(0, 1, 0.1)), 'max_depth': list(range(2, 21)), 'gamma': [0, 0.5, 1, 1.2, 1.5, 2]}
kfold = StratifiedKFold(n_splits=5)
clf = RandomizedSearchCV(XGBClassifier(), params, cv=kfold, n_iter=30)
results = clf.fit(X, Y)
print(results.best_estimator_)
results = cross_val_score(results.best_estimator_, X, Y, cv=kfold)
print("Accuracy: %f" % results.mean())

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.7000000000000001, max_delta_step=0, max_depth=18,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=13, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=45, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)
Accuracy: 0.985411
