In [74]:
#импорт всех необходимых библиотек
import pandas as pd
import numpy as np 
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [68]:
data = pd.read_csv('data/orange_small_churn_train_data.csv') # загрузка данных

na_cols = [t for t in data.columns if data[t].isnull().values.all()] # обработка пустых признаков
data = data.drop(na_cols,axis = 1)

# отделение целевой переменной от данных 
labels = data['labels'] 
data = data.drop('labels', axis=1)

# разбиение данных  на тренировочную и контрольную части
data, hold_data, labels, hold_labels = train_test_split(data, labels) 
original_data = data.copy()
original_hold_data = hold_data.copy()
original_labels = labels.copy()
original_hold_labels = hold_labels.copy()
data = data.reset_index().drop(['index', 'ID'],axis=1)
hold_data = hold_data.reset_index().drop(['index', 'ID'],axis=1)
labels = labels.reset_index().drop(['index'],axis=1)
labels = labels['labels']
hold_labels = hold_labels.reset_index().drop(['index'],axis=1)
hold_labels = hold_labels['labels']

# обработка числовых признаков, заполнение пропусков в данных средним значением 
num_features = data.columns[:174]
data[num_features] = data[num_features].fillna(data.mean())
hold_data[num_features] = hold_data[num_features].fillna(data.mean())

# обработка категориальных признаков, заполнение пропусков значением'NAN'
cat_features = data.columns[174:]
data[cat_features] = data[cat_features].fillna('NAN')
hold_data[cat_features] = hold_data[cat_features].fillna('NAN')

# приведение числовых признаков к 1 масштабу, One-Hot-Encoding для категориальных признаков
# для использования в логистической регрессии
ohe = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()
col_trans = ColumnTransformer([("scaling:", scaler, slice(0,174)),
                               ("OHE:", ohe, slice(174,-1))],
                               sparse_threshold=0.3, 
                               n_jobs=-1)
data_1 = col_trans.fit_transform(data)
hold_data_1 = col_trans.transform(hold_data)

In [69]:
# приведение признаков к 1 масштабу
for t in num_features:
    scaler = StandardScaler()
    data[t] = scaler.fit_transform(np.array(data[t]).reshape(-1,1))
    hold_data[t] = scaler.transform(np.array(hold_data[t]).reshape(-1,1))

# LabelEncoding для категориальных признаков 
# для использования в xgboost
for t in cat_features:
    scaler = LabelEncoder()
    data[t] = scaler.fit_transform(np.array(data[t]).reshape(-1,1))
    hold_data[t] = scaler.fit_transform(np.array(hold_data[t]).reshape(-1,1))

In [70]:
# метрики для оценки моделей
scoring = {'f1':'f1',
           'recall':'recall',
           'precision': 'precision'}

# метод кросс-валидации 3-разбиения, с учетом баланса классов и перемешиваннием
cv=StratifiedKFold(3,True)

In [71]:
# базовая модель градиентного бустинга
# используется gpu ускорение, задан вес для положительного класса 
xgboost_clf = XGBClassifier(tree_method='gpu_hist', scale_pos_weight = labels.value_counts()[-1]
                            /labels.value_counts()[1], n_jobs=-1)
#кросс-валидация моделей
xgb_cv = GridSearchCV(xgboost_clf, {'n_estimators' : [100, 250, 500, 1000, 2000],
                                    'learning_rate' : [0.1, 0.05, 0.02, 0.005]},
                      cv=cv,scoring=scoring, n_jobs=-1, refit='precision')
xgb_cv.fit(data, labels)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=12.286093888396811, seed=None, silent=True,
       subsample=1, tree_method='gpu_hist'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 250, 500, 1000, 2000], 'learning_rate': [0.1, 0.05, 0.02, 0.005]},
       pre_dispatch='2*n_jobs', refit='precision',
       return_train_score='warn',
       scoring={'f1': 'f1', 'recall': 'recall', 'precision': 'precision'},
       verbose=0)

In [72]:
# извлечение результатов
vals = xgb_cv.cv_results_.keys()
vals = [t for t in vals if t[:9]=='mean_test']
vals

['mean_test_f1', 'mean_test_recall', 'mean_test_precision']

In [75]:
# xgboost с лучшим f1-score на котрольной выборке 
print(xgb_cv.cv_results_['params'][np.argmax(xgb_cv.cv_results_[vals[0]])])
xgboost_clf = XGBClassifier(learning_rate = 0.05, n_estimators = 1000,
                            tree_method='gpu_hist', scale_pos_weight = labels.value_counts()[-1]
                            /labels.value_counts()[1], n_jobs=-1)
xgboost_clf.fit(data, labels)
print(classification_report(hold_labels, xgboost_clf.predict(hold_data)))
print(accuracy_score(hold_labels, xgboost_clf.predict(hold_data)))

{'learning_rate': 0.05, 'n_estimators': 2000}
              precision    recall  f1-score   support

          -1       0.95      0.85      0.90      9282
           1       0.19      0.44      0.26       718

   micro avg       0.82      0.82      0.82     10000
   macro avg       0.57      0.65      0.58     10000
weighted avg       0.90      0.82      0.85     10000

0.8234


In [76]:
# xgboost с наибольшей  полнотой на контрольной выборке 
print(xgb_cv.cv_results_['params'][np.argmax(xgb_cv.cv_results_[vals[1]])])
xgboost_clf = XGBClassifier(learning_rate = 0.005, n_estimators = 100,
                            tree_method='gpu_hist', scale_pos_weight = labels.value_counts()[-1]
                            /labels.value_counts()[1], n_jobs=-1)
xgboost_clf.fit(data, labels)
print(classification_report(hold_labels, xgboost_clf.predict(hold_data)))
print(accuracy_score(hold_labels, xgboost_clf.predict(hold_data)))

{'learning_rate': 0.005, 'n_estimators': 100}
              precision    recall  f1-score   support

          -1       0.97      0.55      0.70      9282
           1       0.12      0.77      0.20       718

   micro avg       0.57      0.57      0.57     10000
   macro avg       0.54      0.66      0.45     10000
weighted avg       0.91      0.57      0.67     10000

0.5668


In [87]:
# xgboost с наибольшей  точностью на контрольной выборке 
print(xgb_cv.cv_results_['params'][np.argmax(xgb_cv.cv_results_[vals[2]])])
xgboost_clf = XGBClassifier(learning_rate = 0.1, n_estimators = 2000,
                            tree_method='gpu_hist', scale_pos_weight = labels.value_counts()[-1]
                            /labels.value_counts()[1], n_jobs=-1)
xgboost_clf.fit(data, labels)
print(classification_report(hold_labels, xgboost_clf.predict(hold_data)))
print(accuracy_score(hold_labels, xgboost_clf.predict(hold_data)))

{'learning_rate': 0.1, 'n_estimators': 2000}
              precision    recall  f1-score   support

          -1       0.94      0.92      0.93      9282
           1       0.20      0.26      0.23       718

   micro avg       0.87      0.87      0.87     10000
   macro avg       0.57      0.59      0.58     10000
weighted avg       0.89      0.87      0.88     10000

0.8748


In [78]:
# базовая модель логистической регрессии задан вес для положительного класса 
LR = LogisticRegression(class_weight='balanced')
# кросс-валидация
lr_cv = GridSearchCV(LR, {'penalty' : ['l1','l2'],
                          'C' : [2.0, 1.0, 0.5, 0.1, 0.05]},
                     cv=cv,scoring=scoring, n_jobs=-1, refit='precision')
lr_cv.fit(data_1, labels)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [2.0, 1.0, 0.5, 0.1, 0.05]},
       pre_dispatch='2*n_jobs', refit='precision',
       return_train_score='warn',
       scoring={'f1': 'f1', 'recall': 'recall', 'precision': 'precision'},
       verbose=0)

In [79]:
# логистическая регрессия с лучшим f1-score на котрольной выборке 
print(lr_cv.cv_results_['params'][np.argmax(lr_cv.cv_results_[vals[0]])])
LR = LogisticRegression(class_weight='balanced', C=0.1, penalty='l1')
LR.fit(data_1, labels)
print(classification_report(hold_labels, LR.predict(hold_data_1)))
print(accuracy_score(hold_labels, LR.predict(hold_data_1)))

{'C': 0.05, 'penalty': 'l2'}
              precision    recall  f1-score   support

          -1       0.96      0.66      0.78      9282
           1       0.12      0.60      0.20       718

   micro avg       0.66      0.66      0.66     10000
   macro avg       0.54      0.63      0.49     10000
weighted avg       0.90      0.66      0.74     10000

0.6588


In [80]:
# логистическая регрессия с наибольшей  полнотой на контрольной выборке 
print(lr_cv.cv_results_['params'][np.argmax(lr_cv.cv_results_[vals[1]])])
LR = LogisticRegression(class_weight='balanced', C=0.05, penalty='l1')
LR.fit(data_1, labels)
print(classification_report(hold_labels, LR.predict(hold_data_1)))
print(accuracy_score(hold_labels, LR.predict(hold_data_1)))

{'C': 0.05, 'penalty': 'l1'}
              precision    recall  f1-score   support

          -1       0.96      0.64      0.77      9282
           1       0.12      0.63      0.20       718

   micro avg       0.64      0.64      0.64     10000
   macro avg       0.54      0.64      0.48     10000
weighted avg       0.90      0.64      0.73     10000

0.6407


In [83]:
# логистическая регрессия с наибольшей  точностью на контрольной выборке 
print(lr_cv.cv_results_['params'][np.argmax(lr_cv.cv_results_[vals[2]])])
LR = LogisticRegression(class_weight='balanced', C=1.0, penalty='l1')
LR.fit(data_1, labels)
print(classification_report(hold_labels, LR.predict(hold_data_1)))
print(accuracy_score(hold_labels, LR.predict(hold_data_1)))

{'C': 1.0, 'penalty': 'l1'}
              precision    recall  f1-score   support

          -1       0.94      0.89      0.91      9282
           1       0.15      0.25      0.19       718

   micro avg       0.84      0.84      0.84     10000
   macro avg       0.54      0.57      0.55     10000
weighted avg       0.88      0.84      0.86     10000

0.8416
