In [1]:
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack

plt.style.use(['dark_background'])

## 1. Загрузим данные и слегка предобработаем

In [4]:
orange_small_churn_data = pd.read_csv('orange_small_churn_data.train')
orange_small_churn_labels = pd.read_csv('orange_small_churn_labels.train', header=None, names=['target'])

In [5]:
df_all = pd.concat([orange_small_churn_data, orange_small_churn_labels], axis=1, sort=False)

In [6]:
# дропнем пустые признаки
df_all = df_all.dropna(axis=1, how='all')

In [7]:
# дропнем константные признаки
nuniq_val = df_all.nunique()
df_all = df_all.drop(nuniq_val[nuniq_val==1].index, axis=1)

In [8]:
num_col = df_all.drop('target', axis=1).dtypes[(df_all.dtypes==np.float64) | (df_all.dtypes==np.int64)].index
category_col = df_all.dtypes[df_all.dtypes==object].index

## 2. Выделим train и test и продолжим дальше предобрабатывать данные

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df_all.drop('target', axis=1), df_all.target, test_size=0.3, stratify=df_all.target, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((28000, 207), (12000, 207), (28000L,), (12000L,))

In [10]:
## заполним пропуски

# для численных - среднее по тренировочной выборке
X_train[num_col] = X_train[num_col].fillna(X_train[num_col].mean())
X_test[num_col] = X_test[num_col].fillna(X_train[num_col].mean())

# и сделаем скелинг
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train[num_col].values)
X_test_num_scaled = scaler.transform(X_test[num_col].values)

X_train_num_scaled.shape, X_test_num_scaled.shape

((28000L, 173L), (12000L, 173L))

In [11]:
# для категориальных используем OHE, обученный на тренировочном наборе

enc = OneHotEncoder(sparse=True, handle_unknown = 'ignore')
X_train_cat_ohe = enc.fit_transform(X_train[category_col].fillna('-'))
X_test_cat_ohe = enc.transform(X_test[category_col].fillna('-'))

X_train_cat_ohe.shape, X_test_cat_ohe.shape

((28000L, 51949L), (12000L, 51949L))

In [12]:
# соединим признаки в train и test

X_train_final = csr_matrix(hstack([X_train_num_scaled, X_train_cat_ohe]))
X_test_final = csr_matrix(hstack([X_test_num_scaled, X_test_cat_ohe]))

X_train_final.shape, X_test_final.shape

((28000L, 52122L), (12000L, 52122L))

## 4. Получим baselines 

In [13]:
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix,accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve, f1_score, recall_score, precision_score, log_loss

### 4.1 Линейная модель (ридж)

In [14]:
rc = RidgeClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [15]:
cv_score = cross_val_score(estimator=rc, X=X_train_final, y=y_train, cv=cv, n_jobs=-1, scoring='roc_auc')

In [16]:
print('roc_auc cv_score: ', cv_score)
print('roc_auc cv_score mean: ', cv_score.mean())

('roc_auc cv_score: ', array([0.67167655, 0.69559861, 0.66206205, 0.69479502, 0.67069719]))
('roc_auc cv_score mean: ', 0.6789658862062481)


In [17]:
rc.fit(X_train_final, y_train)
predict = rc.predict(X_test_final)
decision = rc.decision_function(X_test_final)
probab = np.exp(decision) / (1 + np.exp(decision))

In [18]:
print("accuracy: ", accuracy_score(y_test,predict))
print('logLoss:', log_loss(y_test, probab))
print('rocAuc:', roc_auc_score(y_test, probab))
print(classification_report(y_test, predict))

('accuracy: ', 0.9255)
('logLoss:', 0.4150983823300894)
('rocAuc:', 0.6848815920793269)
              precision    recall  f1-score   support

          -1       0.93      1.00      0.96     11107
           1       0.00      0.00      0.00       893

   micro avg       0.93      0.93      0.93     12000
   macro avg       0.46      0.50      0.48     12000
weighted avg       0.86      0.93      0.89     12000



### 4.2 Случайный лес

In [19]:
rfcl = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight={-1: 10, 1:1})

In [20]:
rfcl_cv_score = cross_val_score(estimator=rfcl, X=X_train_final, y=y_train, cv=cv, n_jobs=-1, scoring='roc_auc')

In [21]:
print('roc_auc cv_score: ', rfcl_cv_score)
print('roc_auc cv_score mean: ', rfcl_cv_score.mean())

('roc_auc cv_score: ', array([0.57554489, 0.56814757, 0.55019824, 0.5369171 , 0.56118468]))
('roc_auc cv_score mean: ', 0.5583984944124538)


In [22]:
rfcl.fit(X_train_final, y_train)
rfcl_predict = rfcl.predict(X_test_final)
rfcl_probabil = rfcl.predict_proba(X_test_final)



In [23]:
print("accuracy: ", accuracy_score(y_test,rfcl_predict))
print('logLoss:', log_loss(y_test, rfcl_probabil[:,1]))
print('rocAuc:', roc_auc_score(y_test, rfcl_probabil[:,1]))
print(classification_report(y_test, rfcl_predict))

('accuracy: ', 0.9244166666666667)
('logLoss:', 1.513462037555319)
('rocAuc:', 0.5801963915898602)
              precision    recall  f1-score   support

          -1       0.93      1.00      0.96     11107
           1       0.25      0.01      0.02       893

   micro avg       0.92      0.92      0.92     12000
   macro avg       0.59      0.50      0.49     12000
weighted avg       0.88      0.92      0.89     12000



### 4.3 Бустинг

In [24]:
gb_cl = GradientBoostingClassifier(random_state=42)

In [25]:
gbcl_cv_score = cross_val_score(estimator=gb_cl, X=X_train_final, y=y_train, cv=cv, n_jobs=-1, scoring='roc_auc')

In [26]:
print('roc_auc cv_score: ', gbcl_cv_score)
print('roc_auc cv_score mean: ', gbcl_cv_score.mean())

('roc_auc cv_score: ', array([0.72226015, 0.75584532, 0.70575544, 0.75321085, 0.72807459]))
('roc_auc cv_score mean: ', 0.7330292708841327)


In [27]:
gb_cl.fit(X_train_final, y_train)
gbcl_predict = gb_cl.predict(X_test_final)
gbcl_probabil = gb_cl.predict_proba(X_test_final)

In [28]:
print("accuracy: ", accuracy_score(y_test,gbcl_predict))
print('logLoss:', log_loss(y_test, gbcl_probabil[:,1]))
print('rocAuc:', roc_auc_score(y_test, gbcl_probabil[:,1]))
print(classification_report(y_test, gbcl_predict))

('accuracy: ', 0.9243333333333333)
('logLoss:', 0.24131486182832643)
('rocAuc:', 0.7328792280243354)
              precision    recall  f1-score   support

          -1       0.93      1.00      0.96     11107
           1       0.27      0.01      0.02       893

   micro avg       0.92      0.92      0.92     12000
   macro avg       0.60      0.50      0.49     12000
weighted avg       0.88      0.92      0.89     12000



    Среди 3 моделей бэйзлайнов -  самый большой rocAuc получился у градиентного бустинга (0.733) 
    Что касается других метрик (precision, recall, f1-score) их качество совсем низкое)  