# Доверительные интервалы для оценки среднего - Xgboost  и Logit на данных по кредитному скорингу

In [1]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


**Считаем данные**

In [2]:
data = pd.read_csv('../../data/credit_scoring_train.csv')

In [3]:
data.head()

Unnamed: 0,client_id,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit,Delinquent90
0,0,0.496289,49.1,13,0,0.0,2,0,5298.360639,0.387028,0
1,1,0.433567,48.0,9,2,2.0,1,0,6008.056256,0.234679,0
2,2,2206.731199,55.5,21,1,,1,0,,0.348227,0
3,3,886.132793,55.3,3,0,0.0,0,0,,0.97193,0
4,4,0.0,52.3,1,0,0.0,0,0,2504.613105,1.00435,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 11 columns):
client_id                75000 non-null int64
DIR                      75000 non-null float64
Age                      75000 non-null float64
NumLoans                 75000 non-null int64
NumRealEstateLoans       75000 non-null int64
NumDependents            73084 non-null float64
Num30-59Delinquencies    75000 non-null int64
Num60-89Delinquencies    75000 non-null int64
Income                   60153 non-null float64
BalanceToCreditLimit     75000 non-null float64
Delinquent90             75000 non-null int64
dtypes: float64(5), int64(6)
memory usage: 6.3 MB


In [5]:
data['Income'].fillna(data['Income'].median(), inplace=True)
data['NumDependents'].fillna(data['NumDependents'].median(), inplace=True)

In [6]:
X = data.drop('Delinquent90', axis=1)
y = data['Delinquent90']

## Сравнение Xgboost и Logit

### Оценка среднего качеcтва на кросс-валидации

In [18]:
logit = LogisticRegression(n_jobs=-1, random_state=7)
logit_pipeline = Pipeline([('scaling', StandardScaler()), ('logit', logit)])

In [11]:
%%time
xgb_auc_scores = cross_val_score(XGBClassifier(), 
                                 X, y, scoring = 'roc_auc',  
                                 cv=20)

CPU times: user 2min, sys: 3.43 s, total: 2min 4s
Wall time: 51.2 s


In [20]:
%%time
logit_auc_scores = cross_val_score(logit_pipeline, 
                                 X, y, scoring = 'roc_auc',  
                                 cv = 20)

CPU times: user 13.2 s, sys: 506 ms, total: 13.7 s
Wall time: 7.95 s


### Точечная оценка среднего

In [22]:
print("Xgboost model auc: mean %.3f, std %.3f" % 
      (xgb_auc_scores.mean(), xgb_auc_scores.std()))
print("Logit model auc: mean %.3f, std %.3f" % 
      (logit_auc_scores.mean(), logit_auc_scores.std()))

Xgboost model auc: mean 0.812, std 0.043
Logit model auc: mean 0.689, std 0.020


### Интервальная оценка среднего 

In [23]:
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

In [24]:
xgb_cv_auc_mean = xgb_auc_scores.mean()
logit_cv_auc_mean = logit_auc_scores.mean()

#### t-интервал

Вместо гипотетической теоретической дисперсии $\sigma^2$, которую мы на самом деле в данном случае не знаем, используем выборочные дисперсии, и построим доверительные интервалы вида $$\bar{X}_n \pm t_{1-\frac{\alpha}{2}} \frac{S}{\sqrt{n}}$$

In [25]:
xgb_mean_std = xgb_auc_scores.std() / sqrt(len(xgb_auc_scores))
logit_mean_std = logit_auc_scores.std() / sqrt(len(logit_auc_scores))

In [26]:
print("Xgboost mean auc confidence interval", 
      _tconfint_generic(xgb_cv_auc_mean, xgb_mean_std,
                        len(xgb_auc_scores) - 1,
                        0.05, 'two-sided'))

print("Logit mean auc confidence interval", 
      _tconfint_generic(logit_cv_auc_mean, logit_mean_std,
                        len(logit_auc_scores) - 1,
                        0.05, 'two-sided'))

Xgboost mean auc confidence interval (0.79226568592972857, 0.83234261480763139)
Logit mean auc confidence interval (0.68000994343117882, 0.69873677688552926)
