# Доверительные интервалы для долей правильных ответов на кросс-валидации для Xgboost и одного дерева решений

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
%pylab inline
import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

Populating the interactive namespace from numpy and matplotlib


## Загрузка данных

In [2]:
data = pd.read_csv('../../data/credit_scoring_train.csv')

In [3]:
data.head()

Unnamed: 0,client_id,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit,Delinquent90
0,0,0.496289,49.1,13,0,0.0,2,0,5298.360639,0.387028,0
1,1,0.433567,48.0,9,2,2.0,1,0,6008.056256,0.234679,0
2,2,2206.731199,55.5,21,1,,1,0,,0.348227,0
3,3,886.132793,55.3,3,0,0.0,0,0,,0.97193,0
4,4,0.0,52.3,1,0,0.0,0,0,2504.613105,1.00435,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 11 columns):
client_id                75000 non-null int64
DIR                      75000 non-null float64
Age                      75000 non-null float64
NumLoans                 75000 non-null int64
NumRealEstateLoans       75000 non-null int64
NumDependents            73084 non-null float64
Num30-59Delinquencies    75000 non-null int64
Num60-89Delinquencies    75000 non-null int64
Income                   60153 non-null float64
BalanceToCreditLimit     75000 non-null float64
Delinquent90             75000 non-null int64
dtypes: float64(5), int64(6)
memory usage: 6.3 MB


In [5]:
data['Income'].fillna(data['Income'].median(), inplace=True)
data['NumDependents'].fillna(data['NumDependents'].median(), inplace=True)

In [6]:
X = data.drop('Delinquent90', axis=1)
y = data['Delinquent90']

**Прогнозы при кросс-валидации**

In [10]:
%%time
xgb_cv_pred = cross_val_predict(XGBClassifier(), 
                                 X, y,  
                                 cv=20)

CPU times: user 2min 3s, sys: 3.62 s, total: 2min 6s
Wall time: 52 s


In [16]:
%%time
tree_cv_pred = cross_val_predict(DecisionTreeClassifier(random_state=7), 
                                 X, y,  
                                 cv=20)

CPU times: user 20.8 s, sys: 231 ms, total: 21 s
Wall time: 21.1 s


In [17]:
xgb_is_right = (xgb_cv_pred == y).astype('int')
tree_is_right = (tree_cv_pred == y).astype('int')

In [18]:
xgb_is_right.mean(), tree_is_right.mean()

(0.9012533333333334, 0.8439733333333334)

## Интервальные оценки долей

$$\frac1{ 1 + \frac{z^2}{n} } \left( \hat{p} + \frac{z^2}{2n} \pm z \sqrt{ \frac{ \hat{p}\left(1-\hat{p}\right)}{n} + \frac{z^2}{4n^2} } \right), \;\; z \equiv z_{1-\frac{\alpha}{2}}$$ 

In [19]:
conf_interval_xgboost = proportion_confint(sum(xgb_is_right), 
                                            xgb_is_right.shape[0],
                                            method = 'wilson')
conf_interval_tree = proportion_confint(sum(tree_is_right), 
                                            tree_is_right.shape[0],
                                            method = 'wilson')

In [20]:
print('interval for Xgboost [%f, %f]' % conf_interval_xgboost)
print('interval for Tree [%f, %f]' % conf_interval_tree)

interval for Xgboost [0.899098, 0.903368]
interval for Tree [0.841359, 0.846553]


### Как их сравнить?

## Доверительный интервал для разности долей (связанные выборки)

  $X_1$ \ $X_2$ | 1| 0 | $\sum$
  ------------- | -------------|
  1  | e | f | e + f
  0  | g | h | g + h
  $\sum$ | e + g| f + h | n  
  
$$ \hat{p}_1 = \frac{e + f}{n}$$

$$ \hat{p}_2 = \frac{e + g}{n}$$

$$ \hat{p}_1 - \hat{p}_2 = \frac{f - g}{n}$$


$$\text{Доверительный интервал для }p_1 - p_2\colon \;\;  \frac{f - g}{n} \pm z_{1-\frac{\alpha}{2}}\sqrt{\frac{f + g}{n^2} - \frac{(f - g)^2}{n^3}}$$

In [22]:
def proportions_confint_diff_rel(sample1, sample2, alpha = 0.05):
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    sample = zip(sample1, sample2)
    n = min(len(sample1), len(sample2))
        
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    
    left_boundary = float(f - g) / n  - z * \
        np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    right_boundary = float(f - g) / n  + z * \
        np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (left_boundary, right_boundary)

In [23]:
print("confidence interval: [%f, %f]" 
      % proportions_confint_diff_rel(xgb_is_right, tree_is_right))

confidence interval: [0.075843, 0.079677]
