# Доверительные интервалы для доли ушедших клиентов

In [1]:
import numpy as np
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the telecom churn CSV (given as a relative path) into a DataFrame.
telecom_data = pd.read_csv(filepath_or_buffer='../../data/telecom_churn.csv')

In [3]:
# Preview the first five rows; n=5 is also what head() uses by default.
telecom_data.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


**Посмотрим только на столбец Churn. Его среднее значение — это доля ушедших клиентов во всех данных:**

In [4]:
# Mean of the Churn column (displayed as True/False in the preview),
# i.e. the share of rows where Churn is True.
telecom_data.loc[:, 'Churn'].mean()

0.14491449144914492

**Возьмем случайную подвыборку из 1000 наблюдений и оценим долю ушедших клиентов на ее основе**

In [5]:
# Fix the global NumPy seed so the draw below is reproducible.
np.random.seed(seed=1)
# Draw 1000 Churn values. replace=True is the np.random.choice default;
# it is spelled out so that sampling with replacement is visible to the reader.
sample = np.random.choice(telecom_data['Churn'], size=1000, replace=True)

## Точечная оценка доли ушедших клиентов

In [6]:
# Point estimate: the share of True values in the drawn sample.
np.mean(sample)

0.152

## Доверительный интервал для доли

In [7]:
from statsmodels.stats.proportion import proportion_confint

### Доверительный интервал на основе нормального распределения

$$\hat{p}\pm z_{1-\frac{\alpha}{2}} \sqrt{\frac{\hat{p}\left(1-\hat{p}\right)}{n}}$$

In [8]:
# Normal-approximation confidence interval for the churn share in `sample`.
# count is the number of True values and nobs is the sample length.
# alpha=0.05 is the statsmodels default, written out to make the level explicit.
normal_interval = proportion_confint(
    count=sample.sum(),
    nobs=len(sample),
    alpha=0.05,
    method='normal',
)

In [9]:
# Report both bounds and the interval width (upper bound minus lower bound).
print('normal_interval [%f, %f] with width %f' % (
    normal_interval[0],
    normal_interval[1],
    normal_interval[1] - normal_interval[0],
))

normal_interval [0.129748, 0.174252] with width 0.044504


### Доверительный интервал Уилсона

$$\frac1{ 1 + \frac{z^2}{n} } \left( \hat{p} + \frac{z^2}{2n} \pm z \sqrt{ \frac{ \hat{p}\left(1-\hat{p}\right)}{n} + \frac{
z^2}{4n^2} } \right), \;\; z \equiv z_{1-\frac{\alpha}{2}}$$ 

In [10]:
# Wilson confidence interval for the churn share in `sample`.
# count is the number of True values and nobs is the sample length.
# alpha=0.05 is the statsmodels default, written out to make the level explicit.
wilson_interval = proportion_confint(
    count=sample.sum(),
    nobs=len(sample),
    alpha=0.05,
    method='wilson',
)

In [11]:
# Report both bounds and the interval width (upper bound minus lower bound).
print('wilson_interval [%f, %f] with width %f' % (
    wilson_interval[0],
    wilson_interval[1],
    wilson_interval[1] - wilson_interval[0],
))

wilson_interval [0.131083, 0.175581] with width 0.044498


## Размер выборки для интервала заданной ширины

In [12]:
from statsmodels.stats.proportion import samplesize_confint_proportion

In [13]:
# Sample size needed for a confidence interval around the current estimate.
# The second argument is the interval's half-length, so 0.02 targets a total
# width of about 0.04. alpha=0.05 is the library default, stated explicitly.
# The result is rounded up to a whole number of observations.
n_samples = int(np.ceil(
    samplesize_confint_proportion(
        proportion=sample.mean(),
        half_length=0.02,
        alpha=0.05,
    )
))
n_samples

1238

In [14]:
# Re-seed with the same value used for the first draw so this one is reproducible.
np.random.seed(seed=1)
# Draw n_samples Churn values with replacement (the np.random.choice default,
# spelled out explicitly).
random_sample = np.random.choice(telecom_data['Churn'], size=n_samples, replace=True)

In [15]:
# Normal-approximation interval recomputed on the larger `random_sample`.
# This rebinds `normal_interval`, replacing the value computed from `sample`.
# alpha=0.05 is the statsmodels default, written out to make the level explicit.
normal_interval = proportion_confint(
    count=random_sample.sum(),
    nobs=len(random_sample),
    alpha=0.05,
    method='normal',
)

In [16]:
# Report both bounds and the interval width (upper bound minus lower bound)
# for the interval built on the resized sample.
print('normal_interval [%f, %f] with width %f' % (
    normal_interval[0],
    normal_interval[1],
    normal_interval[1] - normal_interval[0],
))

normal_interval [0.132631, 0.172700] with width 0.040070
