In [21]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import DescrStatsW
from statsmodels.stats.proportion import proportion_confint

In [2]:
df_water = pd.read_table('Data/water.txt')

In [3]:
df_water.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


#### Корреляция Пирсона

In [4]:
df_water[['mortality', 'hardness']].corr(method='pearson')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


#### Ответ 1: -0.6548

#### Корреляция Спирмена

In [5]:
df_water[['mortality', 'hardness']].corr(method='spearman')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


#### Ответ 2: -0.6317

#### Разбиение городов на северные и южные

In [6]:
df_water[df_water['location'] == 'South'][['mortality', 'hardness']].corr(method='pearson')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [7]:
df_water[df_water['location'] == 'North'][['mortality', 'hardness']].corr(method='pearson')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.368598
hardness,-0.368598,1.0


#### Ответ 3: -0.3686

#### Корреляция Метьюса

In [8]:
a = 239  # мужчины часто
b = 203  # женщины часто
c = 515  # мужчины редко
d = 718  # женщины редко

In [9]:
(a*d - c*b) / np.sqrt((a+b)*(a+c)*(b+d)*(c+d))

0.10900237458678963

#### Ответ 4: 0.109

In [10]:
M = np.array([[239, 203], [515, 718]])

In [11]:
stats.chi2_contingency(M)

(19.40753078854304,
 1.0558987006638725e-05,
 1,
 array([[198.96597015, 243.03402985],
        [555.03402985, 677.96597015]]))

#### Ответ 5: 5

#### Доверительный интервал для доли

In [12]:
female = b / (a+d)
male = a / (a+c)

In [13]:
male - female

0.1048549151997428

In [14]:
fem = np.append(np.ones(203), np.zeros(718))
mal = np.append(np.ones(239), np.zeros(515))

In [41]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = stats.norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

In [42]:
proportions_diff_confint_ind(mal, fem)

(0.053905233215813156, 0.13922183141523897)

#### Ответ 6:0.0539

### Проверка гипотезы о равенстве долей

In [31]:
def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [37]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - stats.norm.cdf(z_stat)

In [38]:
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(mal, fem), 'two-sided'))

p-value: 0.000008


#### Ответ 7: 6

### Расчет хи-квадрат

In [44]:
df_happyness = pd.DataFrame({'No money': [197, 382, 110],
                             'Med money': [111, 685, 342],
                             'Good money': [33, 331, 333]},
                            index=['Unhappy', 'Happy', 'Very happy'])

In [45]:
df_happyness

Unnamed: 0,No money,Med money,Good money
Unhappy,197,111,33
Happy,382,685,331
Very happy,110,342,333


In [53]:
df_happyness.sum().values / 3

array([229.66666667, 379.33333333, 232.33333333])

In [57]:
f_obs = df_happyness.values
f_obs

array([[197, 111,  33],
       [382, 685, 331],
       [110, 342, 333]], dtype=int64)

In [68]:
stats.chi2_contingency(f_obs, correction=True, lambda_=None)

(293.68311039689746,
 2.4964299580093467e-62,
 4,
 array([[ 93.08597464, 153.74722662,  94.16679873],
        [381.6251981 , 630.318542  , 386.0562599 ],
        [214.28882726, 353.93423138, 216.77694136]]))

#### Ответ 8: 293.6831

#### Ответ 9: 62

### Коэффициент крамера

In [73]:
np.sqrt(293.68311039689746/(sum(sum(f_obs))*2))

0.2412013934500338