In [19]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats import weightstats
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

In [6]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a Z statistic into a p-value under the standard normal law.

    Parameters
    ----------
    z_stat : float
        Observed value of the Z statistic.
    alternative : {'two-sided', 'less', 'greater'}
        Direction of the alternative hypothesis.

    Returns
    -------
    float
        'two-sided': 2 * P(Z >= |z_stat|); 'less': P(Z <= z_stat);
        'greater': P(Z >= z_stat).

    Raises
    ------
    ValueError
        If `alternative` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function rather than `1 - cdf`: once the
    # cdf rounds to 1.0 the subtraction collapses to exactly 0, whereas `sf`
    # keeps the tiny tail probability.
    if alternative == 'two-sided':
        return 2 * stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    # Only 'greater' can reach this point.
    return stats.norm.sf(z_stat)

In [11]:
# Sizes of the two samples being compared.
n1 = 34
n2 = 16
# Sample proportions: 10 of 34 and 4 of 16. Float literals keep these true
# divisions under any division semantics (a bare 10/34 is 0 with floor
# division), and dividing by n1/n2 keeps each sample size in one place.
p1 = 10. / n1
p2 = 4. / n2
# Pooled proportion: both samples merged into one.
P = float(p1*n1 + p2*n2) / (n1 + n2)
# Difference of proportions scaled by its standard error under the pooled proportion.
Z_stat = (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [12]:
# One-sided ('greater') p-value for the pooled two-proportion Z statistic computed above.
proportions_diff_z_test(Z_stat, alternative='greater')

0.37293045872523534

### 4 

In [13]:
# Tab-separated text file; the path is relative to the notebook's working directory.
banknotes = pd.read_csv('Data/banknotes.txt', sep='\t')

In [14]:
# Preview the first rows to check that the columns were parsed as expected.
banknotes.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [16]:
# Detach the target: `pop` returns the 'real' column and removes it from
# `banknotes` in place, leaving only the feature columns in the frame.
# NOTE: not idempotent - re-running this cell without reloading the data raises KeyError.
y = banknotes.pop('real')

In [17]:
# An integer test_size is an absolute row count: 50 rows are held out for
# testing. The fixed random_state makes the shuffle reproducible.
split = train_test_split(banknotes, y, test_size=50, random_state=1)
X_train, X_test, y_train, y_test = split

In [34]:
# First classifier: sees only the first three columns of the training frame.
log1_train_features = X_train.iloc[:, :3]
log1 = LogisticRegression().fit(log1_train_features, y_train)

In [35]:
# Second classifier: sees every column from the fourth onwards.
log2_train_features = X_train.iloc[:, 3:]
log2 = LogisticRegression().fit(log2_train_features, y_train)

In [45]:
# Each model predicts on the same column subset of the test frame that it was trained on.
log1_test_features = X_test.iloc[:, :3]
log2_test_features = X_test.iloc[:, 3:]
log1_pred = log1.predict(log1_test_features)
log2_pred = log2.predict(log2_test_features)

In [46]:
# Absolute difference between prediction and label, per test row
# (presumably 0/1 labels, so 1 marks a misclassification - confirm against the data).
log1_errors = (log1_pred - y_test).abs()
log2_errors = (log2_pred - y_test).abs()

In [47]:
# Total of the per-row error values for each classifier.
log1_errors.sum(), log2_errors.sum()

(10, 1)

In [56]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for the difference of two
    proportions observed on paired binary samples.

    The i-th elements of `sample1` and `sample2` form one pair. Only
    discordant pairs contribute: `f` counts pairs (1, 0) and `g` counts
    pairs (0, 1).

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 values
        Paired observations; both must contain the same number of elements.
    alpha : float
        Significance level; the interval has confidence level 1 - alpha.

    Returns
    -------
    tuple
        (left_boundary, right_boundary) of the interval centred on (f - g) / n.

    Raises
    ------
    ValueError
        If the samples differ in length. A plain `zip` would silently drop
        the unmatched tail and compute the interval with the wrong `n`.
    """
    first, second = list(sample1), list(sample2)
    if len(first) != len(second):
        raise ValueError("paired samples must have the same length")
    n = len(first)

    z = stats.norm.ppf(1 - alpha / 2.)

    f = sum(1 for a, b in zip(first, second) if a == 1 and b == 0)
    g = sum(1 for a, b in zip(first, second) if a == 0 and b == 1)

    # Centre and half-width are shared by both boundaries; compute them once.
    centre = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (centre - half_width, centre + half_width)

In [57]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z statistic for the difference of two proportions observed on paired
    binary samples.

    The i-th elements of `sample1` and `sample2` form one pair. Only
    discordant pairs contribute: `f` counts pairs (1, 0) and `g` counts
    pairs (0, 1).

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 values
        Paired observations; both must contain the same number of elements.

    Returns
    -------
    float
        (f - g) / sqrt(f + g - (f - g)**2 / n), where n is the number of pairs.

    Raises
    ------
    ValueError
        If the samples differ in length. A plain `zip` would silently drop
        the unmatched tail and compute the statistic with the wrong `n`.
    """
    first, second = list(sample1), list(sample2)
    if len(first) != len(second):
        raise ValueError("paired samples must have the same length")
    n = len(first)

    f = sum(1 for a, b in zip(first, second) if a == 1 and b == 0)
    g = sum(1 for a, b in zip(first, second) if a == 0 and b == 1)

    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

In [52]:
# Paired-sample Z statistic comparing the two classifiers' per-row errors on the same test rows.
z = proportions_diff_z_stat_rel(log1_errors, log2_errors)
z

2.9386041680175268

In [53]:
# Two-sided p-value for the paired Z statistic held in `z`.
proportions_diff_z_test(z, alternative='two-sided')

0.0032969384555543435

In [58]:
# Default alpha = 0.05 in the helper gives the 95% level named in the message.
ci_low, ci_high = proportions_diff_confint_rel(log1_errors, log2_errors)
print("95%% confidence interval for a difference between proportions: [%f, %f]"
      % (ci_low, ci_high))

95% confidence interval for a difference between proportions: [0.059945, 0.300055]


### 6 

In [72]:
# NOTE(review): this has the shape of a one-sample Z statistic,
# (observed - reference) / (spread / sqrt(count)) - confirm the roles of the
# literals against the problem statement.
observed_value = 541.5
reference_value = 525
spread = 100
count = 100
z = (observed_value - reference_value) / (spread / np.sqrt(count))
z

1.65

In [73]:
# One-sided ('greater') p-value for the Z statistic held in `z`; the helper only
# maps a Z value to a normal tail probability, so it applies here despite its name.
proportions_diff_z_test(z, alternative = 'greater')

0.0494714680336481