In [4]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint, binom_test

In [2]:
n1 = 34
n2 = 16
p1 = 10 / 34
p2 = 1 / 4

In [65]:
def prop_diff_confint_ind(sample1, sample2, alpha=.05):
    z = scipy.stats.norm.ppf(1 - alpha / 2)
    p1 = sum(sample1) / len(sample1)
    p2 = sum(sample2) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1) / len(sample1) + p2 * (1 - p2) / len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1) / len(sample1) + p2 * (1 - p2) / len(sample2))
    return left_boundary, right_boundary

def prop_diff_z_stat_ind1(p1, p2, n1, n2):
    P = (p1 * n1 + p2 * n2) / (n1 + n2)
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1 / n1 + 1 / n2))

def prop_diff_z_stat_ind2(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = sum(sample1) / n1
    p2 = sum(sample2) / n2
    P = (p1 * n1 + p2 * n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1 / n1 + 1 / n2))

def prop_diff_z_test(z_stat, alternative="two-sided"):
    if alternative not in ("two-sided", "less", "greater"):
        raise ValueError("alternative not recognized\nshould be 'two-sided', 'less' or 'greater'")
    if alternative == "two-sided":
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    if alternative == "less":
        return scipy.stats.norm.cdf(z_stat)
    if alternative == "greater":
        return 1 - scipy.stats.norm.cdf(z_stat)
    
def prop_diff_z_stat_rel(sample1, sample2):
    sample = list(zip(sample1, sample2))
    n = len(sample)
    f = sum(1 if x[0] == 1 and x[1] == 0 else 0 for x in sample)
    g = sum(1 if x[1] == 1 and x[0] == 0 else 0 for x in sample)
    return (f - g) / np.sqrt(f + g - (f - g) ** 2 / n)

def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    sample = list(zip(sample1, sample2))
    n = len(sample)
        
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    
    left_boundary = float(f - g) / n  - z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    right_boundary = float(f - g) / n  + z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (left_boundary, right_boundary)

In [61]:
p1, p2, n1, n2

(0.29411764705882354, 0.25, 34, 16)

In [56]:
prop_diff_z_stat_ind1(p1, p2, n1, n2)

0.32410186177608225

In [62]:
round(prop_diff_z_test(0.32410186177608225, "greater"), 4)

0.3729

In [15]:
b_d = pd.read_csv("data/banknotes.txt", sep="\t")

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [21]:
X = b_d.drop("real", axis=1)
y = b_d["real"]

In [26]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=1, test_size=50)

In [87]:
l1 = LogisticRegression(multi_class='ovr', n_jobs=1, solver='liblinear').fit(X_tr[["X1", "X2", "X3"]], y_tr)
l2 = LogisticRegression(multi_class='ovr', n_jobs=1, solver='liblinear').fit(X_tr[["X4", "X5", "X6"]], y_tr)

In [93]:
l1_p = np.abs(l1.predict(X_te[["X1", "X2", "X3"]]) - y_te)
l2_p = np.abs(l2.predict(X_te[["X4", "X5", "X6"]]) - y_te)

In [1]:
3 * 10 ** -3

0.003

In [98]:
prop_diff_z_test(prop_diff_z_stat_rel(l1_p, l2_p))

0.0032969384555543435

In [95]:
np.round(proportions_diff_confint_rel(l1_p, l2_p), 4)

array([0.0599, 0.3001])

In [91]:
np.round(prop_diff_confint_ind(l1_p, l2_p), 4)1

SyntaxError: invalid syntax (Temp/ipykernel_3636/1666781354.py, line 1)

In [44]:
u = 525
dev = 100
n = 100
mean = 541.4

In [51]:
(541.5 - 525) / (100 / 10)

1.65

In [46]:
stats.norm.ppf(1.6399999999999977)

nan

In [53]:
round(prop_diff_z_test(1.65, "greater"), 4)

0.0495