In [25]:
import numpy as np
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic
import pandas as pd

# Load the tab-separated water dataset and preview its first five rows.
data = pd.read_csv('water.txt', delimiter='\t')
data.head(5)

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [26]:
def calculate(dataframe):
    """Print the two-sided 95% Student-t confidence interval for the mean.

    Parameters
    ----------
    dataframe : sample of numbers
        Must support ``len()`` and ``.std(ddof=...)``; the notebook passes
        pandas Series.

    Returns
    -------
    None
        The ``(lower, upper)`` tuple is printed, not returned.
    """
    n_obs = len(dataframe)
    mean = np.mean(dataframe)
    # Standard error of the mean, from the unbiased (ddof=1) sample std.
    mean_std = dataframe.std(ddof=1) / np.sqrt(n_obs)
    # A single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(_tconfint_generic(mean, mean_std, n_obs - 1, 0.05, 'two-sided'))

In [27]:
# Mortality: every town first, then only the towns located in the South.
calculate(data['mortality'])
calculate(data.loc[data['location'] == 'South', 'mortality'])

# Water hardness: South first, then North.
for region in ('South', 'North'):
    calculate(data.loc[data['location'] == region, 'hardness'])

(1476.0833413552848, 1572.2117406119285)
(1320.1517462936238, 1433.463638321761)
(53.467198692036106, 86.071262846425441)
(21.422487285724259, 39.377512714275738)


In [28]:
from statsmodels.stats.proportion import samplesize_confint_proportion
from statsmodels.stats.proportion import proportion_confint

In [29]:
# Wilson interval for 1 success in 50 trials; show its lower bound.
wilson_interval = proportion_confint(1, 50, method='wilson')
# print(...) with one argument works on both Python 2 and Python 3.
print(wilson_interval[0])

0.00353925927165


In [30]:
# Sample size returned by samplesize_confint_proportion(0.02, 0.01),
# rounded up to a whole number of observations.
n_samples = int(np.ceil(samplesize_confint_proportion(0.02, 0.01)))
# print(...) with one argument works on both Python 2 and Python 3.
print(n_samples)

753


In [44]:
from statsmodels.compat.python import lzip, range
import numpy as np
from scipy import stats, optimize
from sys import float_info

from statsmodels.stats.base import AllPairsResults
from statsmodels.tools.sm_exceptions import HypothesisTestWarning


def proportion_confint_custom(count, nobs, alpha=0.05, method='normal'):
    pd_index = getattr(count, 'index', None)
    if pd_index is not None and hasattr(pd_index, '__call__'):
        # this rules out lists, lists have an index method
        pd_index = None
    count = np.asarray(count)
    nobs = np.asarray(nobs)

    q_ = count * 1. / nobs
    alpha_2 = 0.5 * alpha

    if method == 'normal':
        std_ = np.sqrt(q_ * (1 - q_) / nobs)
        dist = stats.norm.isf(alpha / 2.) * std_
        ci_low = q_ - dist
        ci_upp = q_ + dist

    elif method == 'binom_test':
        # inverting the binomial test
        def func(qi):
            return stats.binom_test(q_ * nobs, nobs, p=qi) - alpha
        if count == 0:
            ci_low = 0
        else:
            ci_low = optimize.brentq(func, float_info.min, q_)
        if count == nobs:
            ci_upp = 1
        else:
            ci_upp = optimize.brentq(func, q_, 1. - float_info.epsilon)

    elif method == 'beta':
        ci_low = stats.beta.ppf(alpha_2, count, nobs - count + 1)
        ci_upp = stats.beta.isf(alpha_2, count + 1, nobs - count)

        if np.ndim(ci_low) > 0:
            ci_low[q_ == 0] = 0
            ci_upp[q_ == 1] = 1
        else:
            ci_low = ci_low if (q_ != 0) else 0
            ci_upp = ci_upp if (q_ != 1) else 1

    elif method == 'agresti_coull':
        crit = stats.norm.isf(alpha / 2.)
        nobs_c = nobs + crit**2
        q_c = (count + crit**2 / 2.) / nobs_c
        std_c = np.sqrt(q_c * (1. - q_c) / nobs_c)
        dist = crit * std_c
        ci_low = q_c - dist
        ci_upp = q_c + dist

    elif method == 'wilson':
        crit = stats.norm.isf(alpha / 2.)
        crit2 = crit**2
        denom = 1 + crit2 / nobs
        center = (q_ + crit2 / (2 * nobs)) / denom
        dist = crit * np.sqrt(q_ * (1. - q_) / nobs + crit2 / (4. * nobs**2))
        dist /= denom
        ci_low = center - dist
        ci_upp = center + dist

    # method adjusted to be more forgiving of misspellings or incorrect option name
    elif method[:4] == 'jeff':
        ci_low, ci_upp = stats.beta.interval(1 - alpha, count + 0.5,
                                             nobs - count + 0.5)

    else:
        raise NotImplementedError('method "%s" is not available' % method)

    if pd_index is not None and np.ndim(ci_low) > 0:
        import pandas as pd
        if np.ndim(ci_low) == 1:
            ci_low = pd.Series(ci_low, index=pd_index)
            ci_upp = pd.Series(ci_upp, index=pd_index)
        if np.ndim(ci_low) == 2:
            ci_low = pd.DataFrame(ci_low, index=pd_index)
            ci_upp = pd.DataFrame(ci_upp, index=pd_index)

    return ci_low, ci_upp

In [45]:
# Normal-approximation interval for 1 success in 50 trials. The bounds are
# not clipped to [0, 1], so the lower bound printed here is negative.
normal_interval = proportion_confint_custom(1, 50, method='normal')
# print(...) with one argument works on both Python 2 and Python 3.
print(normal_interval[0])

-0.0188053070818


In [53]:
from scipy.stats import norm
# Frozen normal distribution with location 0 and scale 1.
rv = norm(loc=0, scale=1)
# Quantile at probability 0.9985 (displayed as the cell output).
rv.ppf(0.9985)

2.9677379253417944

In [64]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

def proportions_confint_diff_ind(sum_sample1, sum_sample2, len_sample1, len_sample2, alpha = 0.05):
    """Normal-approximation confidence interval for a difference of proportions.

    The proportions are ``sum_sample1 / len_sample1`` and
    ``sum_sample2 / len_sample2``; the interval is for the first minus the
    second and has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)``.
    """
    # float() keeps the division exact under Python 2 integer semantics.
    share_first = float(sum_sample1) / len_sample1
    share_second = float(sum_sample2) / len_sample2
    critical_value = scipy.stats.norm.ppf(1 - alpha / 2.)

    # Standard error of the difference: the two sample variances add.
    spread = np.sqrt(share_first * (1 - share_first)/ len_sample1 + share_second * (1 - share_second)/ len_sample2)
    delta = share_first - share_second
    half_width = critical_value * spread

    return (delta - half_width, delta + half_width)

In [65]:
# Interval for the difference between 104/11037 and 189/11034 at alpha = 0.05.
proportions_confint_diff_ind(sum_sample1=104, sum_sample2=189,
                             len_sample1=11037, len_sample2=11034,
                             alpha=0.05)

(-0.010724297276960124, -0.0046877506750494392)

In [63]:
# Event proportions: 104 out of 11037 and 189 out of 11034.
p = 104.0/11037

p_1 = 189.0/11034

# Odds ratio of p_1 relative to p. The whole expression must sit inside the
# print(...) call: on Python 3, `print (a) / (b)` prints only `a` and then
# fails dividing None by `b`.
print((p_1 / (1 - p_1)) / (p / (1 - p)))

1.83205394191


In [66]:
# NOTE(review): despite the variable name, the original call passed no
# `method`, so the default normal-approximation interval is computed (the
# recorded upper bound matches it). The default is spelled out here; switch
# to method='wilson' if a Wilson interval was actually intended.
wilson_interval = proportion_confint(104, 11037, method='normal')
# print(...) with one argument works on both Python 2 and Python 3.
print(wilson_interval[1])

0.0112252781862


In [69]:
# Ratio of the two proportions; print(...) works on Python 2 and Python 3.
print(p_1 / p)

1.81780179445


In [88]:
# Fix NumPy's global random state so the resampling below is repeatable.
np.random.seed(0)

# 0/1 outcome vectors: 104 ones out of 11037 and 189 ones out of 11034.
# NOTE(review): names suggest placebo / treatment groups - confirm which is which.
p_plac = np.concatenate([np.ones(104), np.zeros(11037 - 104)])

p_true = np.concatenate([np.ones(189), np.zeros(11034 - 189)])

def get_bootstrap_samples(data, n_samples):
    """Draw ``n_samples`` resamples of ``data`` with replacement.

    ``data`` is indexed with an integer array, so it must support NumPy
    fancy indexing. Each row of the result is one resample of the same
    length as ``data``; indices come from NumPy's global random state.
    """
    sample_size = len(data)
    resample_idx = np.random.randint(0, sample_size, (n_samples, sample_size))
    return data[resample_idx]

# Median of each of 1000 bootstrap resamples per group. The result is
# materialised as a list: on Python 3 a bare map(...) is a lazy iterator,
# which np.percentile cannot consume.
# NOTE(review): for 0/1 data with so few ones, every resample median is 0
# (see the recorded [0. 0.] intervals) - confirm the median is the intended
# statistic.
plac_median_scores = list(map(np.median, get_bootstrap_samples(p_plac, 1000)))
true_median_scores = list(map(np.median, get_bootstrap_samples(p_true, 1000)))

def stat_intervals(stat, alpha):
    """Return the percentile interval of ``stat`` at level ``alpha``.

    The result is the array of the ``100 * alpha / 2`` and
    ``100 * (1 - alpha / 2)`` percentiles of ``stat``.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [89]:
# 95% percentile intervals of the bootstrap medians for each group.
# print(...) with one argument works on both Python 2 and Python 3.
print(stat_intervals(plac_median_scores, 0.05))
print(stat_intervals(true_median_scores, 0.05))

[ 0.  0.]
[ 0.  0.]
