In [14]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp, t, norm
import scipy.stats.distributions as dist

In [None]:
i_max_temp = [33., 32., 30., 29., 25., 30., 37., 37., 29., 30.,
                   36., 32., 33., 34., 53., 45., 25., 28., 32., 27.,
                   26., 28., 24., 26., 9., 22., 17., 26., 27., 30., 34.]
i_max_temp = np.asarray(i_max_temp)

c_max_temp = [34., 36., 30., 29., 30., 35., 44., 38., 31., 33.,
                   39., 33., 34., 39., 51., 44., 25., 34., 36., 29.,
                   27., 29., 27., 24., 11., 21., 19., 26., 28., 31., 38.]
c_max_temp = np.asarray(c_max_temp)



In [None]:
mean_i = np.mean(i_max_temp)
mean_c = np.mean(c_max_temp)
diff_mean = mean_i - mean_c

diff_std = np.std(i_max_temp - c_max_temp)
print('Mean of Ithaca: ', mean_i)
print('Mean of Canada: ', mean_c)
print('Difference of mean temp: ', diff_mean)

print('Standard Deviations of the differences: ', diff_std)

In [None]:
def autocorr_diff(ts1, ts2):
    '''Calculate autocorrelation of the difference between two paired samples'''
    # put into pandas df
    data = {'ts1':   ts1,
            'ts2':   ts2,
            'diff_ts':   ts1-ts2}

    df = pd.DataFrame(data)
    rho1 = df.diff_ts.autocorr(lag=1)
    
    return rho1
    
def n_prime(ts1, ts2):
    '''
    calculate the effective sample size/equivalent number of independent samples 
    from two paired samples of time series data that have high autocorrelation
    
    $n'  \approx n \frac{1-\rho_1}{1+\rho_1} $
    '''
    n = len(ts1)
    rho1 = autocorr_diff(ts1, ts2)
    nprime = n * ((1-rho1)/(1+rho1))
    
    return rho1, nprime

In [None]:
rho1, nprime = n_prime(i_max_temp, c_max_temp)
print(rho1, nprime)

### Test Difference of Proportion

based on [this example](https://online.stat.psu.edu/stat415/lesson/9/9.4)

In [4]:
## non-smoker data
n1 = 605. # total number of participants
y1 = 351. # number who answered "yes"


## smoker data
n2 = 195. # total number of participants
y2 = 41. # number who answered "yes"


In [25]:
# Null hypothesis is that p1 = p2 
# proportion of sample 1 is equal to sample 2
# two-tailed test will be required

def test_diff_proportion(ts1, ts2, alpha):
    '''
    Calculate the test statistic for testing 
    the difference in two population proportions
    
    Y1 : the number sample 1 that answer 'yes'
    Y2 : the number of sample 2 that answer 'yes'
    n1 : the size of sample 1
    n2 : the size of sample 2
    alpha : significance level
    
    return 
    Z : the test statistic
    p : the p-value at the significance level (alpha)
    
    '''
    n1 = len(ts1)
    n2 = len(ts2)
    y1 = ts1.sum()
    y2 = ts2.sum()
    
    p1 = y1/n1 # proportion of sample 1 who said yes
    p2 = y2/n2 # proportion of sample 2 who said yes
    phat = (Y1+Y2)/(n1 + n2)
    print('phat: ', phat)
    
    std_err = np.sqrt(phat*(1-phat)*(1/n1 + 1/n2))
    
    Z = ((p1 - p2) - 0)/(std_err)
    print('Z: value: ', Z)
    
    # Calculate the  p-value
    # based on the standard normal distribution z-test
    pvalue = 2*dist.norm.cdf(-np.abs(Z)) # Multiplied by two indicates a two tailed testing.
    print("Computed P-value is", pvalue)
    if pvalue < alpha:
        print('Reject null hypothesis, statistical significance found')



In [26]:
test_statistic(y1, y2, n1, n2, 0.05)

0.5801652892561984 0.21025641025641026
phat:  0.49
8.985900954503084
Computed P-value is 2.566230446480293e-19
Reject null hypothesis, statistical significance found
Critical t-value:  1.6467653442385173
