In [24]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

## 분산분석

In [25]:
# Load the session data from a CSV file in the working directory.
# Later cells read its 'Page' and 'Time' columns.
four_sessions = pd.read_csv('four_sessions.csv')

In [60]:
# Display the loaded frame as the cell output.
four_sessions

Unnamed: 0,Page,Time
0,Page 1,164
1,Page 2,178
2,Page 3,175
3,Page 4,155
4,Page 1,172
5,Page 2,191
6,Page 3,193
7,Page 4,166
8,Page 1,177
9,Page 2,182


In [9]:
# Mean 'Time' per page, and the sample variance between those page means.
# Selecting the 'Time' column explicitly yields a Series of group means, so
# .var() returns a scalar directly instead of relying on positional `[0]`
# lookup on a label-indexed Series (deprecated in recent pandas).
page_means = four_sessions.groupby('Page')['Time'].mean()
observed_variance = page_means.var()
print("Observed means", page_means.to_numpy())
# The original printed `obserbed_variance` (typo), which raises NameError.
print('Variance:', observed_variance)

def perm_test(df):
    """Return the variance of per-page mean times after shuffling 'Time'.

    A copy of ``df`` is made, its 'Time' values are randomly permuted across
    rows (using NumPy's global random state), and the sample variance of the
    resulting group means (grouped by 'Page') is returned. The caller's frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Page' grouping column and a numeric 'Time' column.

    Returns
    -------
    float
        Variance (pandas default, ddof=1) of the permuted group means.
    """
    df = df.copy()
    df['Time'] = np.random.permutation(df['Time'].values)
    # Select 'Time' explicitly: the result is a scalar, with no positional
    # `[0]` lookup on a label-indexed Series, and other columns are ignored.
    return df.groupby('Page')['Time'].mean().var()

# Permutation distribution of the between-page variance (3000 shuffles),
# then the share of shuffles whose variance exceeds the observed one.
perm_variance = [perm_test(four_sessions) for _ in range(3000)]
exceeds_observed = np.array(perm_variance) > observed_variance
print('Pr(Prob)', exceeds_observed.mean())

Observed means [172.8 182.6 175.6 164.6]
Variance: 55.426666666666655
Pr(Prob) 0.07


In [20]:
# Fit an OLS model of Time on Page, then build its ANOVA table.
ols_spec = smf.ols(formula='Time ~ Page', data=four_sessions)
model = ols_spec.fit()

aov_table = sm.stats.anova_lm(model)
aov_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Page,3.0,831.4,277.133333,2.739825,0.077586
Residual,16.0,1618.4,101.15,,


## 카이제곱통계

In [102]:
# Read the long-format click data and reshape it into a table with one row
# per 'Click' value and one column per 'Headline', filled from 'Rate'.
click_rate = pd.read_csv('click_rates.csv')
clicks = pd.pivot(click_rate, index='Click', columns='Headline', values='Rate')
clicks

Headline,Headline A,Headline B,Headline C
Click,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Click,14,8,12
No-click,986,992,988


In [108]:
import random

# Box of 3000 entries: 34 ones followed by 2966 zeros, then shuffled in place.
box = [1] * 34 + [0] * 2966
random.shuffle(box)

def chi2(observed, expected):
    """Sum of (observed - expected)^2 / expected over all cells.

    ``observed`` is an iterable of rows and ``expected`` supplies one expected
    value per row; every entry in a row is compared against that row's
    expected value.

    Returns the total as computed by ``np.sum``.
    """
    squared_residuals = [
        [(count - row_expected) ** 2 / row_expected for count in counts]
        for counts, row_expected in zip(observed, expected)
    ]
    # Return the sum of the squared terms.
    return np.sum(squared_residuals)

# Expected count per column for each row: 34 / 3 for the first row and
# 1000 minus that for the second row.
expected_clicks = 34 / 3
expected_noclicks = 1000 - expected_clicks
expected = [expected_clicks, expected_noclicks]
# Statistic for the observed table, one expected value per table row.
chi2observed = chi2(clicks.to_numpy(), expected)

def perm_fun(box):
    """Compute one resampled chi-square statistic from ``box``.

    Draws three samples of 1000 entries from ``box`` (each drawn without
    replacement via ``random.sample``), sums each sample, takes
    ``1000 - sum`` as the complementary count, and passes both rows to
    ``chi2`` together with the module-level ``expected`` values.
    """
    sample_clicks = [sum(random.sample(box, 1000)) for _ in range(3)]
    sample_noclicks = [1000 - drawn for drawn in sample_clicks]
    return chi2([sample_clicks, sample_noclicks], expected)

# Resampling distribution of the statistic (2000 draws).
perm_chi2 = [perm_fun(box) for _ in range(2000)]

# Convert to an array before comparing: `list > scalar` only works when the
# scalar is a NumPy type (through reflected comparison) and raises TypeError
# for a plain Python float.
exceed_count = np.sum(np.asarray(perm_chi2) > chi2observed)
resampled_p_value = exceed_count / len(perm_chi2)
print(f'Observed chi2 : {chi2observed:.4f}')
print(f'Resampled p-value: {resampled_p_value:.4f}')



Observed chi2 : 1.6659
Resampled p-value: 0.4950


In [112]:
# Chi-square test on the `clicks` table via SciPy.
# NOTE: this unpacking rebinds the module-level `expected` (and `df`) to
# SciPy's return values. `perm_fun` reads the global `expected`, so re-run the
# cell that defines `expected` before calling `perm_fun` again.
chisq, pvalue, df, expected = stats.chi2_contingency(clicks)
# Report the statistic returned by this test, not `chi2observed` from the
# resampling cell, so the statistic and p-value come from the same computation.
print(f'Observed chi2 : {chisq:.4f}')
print(f'p-value: {pvalue:.4f}')

Observed chi2 : 1.6659
p-value: 0.4348


In [163]:
# Keep only the 'Headline A' and 'Headline B' columns.
# Note that this rebinds `clicks`, replacing the full table.
clicks = clicks.loc[:, ['Headline A', 'Headline B']]

In [164]:
# Fisher's exact test on the table currently held in `clicks`, with a
# two-sided alternative; the result is shown as the cell output.
stats.fisher_exact(clicks, alternative='two-sided')

(1.7606490872210954, 0.2835969483984848)

In [None]:
# Columns to test against 'target4' with Fisher's exact test.
fisher_list = ['H', 'small_cat2', 'small_cat3']

f_val_list = []
f_p_list = []

# Iterate over `fisher_list`: the original looped over `chi_list`, which is
# not defined in this cell, and left `fisher_list` unused.
# `stats` is the scipy.stats module already imported at the top of the notebook.
for i in fisher_list:
    contingency = pd.crosstab(df3[i], df3['target4'])
    # NOTE(review): assumes each crosstab is 2x2 - confirm against the data.
    f, p = stats.fisher_exact(contingency)

    f_val_list.append(f)
    f_p_list.append(p)

## 검정력과 표본 크기

In [192]:
# Effect size between the proportions 0.0121 and 0.011, then solve for the
# sample size at alpha=0.05 and power=0.8 with a one-sided ('larger') alternative.
effect_size = sm.stats.proportion_effectsize(0.0121, 0.011)
analysis = sm.stats.TTestIndPower()
result = analysis.solve_power(
    effect_size=effect_size,
    alpha=0.05,
    power=0.8,
    alternative='larger',
)
print('Sample Size: %.3f' % result)

Sample Size: 116602.393
