In [None]:
import scipy.stats as spt
import numpy as np
import pandas as pd
import seaborn as sb
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import statsmodels.stats.weightstats as sw
import matplotlib.pyplot as plt



In [None]:
# Independent two-sample t-test between the `fres` and `soph` samples.

fres = np.array([3.7, 4.3, 2.5, 3.3, 3.6, 3.1])
soph = np.array([1.8, 4.2, 4.1, 2.2, 3.2, 3.8])
twosam = spt.ttest_ind(a=fres, b=soph)
print(twosam)

# Read the statistic and p-value by field name for the formatted summary line.
print('t statistic = %.3f, p-value = %.3f' % (twosam.statistic, twosam.pvalue))


# Sample means of each group (labels are printed in Korean: 1st year / 2nd year).
print('1학년 :', fres.mean())
print('2학년 :', soph.mean())

In [None]:
# Build a small score table with one column per group; 'jun' has one missing value.

data1 = pd.DataFrame({
    'fre': [3.7, 4.3, 2.5, 3.3, 3.6],
    'sop': [1.8, 4.2, 4.1, 2.2, 3.2],
    'jun': [3.3, 3.7, 3.4, 3.9, np.nan],
    'sen': [4.1, 3.8, 3.5, 3.2, 2.3],
})
print(data1['jun'], '\n')

sop = data1['sop']
# Replace the missing 'jun' entry with the mean of the observed 'jun' values
# so that the t-test below receives no NaN.
jun = data1['jun'].fillna(data1['jun'].mean())

print(jun, '\n')

# Independent two-sample t-test on 'sop' versus the imputed 'jun' column.
tsam = spt.ttest_ind(sop, jun)
print(tsam)

In [None]:
# Paired (dependent-samples) t-test on matched before/after measurements.

before = np.array([
    68, 61, 60, 68, 67, 64, 66, 67,
    66, 67, 72, 74, 61, 71, 58, 77,
])
after = np.array([
    56, 55, 67, 62, 59, 67, 50, 60,
    59, 53, 60, 65, 62, 61, 64, 57,
])

# ttest_rel pairs the observations element by element.
pairsam = spt.ttest_rel(a=before, b=after)
print(pairsam)

In [None]:
# t-tests using statsmodels (statsmodels.stats.weightstats, imported as `sw`).

fre, sen = data1['fre'], data1['sen']


# alternative: one of 'two-sided' / 'larger' / 'smaller';
# usevar: 'pooled' or 'unequal' variance;
# value: the difference between the two group means under the null hypothesis.
tsams = sw.ttest_ind(x1=fre, x2=sen, alternative='two-sided', usevar='pooled', value=0)
print('tstat = \t',tsams[0],'\np-val = \t',tsams[1],'\ndegree of freedom = \t',tsams[2])



# Paired test built from two one-sided tests (TOST) against the
# lower/upper bounds 0.1 and 0.2 on the before/after samples.

ptsams = sw.ttost_paired(x1=before, x2=after, low=0.1, upp=0.2)
print(ptsams)

In [None]:
# Two-way ANOVA with statsmodels.
# Reference: https://www.statsmodels.org/stable/anova.html#module-statsmodels.stats.anova

# Fetch the "Moore" dataset from the R package "carData" (cached after the first download).
moore = sm.datasets.get_rdataset("Moore", "carData", cache=True)

# Rename the dotted column so it can be used as a plain name in the model formula.
data2 = moore.data.rename(columns={"partner.status":"partner_status"})

# Both factors use Sum contrasts and the model includes their interaction.
moore_lm = ols(formula='conformity ~ C(fcategory, Sum)*C(partner_status, Sum)', data=data2).fit()

# Type II ANOVA table for the fitted model.
table = anova_lm(moore_lm, typ=2)

print(table)

In [None]:
# Read the score data from a CSV file and run a one-way ANOVA of score by grade.
df = pd.read_csv('score1.csv')
print(df)

# Show which grade labels occur in the data.
print(df['grade'].unique())

# Fit score against grade as a categorical factor; the ANOVA table is the cell's output.
model = ols(formula='score ~ C(grade)', data=df).fit()
anova_lm(model)

In [None]:
# When using scipy.stats.f_oneway() for a one-way ANOVA, remove missing values first, e.g.:

# data[~np.isnan(data)]

In [None]:
# Check the assumptions on the errors: normality and equal variances.

# Score series for each of the four grade labels, in a fixed order.
grade_scores = [df.loc[df.grade == label, 'score'] for label in ('fre', 'sop', 'jun', 'sen')]

# Shapiro-Wilk normality test, run on the 'fre' group only.
print(spt.shapiro(grade_scores[0]), '\n')

# Levene test for equal variances across the four groups.
print(spt.levene(*grade_scores), '\n')

# Bartlett test for equal variances across the same four groups.
print(spt.bartlett(*grade_scores))

In [None]:
# Post-hoc pairwise comparisons of score between grade groups.

# Import both helpers from the public statsmodels.stats.multicomp module
# instead of reaching into the statsmodels.sandbox package.
from statsmodels.stats.multicomp import MultiComparison, pairwise_tukeyhsd

comp = MultiComparison(df.score, df.grade)


# Bonferroni-corrected pairwise t-tests; the first element of the result is printed.
opt = comp.allpairtest(spt.ttest_ind, method='bonf')
print(opt[0])

# Tukey's HSD at alpha = 0.05; the summary table is the cell's output.
hsd = pairwise_tukeyhsd(df['score'], df['grade'], alpha=0.05)
hsd.summary()

In [None]:
# Two-way ANOVA: load the second score file and preview its first five rows.

data2 = pd.read_csv(filepath_or_buffer='score2.csv')
data2.head(5)

In [None]:
# Apply len within each 'class' group to see how many rows each group has.
data2.groupby(by='class').agg(func=len)
# Alternative groupings, kept for reference:
#data2.groupby('grade').agg(len)
#data2.groupby(['grade', 'class']).agg(len)

In [None]:
# Two-way ANOVA of score on grade, class and their interaction.
# Formula terms are evaluated as Python expressions, and `class` is a reserved
# keyword, so the column must be quoted with patsy's Q() to be referenced.
model2 = ols("score ~ C(grade)*C(Q('class'))", data=data2).fit()
# The ANOVA table (default type) is the cell's output.
anova_lm(model2)

In [None]:
# Toy dataset for comparing the statsmodels anova_lm `typ` options.
# Reference: https://jooskorstanje.com/anova-types-of-sums-of-squares-notebook.html

# Ten observations: six on 'sat' and four on 'sun'; five 'rain' and five 'sun' weather.
weekday = ['sat'] * 6 + ['sun'] * 4
weather = ['rain'] * 5 + ['sun'] * 5
# Sales are 100 for the first five rows and 10000 for the last five.
sales = [100] * 5 + [10000] * 5

data = pd.DataFrame(dict(weekday=weekday, weather=weather, sales=sales))
data

In [None]:
# Type I ANOVA on the model with both factors and their interaction.
# Original author's reading of the table: weekday is more important, and the
# interaction effect is not significant.
lm = ols(formula='sales ~ C(weekday)*C(weather)', data=data).fit()
table = anova_lm(lm, typ=1)  # Type 1 ANOVA DataFrame
print(table)

In [None]:
# Type II ANOVA on the additive model (no interaction term in the formula).
# Original author's reading of the table: weather is more important.
lm = ols(formula='sales ~ C(weekday) + C(weather)', data=data).fit()
table = anova_lm(lm, typ=2)  # Type 2 ANOVA DataFrame
print(table)

In [None]:
# Type III ANOVA on the model with both factors and their interaction.
# Original author's reading of the table: weekday is more important, and the
# interaction effect is not significant.
lm = ols(formula='sales ~ C(weekday)*C(weather)', data=data).fit()
table = anova_lm(lm, typ=3)  # Type 3 ANOVA DataFrame
print(table)