## SECTION01

In [1]:
# Data: 25 caffeine measurements (mg) held in a single-column DataFrame
import pandas as pd

caffeine_mg = [
    94.2, 93.7, 95.5, 93.9, 94.0, 95.2, 94.7, 93.5, 92.8, 94.4,
    93.8, 94.6, 93.3, 95.1, 94.3, 94.9, 93.9, 94.8, 95.0, 94.2,
    93.7, 94.4, 95.1, 94.0, 93.6,
]
df = pd.DataFrame({'Caffeine(mg)': caffeine_mg})

In [6]:
# 1. Sample mean
print(df.mean())

# 2. Normality test (Shapiro-Wilk)
from scipy import stats
caffeine = df['Caffeine(mg)']
print(stats.shapiro(caffeine))

# 3~5. One-sample t-test against a reference mean of 95 (one-sided, alternative='less')
t_result = stats.ttest_1samp(caffeine, 95, alternative='less')
print(t_result)

print(' ')

# Print the p-value in fixed-point notation (10 decimals) rather than scientific notation
statistic, pvalue = t_result
print(f'{pvalue:.10f}')

Caffeine(mg)    94.264
dtype: float64
ShapiroResult(statistic=0.9826578166170536, pvalue=0.9322031137746971)
TtestResult(statistic=-5.501737036221897, pvalue=5.8686553916715e-06, df=24)
 
0.0000058687


## SECTION02

In [7]:
# Data: charging times for 10 'New' chargers followed by 10 'Old' chargers
import pandas as pd

new_times = [1.5, 1.6, 1.4, 1.7, 1.5, 1.6, 1.7, 1.4, 1.6, 1.5]
old_times = [1.7, 1.8, 1.7, 1.9, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6]
charger_labels = ['New'] * len(new_times) + ['Old'] * len(old_times)

df = pd.DataFrame({
    '충전기': charger_labels,
    '충전시간': new_times + old_times,
})
print(df.head(2))

   충전기  충전시간
0  New   1.5
1  New   1.6


In [None]:
# 1~3. Independent two-sample t-test
# Select each charger's times with a single .loc lookup (row mask + column label)
new_cond = df['충전기'] == 'New'
old_cond = df['충전기'] == 'Old'
new_times = df.loc[new_cond, '충전시간']
old_times = df.loc[old_cond, '충전시간']
print(new_times.head(2))
print(old_times.head(2))

print(' ')

# One-sided test (alternative='less') with pooled variance (equal_var=True)
from scipy import stats
ind_result = stats.ttest_ind(new_times, old_times, alternative='less', equal_var=True)
print(ind_result)

0    1.5
1    1.6
Name: 충전시간, dtype: float64
10    1.7
11    1.8
Name: 충전시간, dtype: float64
 
TtestResult(statistic=-4.582575694955849, pvalue=0.00011546547787696304, df=18.0)


## SECTION03

In [18]:
# Data: paired measurements for users 1..10 under the existing and the new method
import pandas as pd

user_ids = list(range(1, 11))
existing_method = [60.4, 60.7, 60.5, 60.3, 60.8, 60.6, 60.2, 60.5, 60.7, 60.4]
new_method = [59.8, 60.2, 60.1, 59.9, 59.7, 58.4, 57.0, 60.3, 59.6, 59.8]

df = pd.DataFrame({
    'User': user_ids,
    '기존방법': existing_method,
    '새로운방법': new_method,
})
print(df.head(2))

   User  기존방법  새로운방법
0     1  60.4   59.8
1     2  60.7   60.2


In [21]:
# 1. Sample mean of the paired differences (new method minus existing method)
new_values = df['새로운방법']
existing_values = df['기존방법']
df['diff'] = new_values - existing_values
print(df['diff'].mean())

# 2~4. Paired-samples t-test (one-sided, alternative='less')
from scipy import stats
paired_result = stats.ttest_rel(new_values, existing_values, alternative='less')
print(paired_result)

-1.0300000000000005
TtestResult(statistic=-3.407973078114844, pvalue=0.0038872633380070652, df=9)


## SECTION04

In [22]:
# Data: read the math CSV and preview the first rows
import pandas as pd

math_csv_path = './data/math.csv'
df = pd.read_csv(math_csv_path)
print(df.head())

    groups  scores
0  group_A      85
1  group_A      88
2  group_A      90
3  group_A      82
4  group_A      87


In [None]:
from scipy import stats

# 1. Shapiro-Wilk test (normality) on the scores of group_A
condA = df['groups'] == 'group_A'
group_a_scores = df.loc[condA, 'scores']
print(stats.shapiro(group_a_scores))

ShapiroResult(statistic=0.9715896670696531, pvalue=0.9051800443853569)


## SECTION04

In [24]:
# Data: read the math CSV and preview the first rows
import pandas as pd

math_csv_path = './data/math.csv'
df = pd.read_csv(math_csv_path)
print(df.head())

    groups  scores
0  group_A      85
1  group_A      88
2  group_A      90
3  group_A      82
4  group_A      87


In [None]:
from scipy import stats

# 1. Shapiro-Wilk test (normality), run once per group in a fixed A-D order
group_names = ['group_A', 'group_B', 'group_C', 'group_D']
scores_by_group = {
    name: df.loc[df['groups'] == name, 'scores'] for name in group_names
}
for name in group_names:
    print(stats.shapiro(scores_by_group[name]))

print('')

# 2. Levene test (homogeneity of variance) across the same four groups
print(stats.levene(*scores_by_group.values()))

print('')

# Fit the linear model for the one-way ANOVA.
# C(groups) declares the factor as categorical explicitly, so the ANOVA row is
# labelled "C(groups)" rather than "groups".
from statsmodels.formula.api import ols
model = ols('scores ~ C(groups)', df).fit()

# 3~9. ANOVA table: build the fitted linear model first, then pass it to anova_lm()
from statsmodels.stats.anova import anova_lm
print(anova_lm(model))

ShapiroResult(statistic=0.9715896670696531, pvalue=0.9051800443853569)
ShapiroResult(statistic=0.9499422438060351, pvalue=0.6678172590861611)
ShapiroResult(statistic=0.9299424104842702, pvalue=0.44732595113862045)
ShapiroResult(statistic=0.9065684572704982, pvalue=0.25824165549017347)

LeveneResult(statistic=1.757685352622062, pvalue=0.17270284963232108)

             df  sum_sq     mean_sq          F        PR(>F)
C(groups)   3.0   411.8  137.266667  34.174274  1.240642e-10
Residual   36.0   144.6    4.016667        NaN           NaN


## SECTION05

In [29]:
# 데이터
import pandas as pd
df = pd.read_csv('./data/tomato2.csv')
print(df.head())

  비료유형  물주기  수확량
0    A    1  514
1    A    1  480
2    A    1  507
3    A    2  452
4    A    2  526


In [None]:
# Describe the model concisely with an R-style formula
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 1~9. Two-way ANOVA: both factors wrapped in C(); `*` adds both main effects
# and their interaction term
two_way_formula = '수확량 ~ C(비료유형) * C(물주기)'
model = ols(two_way_formula, data=df).fit()
anova_table = sm.stats.anova_lm(model)
print(anova_table)

                  df        sum_sq      mean_sq         F    PR(>F)
C(비료유형)          2.0   5251.722222  2625.861111  3.184685  0.059334
C(물주기)           3.0   9057.000000  3019.000000  3.661490  0.026460
C(비료유형):C(물주기)   6.0   4271.833333   711.972222  0.863491  0.535426
Residual        24.0  19788.666667   824.527778       NaN       NaN


## SECTION06

In [34]:
# 1. Proportion of respondents with 5 or more traffic accidents (30 out of 1000)
n_total = 1000
print(30 / n_total)

# 2~4. Goodness-of-fit test: observed counts vs. counts expected from the given ratios
from scipy.stats import chisquare
observed = [550, 250, 100, 70, 30]
expected_ratios = [0.60, 0.25, 0.08, 0.05, 0.02]
expected = [n_total * ratio for ratio in expected_ratios]
gof_result = chisquare(observed, expected)
print(gof_result)

0.03
Power_divergenceResult(statistic=22.166666666666668, pvalue=0.00018567620386641427)


## SECTION07

In [41]:
# 데이터
import pandas as pd
df = pd.DataFrame({
    '캠프' : ['빅분기']*80 + ['정처기']*100,
    '등록여부' : ['등록']*50 + ['등록안함']*30 + ['등록']*60 + ['등록안함']*40
})
print(df.head())

    캠프 등록여부
0  빅분기   등록
1  빅분기   등록
2  빅분기   등록
3  빅분기   등록
4  빅분기   등록


In [37]:
import pandas as pd
from scipy.stats import chi2_contingency

# 1~3. Test of independence on a 2x2 table of counts entered directly
count_rows = [
    [50, 30],
    [60, 40],
]
observed = pd.DataFrame(count_rows)
independence_result = chi2_contingency(observed)
print(independence_result)

Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))


In [42]:
# Convert the raw rows into a contingency table (camp x registration status).
# The table gets its own name so `df` keeps the raw rows and this cell can be
# re-run; reassigning `df` made a second run fail with KeyError on df['캠프'].
contingency = pd.crosstab(df['캠프'], df['등록여부'])
print(contingency)

# 4~6. Test of independence on the contingency table
print(chi2_contingency(contingency))

등록여부  등록  등록안함
캠프            
빅분기   50    30
정처기   60    40
Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))
