# ANOVA

## 1. 데이터 불러오기

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the training set from the relative data directory (comma is the default
# read_csv delimiter), then print the column / dtype / non-null overview.
df_train = pd.read_csv('data/train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121754 entries, 0 to 121753
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   X1      121754 non-null  int64  
 1   X2      121754 non-null  float64
 2   X3      121754 non-null  float64
 3   X4      121754 non-null  float64
 4   X5      121754 non-null  object 
 5   X6      121754 non-null  int64  
 6   X7      121754 non-null  int64  
 7   X8      121754 non-null  object 
 8   Y1      121754 non-null  int64  
 9   Y2      121754 non-null  int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 9.3+ MB


## 2. 단위 일치

In [4]:
# Add 'mk': the row-wise ratio Y1 / X2. Then preview the first rows.
df_train['mk'] = df_train['Y1'].div(df_train['X2'])
df_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2,mk
0,134764,56.3,67.5,22.5,AH32,4,97,PL973,467,2241,8.294849
1,9854,115.0,67.0,13.0,AH32,2,27,PL271,1058,1163,9.2
2,94365,59.2,44.6,18.0,AH32-TM,1,14,PL141,270,1094,4.560811
3,110641,52.1,97.4,15.0,A,17,24,PL242,391,2604,7.504798
4,100159,42.9,58.7,14.5,A,4,23,PL233,232,1564,5.407925


In [5]:
# Add 'ct': Y2 divided by X4 and then by X3 (same left-to-right order as a
# chained `/`). Then preview the first rows.
df_train['ct'] = df_train['Y2'].div(df_train['X4']).div(df_train['X3'])
df_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2,mk,ct
0,134764,56.3,67.5,22.5,AH32,4,97,PL973,467,2241,8.294849,1.475556
1,9854,115.0,67.0,13.0,AH32,2,27,PL271,1058,1163,9.2,1.335247
2,94365,59.2,44.6,18.0,AH32-TM,1,14,PL141,270,1094,4.560811,1.36273
3,110641,52.1,97.4,15.0,A,17,24,PL242,391,2604,7.504798,1.782341
4,100159,42.9,58.7,14.5,A,4,23,PL233,232,1564,5.407925,1.837514


## 3. 일원배치 분산분석 및 사후분석

In [6]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [7]:
from statsmodels.sandbox.stats.multicomp import MultiComparison
import scipy.stats

### 1. X8 <-> Y2

#### 1-1 분산분석

In [8]:
# One-way ANOVA: fit an OLS model of 'ct' with X8 as a categorical factor,
# then display the ANOVA table for the fitted model.
model = ols(formula='ct ~ C(X8)', data=df_train).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(X8),28.0,9913.805655,354.064488,150.698868,0.0
Residual,121725.0,285990.866537,2.349483,,


#### 1-2 사후분석

In [9]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# All pairwise t-tests of 'ct' between X8 groups, using the 'bonf'
# multiple-testing correction.
comp = MultiComparison(data=df_train['ct'], groups=df_train['X8'])
result = comp.allpairtest(testfunc=scipy.stats.ttest_ind, method='bonf')

# Tukey's HSD (Honestly Significant Difference) on the same grouping, alpha = 0.05.
hsd1_1 = pairwise_tukeyhsd(endog=df_train['ct'], groups=df_train['X8'], alpha=0.05)

In [None]:
# Draw the plot_simultaneous() figure for the Tukey HSD result of 'ct' by X8
# and keep the returned figure in `fig`.
fig = hsd1_1.plot_simultaneous()

### 2. X8 <-> Y1

#### 2-1 분산분석

In [11]:
# One-way ANOVA: fit an OLS model of 'mk' with X8 as a categorical factor,
# then display the ANOVA table for the fitted model.
model = ols(formula='mk ~ C(X8)', data=df_train).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(X8),28.0,3574494000.0,127660500.0,12.188839,1.721823e-55
Residual,121725.0,1274894000000.0,10473560.0,,


#### 2-2 사후분석

In [12]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# All pairwise t-tests of 'mk' between X8 groups, using the 'bonf'
# multiple-testing correction.
comp = MultiComparison(data=df_train['mk'], groups=df_train['X8'])
result = comp.allpairtest(testfunc=scipy.stats.ttest_ind, method='bonf')

# Tukey's HSD (Honestly Significant Difference) on the same grouping, alpha = 0.05.
hsd1_2 = pairwise_tukeyhsd(endog=df_train['mk'], groups=df_train['X8'], alpha=0.05)

In [None]:
# Draw the plot_simultaneous() figure for the Tukey HSD result of 'mk' by X8
# and keep the returned figure in `fig`.
fig = hsd1_2.plot_simultaneous()

### 3. X5 <-> Y2

#### 3-1 분산분석

In [14]:
# One-way ANOVA: fit an OLS model of 'ct' with X5 as a categorical factor,
# then display the ANOVA table for the fitted model.
model = ols(formula='ct ~ C(X5)', data=df_train).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(X5),78.0,759.403203,9.735939,4.013686,7.322292999999999e-30
Residual,121675.0,295145.268988,2.425685,,


#### 3-2 사후분석

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# All pairwise t-tests of 'ct' between X5 groups, using the 'bonf'
# multiple-testing correction.
comp = MultiComparison(data=df_train['ct'], groups=df_train['X5'])
result = comp.allpairtest(testfunc=scipy.stats.ttest_ind, method='bonf')

# Tukey's HSD (Honestly Significant Difference) on the same grouping, alpha = 0.05.
hsd2_1 = pairwise_tukeyhsd(endog=df_train['ct'], groups=df_train['X5'], alpha=0.05)

In [16]:
# Echo the Tukey HSD result object; as a bare expression it renders only the
# object's repr (class name and memory address), not the comparison table.
# NOTE(review): hsd2_1.summary() would presumably show the pairwise table
# instead — confirm, and mind the output size with this many X5 groups.
hsd2_1

<statsmodels.sandbox.stats.multicomp.TukeyHSDResults at 0x24381c6eee0>

In [None]:
# Draw the plot_simultaneous() figure for the Tukey HSD result of 'ct' by X5
# on a 20x20 canvas and keep the returned figure in `fig`.
fig = hsd2_1.plot_simultaneous(figsize=(20, 20))

### 4. X5 <-> Y1

#### 4-1 분산분석

In [18]:
# One-way ANOVA: fit an OLS model of 'mk' with X5 as a categorical factor,
# then display the ANOVA table for the fitted model.
model = ols(formula='mk ~ C(X5)', data=df_train).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(X5),78.0,296691900.0,3803743.0,0.362096,1.0
Residual,121675.0,1278171000000.0,10504800.0,,


In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# All pairwise t-tests of 'mk' between X5 groups, using the 'bonf'
# multiple-testing correction.
comp = MultiComparison(data=df_train['mk'], groups=df_train['X5'])
result = comp.allpairtest(testfunc=scipy.stats.ttest_ind, method='bonf')

# Tukey's HSD (Honestly Significant Difference) on the same grouping, alpha = 0.05.
hsd2_2 = pairwise_tukeyhsd(endog=df_train['mk'], groups=df_train['X5'], alpha=0.05)

#### 4-2 사후분석

In [None]:
# Draw the plot_simultaneous() figure for the Tukey HSD result of 'mk' by X5
# on a 20x20 canvas and keep the returned figure in `fig`.
fig = hsd2_2.plot_simultaneous(figsize=(20, 20))

### 5. X7 <-> Y1

#### 5-1 분산분석

In [21]:
# One-way ANOVA: fit an OLS model of 'mk' with X7 as a categorical factor,
# then display the ANOVA table for the fitted model.
model = ols(formula='mk ~ C(X7)', data=df_train).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(X7),11.0,763633700.0,69421250.0,6.614582,3.66863e-11
Residual,121742.0,1277705000000.0,10495180.0,,


In [22]:
comp = MultiComparison(df_train['mk'], df_train['X7'])

result = comp.allpairtest(scipy.stats.ttest_ind, method='bonf')

#투키의 HSD - Tuckey's Honestly Significant Difference = "진정으로 유의미한 차이"
from statsmodels.stats.multicomp import pairwise_tukeyhsd
hsd2_1 = pairwise_tukeyhsd(df_train['mk'], df_train['X7'], alpha=0.05)

#### 4-2 사후분석

In [None]:
fig = hsd2_1.plot_simultaneous(figsize=(20, 20))

### 6. X7 <-> Y2

#### 6-1 분산분석

In [25]:
# One-way ANOVA: fit an OLS model of 'ct' with X7 as a categorical factor,
# then display the ANOVA table for the fitted model.
model = ols(formula='ct ~ C(X7)', data=df_train).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(X7),11.0,7682.516722,698.410611,295.001279,0.0
Residual,121742.0,288222.15547,2.367483,,


In [26]:
comp = MultiComparison(df_train['ct'], df_train['X7'])

result = comp.allpairtest(scipy.stats.ttest_ind, method='bonf')

#투키의 HSD - Tuckey's Honestly Significant Difference = "진정으로 유의미한 차이"
from statsmodels.stats.multicomp import pairwise_tukeyhsd
hsd2_1 = pairwise_tukeyhsd(df_train['ct'], df_train['X7'], alpha=0.05)

#### 4-2 사후분석

In [None]:
fig = hsd2_1.plot_simultaneous(figsize=(15, 10))