In [1]:
# 필요한 패키지 불러오기

library(tidyverse)
library(data.table)
library(readxl)
library(gridExtra)
library(reshape2)
library(agricolae)

"package 'tidyverse' was built under R version 3.6.3"-- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --
√ ggplot2 3.3.2     √ purrr   0.3.4
√ tibble  3.0.4     √ dplyr   1.0.2
√ tidyr   1.1.2     √ stringr 1.4.0
√ readr   1.4.0     √ forcats 0.5.0
"package 'forcats' was built under R version 3.6.3"-- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
"package 'data.table' was built under R version 3.6.3"
Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

The following object is masked from 'package:purrr':

    transpose

"package 'gridExtra' was built under R version 3.6.3"
Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine

"package 'reshape2' was built under R ver

In [2]:
# Load the data: monthly consumer survey workbook. The file name suggests it
# covers 2018-01 through 2020-10 (TODO confirm against the sheet itself).
customer <- read_excel(path = "월별소비자동향조사_201801_202010.xlsx")

In [3]:
# Data preprocessing
# CSI index names (values of the 지수코드별 column) kept for the analysis.
category <- c('가계수입전망CSI', '현재가계저축CSI',
              '가계저축전망CSI', '주택가격전망CSI',
              '소비지출전망CSI', '의료·보건비 지출전망CSI',
              '교양·오락·문화생활비 지출전망CSI', '의류비 지출전망CSI',
              '외식비 지출전망CSI', '여행비 지출전망CSI', '교육비 지출전망CSI')

# Age-group data: keep rows whose 분류코드별 label contains '세', restrict to
# the selected indices, drop the 항목/단위 columns and reshape to long format
# (one row per index x group x remaining column, stored in variable/value).
data2 <- customer %>%
  filter(grepl('세+', 분류코드별)) %>%
  filter(지수코드별 %in% category) %>%
  select(-항목, -단위) %>%
  melt(id.vars = c('지수코드별', '분류코드별')) %>%
  mutate(지수코드별 = as.factor(지수코드별),
         분류코드별 = as.factor(분류코드별))

# Split the long table by row position into the "before" (_b) and "after" (_a)
# subsets used by the ANOVA cells below.
# NOTE(review): the split assumes melt() stacks the measure columns in their
# original (chronological) order and that 1375 = 5 age groups x 11 indices x
# 25 months — confirm against the workbook layout.
data2_b <- data2[1:1375, ]
data2_a <- data2[1376:1870, ]

# Income-bracket data: same pipeline, matching labels that contain '만원'.
data_wage <- customer %>%
  filter(grepl('만원+', 분류코드별)) %>%
  filter(지수코드별 %in% category) %>%
  select(-항목, -단위) %>%
  melt(id.vars = c('지수코드별', '분류코드별')) %>%
  mutate(지수코드별 = as.factor(지수코드별),
         분류코드별 = as.factor(분류코드별))

# The "after" subset must start at row 1651: starting at 1650 would place that
# row in both subsets (the age split above uses 1375 / 1376 the same way).
# NOTE(review): 1650 presumably = 6 income brackets x 11 indices x 25 months.
data_wage_b <- data_wage[1:1650, ]
data_wage_a <- data_wage[1651:2244, ]

In [4]:
# Fit and collect ANOVA results for each index.
#
# For every name in `category`, fits the additive model
# `value ~ 분류코드별 + variable` on the rows of `df` whose 지수코드별 equals
# that name, then stores the fitted model, its summary table and a Tukey HSD
# grouping of 분류코드별.
#
# Args:
#   df:       long-format data frame with columns 지수코드별, 분류코드별,
#             variable and value.
#   category: character vector of 지수코드별 values to analyse.
#
# Returns:
#   A list with elements `aov`, `summary` and `posthoc`; each is a list with
#   one entry per element of `category`, in the same order. An empty
#   `category` yields three empty lists.
anova_result <- function(df, category) {

  n_index <- length(category)

  # Preallocate instead of growing the lists inside the loop.
  aov_list <- vector("list", n_index)
  summary_list <- vector("list", n_index)
  posthoc_list <- vector("list", n_index)

  # seq_along() iterates zero times for an empty `category`, whereas
  # 1:length(category) would yield c(1, 0) and fail inside aov().
  for (i in seq_along(category)) {
    aov_model <- aov(value ~ 분류코드별 + variable,
                     data = df[df$지수코드별 == category[i], ])
    aov_list[[i]] <- aov_model

    summary_list[[i]] <- summary(aov_model)

    # Tukey HSD post-hoc comparison of the 분류코드별 groups.
    posthoc_list[[i]] <- HSD.test(aov_model, '분류코드별', group = TRUE)
  }

  list('aov' = aov_list, 'summary' = summary_list, 'posthoc' = posthoc_list)

}

In [5]:
# Run the ANOVA for every index in `category`.

# Age-group data: "before" (_b) and "after" (_a) subsets.
anova_result_before <- anova_result(data2_b, category)
anova_result_after <- anova_result(data2_a, category)

# Income-bracket data: "before" (_b) and "after" (_a) subsets.
anova_result_before_wage <- anova_result(data_wage_b, category)
anova_result_after_wage <- anova_result(data_wage_a, category)

In [6]:
# Print the age-group ANOVA results for each index: a header line, then the
# summary table and Tukey HSD output for the "before" subset, followed by the
# same two outputs for the "after" subset.
# seq_along() keeps the loop from running when `category` is empty.
for (i in seq_along(category)) {
  cat('\n------------------------------', category[i], '에 대한 연령별 분산분석 결과입니다. ------------------------------\n\n')
  print(anova_result_before$summary[[i]])
  print(anova_result_before$posthoc[[i]])
  print(anova_result_after$summary[[i]])
  print(anova_result_after$posthoc[[i]])
}


------------------------------ 가계수입전망CSI 에 대한 연령별 분산분석 결과입니다. ------------------------------

            Df Sum Sq Mean Sq F value   Pr(>F)    
분류코드별   4   4480  1119.9 352.164  < 2e-16 ***
variable    24    610    25.4   7.993 4.68e-14 ***
Residuals   96    305     3.2                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
$statistics
  MSerror Df  Mean       CV      MSD
     3.18 96 97.56 1.827855 1.402332

$parameters
   test     name.t ntr StudentizedRange alpha
  Tukey 분류코드별   5         3.931944  0.05

$means
          value      std  r Min Max Q25 Q50 Q75
40-50세  102.56 2.501333 25  98 108 101 103 104
40세미만 106.24 2.026491 25 103 112 105 106 107
50-60세   96.28 3.553402 25  91 105  94  95  98
60-70세   91.76 3.112876 25  87  98  89  91  95
70세이상  90.96 2.335951 25  87  95  89  91  93

$comparison
NULL

$groups
          value groups
40세미만 106.24      a
40-50세  102.56      b
50-60세   96.28      c
60-70세   91.76      d
70세이상  90.96      d

attr(,"cl

In [7]:
# Print the income-bracket ANOVA results for each index: a header line, then
# the summary table and Tukey HSD output for the "before" subset, followed by
# the same two outputs for the "after" subset.
# The header says '소득별' (by income): these are the *_wage results, and the
# previous '연령별' (by age) label was copied from the age-group loop.
# seq_along() keeps the loop from running when `category` is empty.
for (i in seq_along(category)) {
  cat('\n------------------------------', category[i], '에 대한 소득별 분산분석 결과입니다. ------------------------------\n\n')
  print(anova_result_before_wage$summary[[i]])
  print(anova_result_before_wage$posthoc[[i]])
  print(anova_result_after_wage$summary[[i]])
  print(anova_result_after_wage$posthoc[[i]])
}


------------------------------ 가계수입전망CSI 에 대한 연령별 분산분석 결과입니다. ------------------------------

             Df Sum Sq Mean Sq F value   Pr(>F)    
분류코드별    5   4206   841.2 197.037  < 2e-16 ***
variable     24    826    34.4   8.066 1.33e-15 ***
Residuals   120    512     4.3                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
$statistics
   MSerror  Df     Mean       CV      MSD
  4.269111 120 97.20667 2.125557 1.692611

$parameters
   test     name.t ntr StudentizedRange alpha
  Tukey 분류코드별   6         4.095986  0.05

$means
             value      std  r Min Max Q25 Q50 Q75
100-200만원  92.88 3.004441 25  88 100  91  92  95
100만원미만  88.24 3.205204 25  81  96  87  88  90
200-300만원  97.72 2.491987 25  93 103  96  97  99
300-400만원  98.60 3.366502 25  94 106  96  98 101
400-500만원 101.88 3.395095 25  96 109  99 102 105
500만원이상 103.92 2.722132 25 100 111 102 103 105

$comparison
NULL

$groups
             value groups
500만원이상 103.92      a
400-500만원 101.8