In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings

# 파일 불러와서 열이름 바꾸기
korean, long_term_frgn, short_term_frgn, resident, card_cnt, card_amt, waste_cnt
(이거 예시는 card_amt임)

In [2]:
df = pd.read_csv("files\\data_preprocessing\\korean_preprocessing.csv", encoding='cp949')

# 시계열 피처단위로 변경
df['base_date'] = pd.to_datetime(df['base_date'])

# base_date를 index로 설정
df.index = df['base_date']
df.columns = ['base_date', 'emd_cd', 'korean']

In [3]:
# 필요 없는 열 제거
data = df.drop(['base_date'], 1)  
data

Unnamed: 0_level_0,emd_cd,korean
base_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-31,50110250,7.209306e+06
2018-02-28,50110250,5.579125e+06
2018-03-31,50110250,7.615021e+06
2018-04-30,50110250,8.498349e+06
2018-05-31,50110250,8.916782e+06
...,...,...
2021-02-28,50130620,4.321265e+06
2021-03-31,50130620,4.744309e+06
2021-04-30,50130620,5.236706e+06
2021-05-31,50130620,5.805413e+06


In [4]:
# 한 행정동의 행만 추출
sub_area = data['emd_cd'] == 50110690  ##각행정동코드

# base_date, card_amt 만 갖는 df 생성
sub_df = data[sub_area]
sub_df = sub_df.drop('emd_cd', 1)
sub_df

Unnamed: 0_level_0,korean
base_date,Unnamed: 1_level_1
2018-01-31,2837494.0
2018-02-28,2311217.0
2018-03-31,2609788.0
2018-04-30,2969020.0
2018-05-31,3071392.0
2018-06-30,2821995.0
2018-07-31,2921434.0
2018-08-31,3074936.0
2018-09-30,2512686.0
2018-10-31,2755231.0


In [5]:
# series 형으로 변환
sub_df.to_dict('series')

{'korean': base_date
 2018-01-31    2.837494e+06
 2018-02-28    2.311217e+06
 2018-03-31    2.609788e+06
 2018-04-30    2.969020e+06
 2018-05-31    3.071392e+06
 2018-06-30    2.821995e+06
 2018-07-31    2.921434e+06
 2018-08-31    3.074936e+06
 2018-09-30    2.512686e+06
 2018-10-31    2.755231e+06
 2018-11-30    2.582506e+06
 2018-12-31    2.384707e+06
 2019-01-31    2.692803e+06
 2019-02-28    2.567460e+06
 2019-03-31    2.705826e+06
 2019-04-30    2.934676e+06
 2019-05-31    3.168779e+06
 2019-06-30    2.929936e+06
 2019-07-31    2.996705e+06
 2019-08-31    2.910714e+06
 2019-09-30    2.449941e+06
 2019-10-31    3.056250e+06
 2019-11-30    2.849750e+06
 2019-12-31    2.742843e+06
 2020-01-31    2.908403e+06
 2020-02-29    1.976880e+06
 2020-03-31    1.673146e+06
 2020-04-30    1.791274e+06
 2020-05-31    2.183598e+06
 2020-06-30    2.420549e+06
 2020-07-31    2.717458e+06
 2020-08-31    2.997326e+06
 2020-09-30    2.076840e+06
 2020-10-31    2.530107e+06
 2020-11-30    2.415463e+06

# ARIMA

In [9]:
# ARIMA에 적합한 float로 바꾸는 과정
sub_df_float = sub_df[:].astype(np.float)

In [10]:
# ADF 검정 - 정상성 확인하는 검정 (귀무 : 정상성 만족 x, 대립 : 정상성 만족)
from statsmodels.tsa.stattools import adfuller

result = adfuller(sub_df)
print('ADF Statistic : %f' % result[0])
print('p-value : %f' % result[1])
print('Critical Values :')
for key, value in result[4].items():
    print('\t%s: %3f' % (key, value))

ADF Statistic : -0.331548
p-value : 0.920916
Critical Values :
	1%: -3.646135
	5%: -2.954127
	10%: -2.615968


In [11]:
# 차분
diff_1 = sub_df_float.diff(periods=1).iloc[1:]

# ADF 검정 - 정상성 확인하는 검정 (귀무 : 정상성 만족 x, 대립 : 정상성 만족)
result = adfuller(diff_1)
print('ADF Statistic : %f' % result[0])
print('p-value : %f' % result[1])
print('Critical Values :')
for key, value in result[4].items():
    print('\t%s: %3f' % (key, value))

ADF Statistic : -4.073305
p-value : 0.001072
Critical Values :
	1%: -3.646135
	5%: -2.954127
	10%: -2.615968


In [12]:
from pmdarima.arima import auto_arima

model_arima= auto_arima(sub_df,trace=True, error_action='ignore',suppress_warnings=True,stepwise=False,seasonal=True)

model_arima.fit(sub_df)

 ARIMA(0,1,0)(0,0,0)[1] intercept   : AIC=1167.885, Time=0.08 sec
 ARIMA(0,1,1)(0,0,0)[1] intercept   : AIC=1166.675, Time=0.02 sec
 ARIMA(0,1,2)(0,0,0)[1] intercept   : AIC=1168.471, Time=0.04 sec
 ARIMA(0,1,3)(0,0,0)[1] intercept   : AIC=1169.305, Time=0.05 sec
 ARIMA(0,1,4)(0,0,0)[1] intercept   : AIC=1169.373, Time=0.09 sec
 ARIMA(0,1,5)(0,0,0)[1] intercept   : AIC=1173.485, Time=0.09 sec
 ARIMA(1,1,0)(0,0,0)[1] intercept   : AIC=1166.391, Time=0.02 sec
 ARIMA(1,1,1)(0,0,0)[1] intercept   : AIC=1168.497, Time=0.05 sec
 ARIMA(1,1,2)(0,0,0)[1] intercept   : AIC=1170.262, Time=0.12 sec
 ARIMA(1,1,3)(0,0,0)[1] intercept   : AIC=1169.844, Time=0.12 sec
 ARIMA(1,1,4)(0,0,0)[1] intercept   : AIC=1170.750, Time=0.11 sec
 ARIMA(2,1,0)(0,0,0)[1] intercept   : AIC=1168.352, Time=0.03 sec
 ARIMA(2,1,1)(0,0,0)[1] intercept   : AIC=1168.950, Time=0.09 sec
 ARIMA(2,1,2)(0,0,0)[1] intercept   : AIC=1165.545, Time=0.42 sec
 ARIMA(2,1,3)(0,0,0)[1] intercept   : AIC=1169.188, Time=0.26 sec
 ARIMA(3,1

ARIMA(order=(2, 1, 2), scoring_args={}, seasonal_order=(0, 0, 0, 1),

In [13]:
# ARIMA 모델 
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm

# (AR=2, 차분=1, MA=2) 파라미터로 ARIMA 모델을 학습합니다.
model = ARIMA(sub_df.korean.values, order=(2,1,2))   

#trend : constant를 가지고 있는지, c - constant / nc - no constant
#disp : 수렴 정보를 나타냄
model_fit = model.fit(trend='c', full_output=True, disp=True)
print(model_fit.summary())

statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX. These will be removed after the 0.12 release.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.

removed, use:


  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


                             ARIMA Model Results                              
Dep. Variable:                    D.y   No. Observations:                   41
Model:                 ARIMA(2, 1, 2)   Log Likelihood                -574.904
Method:                       css-mle   S.D. of innovations         280013.629
Date:                Sat, 11 Sep 2021   AIC                           1161.808
Time:                        02:23:07   BIC                           1172.090
Sample:                             1   HQIC                          1165.552
                                                                              
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.798e+04   7200.997     -2.497      0.013   -3.21e+04   -3867.867
ar.L1.D.y     -0.2188      0.157     -1.391      0.164      -0.527       0.090
ar.L2.D.y      0.2828      0.157      1.800      0.0

statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX. These will be removed after the 0.12 release.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.

removed, use:


  return np.sqrt(np.diag(-inv(hess)))


# 예측하기

In [14]:
# 2단위 이후의 예측결과
fore = model_fit.forecast(steps=2)
print(fore)

(array([2372989.00339452, 2382794.0187213 ]), array([280013.62944316, 355320.63264383]), array([[1824172.37450558, 2921805.63228345],
       [1686378.3757754 , 3079209.66166719]]))
