In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings

# 파일 불러와서 열이름 바꾸기
korean, long_term_frgn, short_term_frgn, resident, card_cnt, card_amt, waste_cnt
(이거 예시는 card_amt임)

In [2]:
df = pd.read_csv("files\\data_preprocessing\\long_term_frgn_preprocessing.csv", encoding='cp949')

# 시계열 피처단위로 변경
df['base_date'] = pd.to_datetime(df['base_date'])

# base_date를 index로 설정
df.index = df['base_date']
df.columns = ['base_date', 'emd_cd', 'long_term_frgn']  
df

Unnamed: 0_level_0,base_date,emd_cd,long_term_frgn
base_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-31,2018-01-31,50110250,427986.3363
2018-02-28,2018-02-28,50110250,427240.8119
2018-03-31,2018-03-31,50110250,513638.2631
2018-04-30,2018-04-30,50110250,474601.3510
2018-05-31,2018-05-31,50110250,395262.5483
...,...,...,...
2021-02-28,2021-02-28,50130620,109325.4866
2021-03-31,2021-03-31,50130620,118258.8891
2021-04-30,2021-04-30,50130620,129227.7480
2021-05-31,2021-05-31,50130620,144213.2650


In [3]:
# 필요 없는 열 제거
data = df.drop(['base_date'], 1)    
data

Unnamed: 0_level_0,emd_cd,long_term_frgn
base_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-31,50110250,427986.3363
2018-02-28,50110250,427240.8119
2018-03-31,50110250,513638.2631
2018-04-30,50110250,474601.3510
2018-05-31,50110250,395262.5483
...,...,...
2021-02-28,50130620,109325.4866
2021-03-31,50130620,118258.8891
2021-04-30,50130620,129227.7480
2021-05-31,50130620,144213.2650


In [4]:
# 한 행정동의 행만 추출
sub_area = data['emd_cd'] == 50130570

# base_date, card_amt 만 갖는 df 생성
sub_df = data[sub_area]
sub_df = sub_df.drop('emd_cd', 1)
sub_df

Unnamed: 0_level_0,long_term_frgn
base_date,Unnamed: 1_level_1
2018-01-31,118461.4013
2018-02-28,106045.5585
2018-03-31,100251.6625
2018-04-30,102557.3566
2018-05-31,93698.7504
2018-06-30,89662.5931
2018-07-31,93663.9496
2018-08-31,107978.9339
2018-09-30,106936.871
2018-10-31,96278.5122


In [5]:
# series 형으로 변환
sub_df.to_dict('series')

{'long_term_frgn': base_date
 2018-01-31    118461.4013
 2018-02-28    106045.5585
 2018-03-31    100251.6625
 2018-04-30    102557.3566
 2018-05-31     93698.7504
 2018-06-30     89662.5931
 2018-07-31     93663.9496
 2018-08-31    107978.9339
 2018-09-30    106936.8710
 2018-10-31     96278.5122
 2018-11-30    105941.0045
 2018-12-31    106990.0537
 2019-01-31    139284.6000
 2019-02-28    115583.1014
 2019-03-31    136098.6202
 2019-04-30    124571.5270
 2019-05-31    128867.6020
 2019-06-30    126843.4737
 2019-07-31    129494.4424
 2019-08-31    131648.6642
 2019-09-30    114231.6733
 2019-10-31    114972.3618
 2019-11-30    109860.2387
 2019-12-31    108230.5669
 2020-01-31    113784.8773
 2020-02-29    100887.3583
 2020-03-31    154885.5721
 2020-04-30    117226.2986
 2020-05-31    121199.3372
 2020-06-30    134622.5207
 2020-07-31    124926.6413
 2020-08-31    127780.6499
 2020-09-30    105939.0085
 2020-10-31    131605.9413
 2020-11-30    135700.4343
 2020-12-31    145291.3107

# ARIMA

In [6]:
# ARIMA에 적합한 float로 바꾸는 과정
sub_df_float = sub_df[:].astype(np.float)

In [7]:
# ADF 검정 - 정상성 확인하는 검정 (귀무 : 정상성 만족 x, 대립 : 정상성 만족)
from statsmodels.tsa.stattools import adfuller

result = adfuller(sub_df)
print('ADF Statistic : %f' % result[0])
print('p-value : %f' % result[1])
print('Critical Values :')
for key, value in result[4].items():
    print('\t%s: %3f' % (key, value))

ADF Statistic : -1.993480
p-value : 0.289439
Critical Values :
	1%: -3.605565
	5%: -2.937069
	10%: -2.606986


In [8]:
# 차분
diff_1 = sub_df_float.diff(periods=1).iloc[1:]

# ADF 검정 - 정상성 확인하는 검정 (귀무 : 정상성 만족 x, 대립 : 정상성 만족)
result = adfuller(diff_1)
print('ADF Statistic : %f' % result[0])
print('p-value : %f' % result[1])
print('Critical Values :')
for key, value in result[4].items():
    print('\t%s: %3f' % (key, value))

ADF Statistic : -11.062702
p-value : 0.000000
Critical Values :
	1%: -3.605565
	5%: -2.937069
	10%: -2.606986


In [9]:
from pmdarima.arima import auto_arima

model_arima= auto_arima(sub_df,trace=True, error_action='ignore',suppress_warnings=True,stepwise=False,seasonal=True)

model_arima.fit(sub_df)

 ARIMA(0,1,0)(0,0,0)[1] intercept   : AIC=920.491, Time=0.01 sec
 ARIMA(0,1,1)(0,0,0)[1] intercept   : AIC=913.175, Time=0.12 sec
 ARIMA(0,1,2)(0,0,0)[1] intercept   : AIC=915.430, Time=0.08 sec
 ARIMA(0,1,3)(0,0,0)[1] intercept   : AIC=917.321, Time=0.07 sec
 ARIMA(0,1,4)(0,0,0)[1] intercept   : AIC=918.918, Time=0.08 sec
 ARIMA(0,1,5)(0,0,0)[1] intercept   : AIC=923.716, Time=0.10 sec
 ARIMA(1,1,0)(0,0,0)[1] intercept   : AIC=913.894, Time=0.03 sec
 ARIMA(1,1,1)(0,0,0)[1] intercept   : AIC=915.200, Time=0.06 sec
 ARIMA(1,1,2)(0,0,0)[1] intercept   : AIC=inf, Time=0.35 sec
 ARIMA(1,1,3)(0,0,0)[1] intercept   : AIC=918.970, Time=0.17 sec
 ARIMA(1,1,4)(0,0,0)[1] intercept   : AIC=920.321, Time=0.15 sec
 ARIMA(2,1,0)(0,0,0)[1] intercept   : AIC=915.401, Time=0.06 sec
 ARIMA(2,1,1)(0,0,0)[1] intercept   : AIC=916.520, Time=0.12 sec
 ARIMA(2,1,2)(0,0,0)[1] intercept   : AIC=918.384, Time=0.28 sec
 ARIMA(2,1,3)(0,0,0)[1] intercept   : AIC=910.132, Time=0.47 sec
 ARIMA(3,1,0)(0,0,0)[1] inter

ARIMA(order=(2, 1, 3), scoring_args={}, seasonal_order=(0, 0, 0, 1),

In [12]:
# ARIMA 모델 
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm

# (AR=2, 차분=1, MA=2) 파라미터로 ARIMA 모델을 학습합니다.
model = ARIMA(sub_df.long_term_frgn.values, order=(0,1,1))

#trend : constant를 가지고 있는지, c - constant / nc - no constant
#disp : 수렴 정보를 나타냄
model_fit = model.fit(trend='c', full_output=True, disp=True)
print(model_fit.summary())

statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX. These will be removed after the 0.12 release.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.

removed, use:




                             ARIMA Model Results                              
Dep. Variable:                    D.y   No. Observations:                   41
Model:                 ARIMA(0, 1, 1)   Log Likelihood                -452.220
Method:                       css-mle   S.D. of innovations          14853.256
Date:                Sat, 11 Sep 2021   AIC                            910.439
Time:                        04:43:06   BIC                            915.580
Sample:                             1   HQIC                           912.311
                                                                              
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        454.5699   1066.705      0.426      0.670   -1636.134    2545.274
ma.L1.D.y     -0.5720      0.187     -3.062      0.002      -0.938      -0.206
                                    Roots           

# 예측하기

In [13]:
# 2단위 이후의 예측결과
fore = model_fit.forecast(steps=2)
print(fore)

(array([127692.71851035, 128147.28846026]), array([14853.25648105, 16156.43876799]), array([[ 98580.87075436, 156804.56626634],
       [ 96481.25035658, 159813.32656395]]))
