In [None]:
"""
The general method for the ARIMA implementation comes from:

https://towardsdatascience.com/machine-learning-part-19-time-series-and-autoregressive-integrated-moving-average-model-arima-c1005347b0d7
https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/
"""

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.arima.model import ARIMAResults
from sklearn import metrics
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from pmdarima import auto_arima
from sklearn.model_selection import TimeSeriesSplit

#Imported to ignore warnings from ARIMA
import warnings
warnings.filterwarnings("ignore")

#10 import statements

In [None]:
# Raise pandas' display limits (rows, columns, console width) for this notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

#3 lines written by me

In [None]:
"""
Helper for loading and formatting the data, written by me
"""

def read_data(pathname, indicatorcode):
    """Load the rows for one indicator from an Excel workbook and transpose them.

    Parameters
    ----------
    pathname : str or path-like
        Location of the workbook; passed straight to ``pd.read_excel``.
    indicatorcode : str
        Value to match against the ``'Indicator Code'`` column.

    Returns
    -------
    pandas.DataFrame
        The matching row(s) with the four descriptive columns removed and the
        axes swapped: every remaining source column becomes a row, and every
        matching source row becomes a column.
    """
    workbook = pd.read_excel(pathname)

    indicator_rows = workbook.loc[workbook['Indicator Code'] == indicatorcode]

    # Dropping these columns as they are not needed for the forecast
    values_only = indicator_rows.drop(
        columns=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
    )

    # DataFrame.swapaxes is deprecated in recent pandas releases; transpose()
    # performs the same index <-> columns swap.
    return values_only.transpose()

#6 lines written by me

In [None]:
"""
Method for this block of code is refactored from:

https://towardsdatascience.com/machine-learning-part-19-time-series-and-autoregressive-integrated-moving-average-model-arima-c1005347b0d7
"""

def stationarityTest(data, window=10):
    """Plot rolling statistics for ``data`` and print an Augmented Dickey-Fuller test.

    Parameters
    ----------
    data : pandas object
        Series to inspect; it must support ``rolling`` and ``dropna``.
    window : int, default 10
        Number of observations in each rolling mean / rolling std window.

    Returns
    -------
    None
        The plot is drawn and the test results are printed.
    """
    # Find out if the data is stationary or not
    # Using Rolling Statistics (Rolling Mean and Rolling Std)
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()

    # Draw on a dedicated figure so repeated calls never overlay each other
    # on the implicit pyplot figure.
    _, ax = plt.subplots()
    ax.plot(data, color='blue', label='original')
    ax.plot(rolling_mean, color='red', label='Rolling Mean')
    ax.plot(rolling_std, color='green', label='Rolling Std')
    ax.legend(loc='best')
    ax.set_title('RM vs RSTD vs ORI')

    # Augmented Dickey-Fuller Test (missing values are dropped first)
    result = adfuller(data.dropna())
    print('ADF Statistic: ', result[0])
    print('p-value: ', result[1])
    print('Critical Values:')
    # result[4] maps each significance level to its critical value
    for key, value in result[4].items():
        print('\t{}: {}'.format(key, value))
        
#14 lines from guide

In [None]:
"""
Order Differencing function inspired and adapted from:

https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/
"""

def make_stationary(data):
    """Return the first-order difference of ``data`` with missing rows removed.

    Parameters
    ----------
    data : pandas object
        Series to difference; it must support ``diff``.

    Returns
    -------
    pandas object
        ``data.diff()`` without the rows that contain missing values (the
        first row has no predecessor, so its difference is always missing).
        The input itself is left untouched.
    """
    # 1st Order Differencing. dropna() returns the cleaned copy directly;
    # the in-place variant returns None, so its result must not be assigned.
    return data.diff().dropna()

In [None]:
"""
line 9,10,15 (MDA) adapted and modified from: https://gist.github.com/bshishov/5dc237f59f019b26145648e2124ca1c9

line 12 (MAPE) adapted from: https://www.statology.org/mape-python/
"""

def performance_metrics(y_test, y_pred):
    """Print forecast-accuracy metrics for actual vs. predicted values.

    Parameters
    ----------
    y_test : array-like
        Actual values, in time order.
    y_pred : array-like
        Predicted values, one per actual value and in the same order.

    Returns
    -------
    None
        RMSE, MAPE, MAE, R2, scatter index, MDA and the mean of the actual
        values are printed.

    Notes
    -----
    MAPE divides by the actual values, so a zero in ``y_test`` yields an
    infinite or undefined result.
    """
    # Flatten both inputs to 1-D. A column vector of shape (n, 1) combined
    # with a flat vector of shape (n,) would otherwise broadcast to an n x n
    # matrix in the hand-written MAPE and MDA expressions below.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()

    # Direction (up / flat / down) of each step, for mean directional accuracy
    actual_direction = np.sign(actual[1:] - actual[:-1])
    predicted_direction = np.sign(predicted[1:] - predicted[:-1])

    rmse = np.sqrt(metrics.mean_squared_error(actual, predicted))
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    mae = metrics.mean_absolute_error(actual, predicted)
    r2 = metrics.r2_score(actual, predicted)
    # Share of steps where the forecast moved in the same direction as the data
    mda = np.mean((actual_direction == predicted_direction).astype(int))
    mean = np.mean(actual)
    # Scatter index: RMSE expressed as a percentage of the mean actual value
    si = (rmse / mean) * 100

    print("RMSE: ", rmse)
    print("MAPE: ", mape)
    print("MAE: ", mae)
    print("R2: ", r2)
    print("Scatter Index: ", si)
    print("MDA: ", mda)
    print("Mean of actual: ", mean)
    
#16 lines: 9 lines written by me, 4 from documentation, 3 modified

In [None]:
# NOTE(review): machine-specific absolute path; point this at your own copy
# of the workbook before running the notebook elsewhere.
DATASET_PATH = '/Users/farhanhabibie/Desktop/Farhan Thesis Code /UG-Project-Farhan/Indonesia Macro Dataset.xlsx'
INDICATOR_CODE = 'SL.UEM.TOTL.FE.ZS'

# Load the indicator, discard rows with missing values, then keep a
# first-differenced copy alongside the original series.
data = read_data(DATASET_PATH, INDICATOR_CODE).dropna()
data_first_ord = make_stationary(data)
#1 line written by me

In [None]:
# Rolling-statistics plot and ADF test on the undifferenced series.
stationarityTest(data)

In [None]:
# NOTE(review): data_first_ord is already first-differenced by make_stationary(),
# so the extra .diff() here tests the second-order difference - confirm that is intended.
stationarityTest(data_first_ord.diff())

In [None]:
#Find AR order(p), differencing order(d), MA(q)

#Find AR order (p) using a PACF plot of the first-differenced series, only considering the first 13 lags
plot_pacf(data.diff().dropna(), lags=13)

#Find MA order (q) using an ACF plot of the same first-differenced series
plot_acf(data.diff().dropna())

#2 lines from documentation

In [None]:
"""
adapted and modified from: 
https://towardsdatascience.com/time-series-from-scratch-train-test-splits-and-evaluation-metrics-4fd654de1b3
"""

# Chronological split: the first 80% of rows train the model, the rest are held out.
split_at = int(len(data) * 0.8)
train, test = data[:split_at], data[split_at:]

In [None]:
"""
reused and modified from: 

https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/

However, these are just function calls to ARIMA model in statsmodels.
"""

# Fit an ARIMA model of order (0, 0, 0) on the training rows only.
arima_spec = ARIMA(train, order=(0, 0, 0))
model = arima_spec.fit()

# Request one prediction per held-out row, addressed by the positions that
# immediately follow the training rows.
forecast_start = len(train)
forecast_end = forecast_start + len(test) - 1
predict = model.predict(start=forecast_start, end=forecast_end)
#3 lines from guide but modified

In [None]:
"""
Code written by me, however (line 9) was also used in a previous Introduction to AI Course I partook in,
at City University of London

Link to repository provided: 
https://github.com/LabiKSV/intro-to-ai-farhan-labi/blob/main/Linear%20Regression%20Label%20Encoder.ipynb
"""

# Line up the actual and predicted values on the test rows' index labels.
df_compare = pd.DataFrame(
    {'Actual': test.values.flatten(), 'Predicted': predict.values},
    index=test.index.values.tolist(),
)
# NOTE(review): the title says "(M)" while the indicator code loaded earlier
# contains ".FE." - confirm the label matches the series being forecast.
df_compare.plot(title='Unemployment (M) Actual vs Predicted')
# test.values is 2-D (n, 1) while predict.values is 1-D (n,); flatten the
# actuals, as done for the frame above, so element-wise metrics compare
# matching observations instead of broadcasting to an n x n matrix.
performance_metrics(test.values.flatten(), predict.values)
#4 lines written by me

In [None]:
# Mean of the predicted values for the held-out rows.
predict.mean()

In [None]:
# Column mean of the full series (training and test rows together).
data.mean()

In [None]:
"""
Idea for using pmdarima attributed to the video on youtube:

https://www.youtube.com/watch?v=8FCDpFhd1zk&ab_channel=NachiketaHebbar
"""

# Let pmdarima search for an ARIMA order on the training rows, printing each
# candidate it tries (trace=True), with the differencing order pinned to d=0.
# NOTE(review): with d fixed, test='adf' presumably has no effect (pmdarima
# appears to use the unit-root test only to choose d), and n_fits appears to
# apply only to random search - confirm against the pmdarima docs.
autoarima = auto_arima(train, 
                       trace=True, 
                       d=0, 
                       suppress_warnings=True, n_fits=50, test='adf')

# Display the fitted model's summary table as the cell output.
autoarima.summary()

#2 lines from youtube

In [None]:
"""
64 Lines of code
"""