In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn import metrics
import matplotlib as plt
import os

In [2]:
# Widen pandas' display limits so large frames are shown with more rows,
# more columns and a wider line before being truncated or wrapped.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
"""
Function that loads and prepares the dataframe. 

Converts the data into a time-series-readable format.
"""
def initialFormat(filepath, indicatorcode):
    """
    Load an Excel sheet and reshape one indicator into a column-oriented frame.

    The sheet is read with ``pd.read_excel``, filtered to the rows whose
    'Indicator Code' equals ``indicatorcode``, stripped of its descriptive
    columns, and transposed so that the remaining column labels become the
    index. The first resulting column is renamed to "values".

    Parameters
    ----------
    filepath :
        Passed straight to ``pd.read_excel``.
    indicatorcode :
        Value compared against the 'Indicator Code' column to select rows.

    Returns
    -------
    pandas.DataFrame
        The transposed frame. If exactly one row matched, it has a single
        column named "values". If several rows matched, only the first
        column is renamed; the others keep their original row labels. If no
        row matched, the frame has no columns.

    Raises
    ------
    KeyError
        If the sheet lacks 'Indicator Code' or any of the dropped columns.
    """
    # Reads the file and creates a dataframe from it
    df = pd.read_excel(filepath)

    # Choose what to forecast using indicator code
    df_icode = df.loc[df['Indicator Code'] == indicatorcode]

    # Dropping these columns as they are not needed for the forecast
    df_icode = df_icode.drop(columns=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'])

    # Swap rows and columns so the former column labels become the index.
    # transpose() replaces the deprecated DataFrame.swapaxes("index", "columns").
    df_formatted = df_icode.transpose()

    # Renaming the first column to 'values' to make reference easier.
    # Guarded so an empty selection (no matching rows) is returned as-is.
    if len(df_formatted.columns) > 0:
        first_column = df_formatted.columns[0]
        df_formatted = df_formatted.rename(columns={first_column: "values"})

    # Previously unreachable: the old loop returned before this line ran.
    print("Data has been formatted. Number of observations are: ", len(df_formatted))

    return df_formatted

In [4]:
"""
The first thing we need to do is convert the time series data into a usable
supervised learning format.

The first method tried is called the lag method for One-Step Univariate Forecasting

Tutorial for this provided by machinelearningmastery.com
"""

def laggedMethod(data, n_in=1, n_out=1, dropnan=True):
    """
    Reframe a series as a supervised-learning array using lagged copies.

    ``data`` is wrapped in a DataFrame, then shifted copies are concatenated
    column-wise: first the lags ``t-n_in, ..., t-1`` (oldest first), then
    ``t, t+1, ..., t+n_out-1``.

    Parameters
    ----------
    data :
        Anything ``pd.DataFrame`` accepts (list, 1-D or 2-D array, Series,
        DataFrame).
    n_in : int, default 1
        Number of lagged steps to include as inputs.
    n_out : int, default 1
        Number of steps (starting at the current one) to include as outputs.
    dropnan : bool, default True
        If true, drop rows containing NaN. Shifting introduces NaN at the
        edges, so this removes the incomplete leading and trailing rows.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per usable time step.
    """
    # No shape inspection here: DataFrame() already copes with lists and
    # 1-D inputs, on which data.shape[1] would raise.
    df_super = pd.DataFrame(data)

    # Input sequence: t-n_in, ..., t-1
    lagged_inputs = [df_super.shift(lag) for lag in range(n_in, 0, -1)]
    # Forecast sequence: t, t+1, ..., t+n_out-1
    forecast_steps = [df_super.shift(-step) for step in range(n_out)]

    agg = pd.concat(lagged_inputs + forecast_steps, axis=1)

    if dropnan:
        agg = agg.dropna()
    return agg.values
