# Forecaster

## Libraries, options, paths, and functions

In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor

# set options
pd.set_option('display.float_format', '{:,.2f}'.format)


## Loading data

In [2]:
# load mevs_ca data
with pd.HDFStore('./stats_can_data/mevs_ca.hdf5') as save:
    mevs_ca = save['mevs_ca']
    metadata = save.get_storer('mevs_ca').attrs.metadata

# quick look at raw data

mevs_ca.info()
display(mevs_ca.tail())
display(mevs_ca.describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 0 to 242
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   ref_date  243 non-null    datetime64[ns]
 1   geo       243 non-null    object        
 2   pop_ca    243 non-null    int64         
 3   gdp_ca    243 non-null    int64         
 4   year      243 non-null    int64         
 5   quarter   243 non-null    int64         
 6   gdp_ca_g  242 non-null    float64       
 7   pop_ca_g  242 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(4), object(1)
memory usage: 17.1+ KB


Unnamed: 0,ref_date,geo,pop_ca,gdp_ca,year,quarter,gdp_ca_g,pop_ca_g
238,2020-07-01,Canada,38037204,2014029,2020,3,41.08,0.39
239,2020-10-01,Canada,38033014,2058185,2020,4,9.06,-0.04
240,2021-01-01,Canada,38068872,2082980,2021,1,4.91,0.38
241,2021-04-01,Canada,38153211,2066339,2021,2,-3.16,0.89
242,2021-07-01,Canada,38246108,2093927,2021,3,5.45,0.98


Unnamed: 0,pop_ca,gdp_ca,year,quarter,gdp_ca_g,pop_ca_g
count,243.0,243.0,243.0,243.0,242.0,242.0
mean,27937488.21,1172230.77,1990.88,2.49,3.17,1.25
std,5647744.08,526778.45,17.57,1.12,5.04,0.63
min,18092000.0,342687.0,1961.0,1.0,-37.38,-0.04
25%,23337021.0,734120.0,1976.0,1.5,1.22,0.93
50%,27928837.0,1086810.0,1991.0,2.0,3.13,1.22
75%,32520738.5,1682737.0,2006.0,3.0,5.21,1.5
max,38246108.0,2123207.0,2021.0,4.0,41.08,8.41


## Training, validation, and testing datasets

In [3]:
# split data into training, validation, and testing samples

features = mevs_ca[1:].drop(['ref_date','geo','pop_ca','gdp_ca','gdp_ca_g'], axis=1)
target = mevs_ca[1:]['gdp_ca_g']

features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.4, random_state=12345) 

features_valid, features_test, target_valid, target_test = train_test_split(
    features_valid, target_valid, test_size=0.5, random_state=12345)

print('Dimensions of features samples:')
print('Raw Data:',features.shape)
print('Training: {:.3%}'.format(len(features_train)/len(features)))
print('Validation: {:.3%}'.format(len(features_valid)/len(features)))
print('Testing: {:.3%}'.format(len(features_test)/len(features)))

print('\nDimensions of target samples:')
print('Raw Data:',target.shape)
print('Training: {:.3%}'.format(len(target_train)/len(features)))
print('Validation: {:.3%}'.format(len(target_valid)/len(features)))
print('Testing: {:.3%}'.format(len(target_test)/len(features)))

Dimensions of features samples:
Raw Data: (242, 3)
Training: 59.917%
Validation: 19.835%
Testing: 20.248%

Dimensions of target samples:
Raw Data: (242,)
Training: 59.917%
Validation: 19.835%
Testing: 20.248%


## Model selection
### Decision Tree Regression

In [4]:
# decision tree regressor

# define starting values and results table
dt_best_model = None
dt_best_depth = 0
dt_best_rmse = 1000


results=pd.DataFrame(columns=['Model','RMSE','Depth','Trees'])

for depth in range(1, 16):
    # fit the model
    model = DecisionTreeRegressor(random_state=12345, max_depth=depth)
    model.fit(features_train, target_train)
    
    # calculate accuracy of predictions
    predictions_valid = model.predict(features_valid)
    rmse = mean_squared_error(target_valid, predictions_valid) **0.5

    # save results
    new_result = pd.Series(['Decision Tree Regressor',rmse,depth,0], index=results.columns)
    results = results.append(new_result, ignore_index=True)
    
    if rmse < dt_best_rmse:
        dt_best_model = model
        dt_best_depth = depth
        dt_best_rmse = rmse

print('Top-5 best models:')
display(results.sort_values(by='RMSE').head())
print("Best decision tree model's RMSE: ",dt_best_rmse,", Depth: ", dt_best_depth,'.', sep='')

Top-5 best models:


  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)


Unnamed: 0,Model,RMSE,Depth,Trees
10,Decision Tree Regressor,6.72,11,0
0,Decision Tree Regressor,9.08,1,0
5,Decision Tree Regressor,11.78,6,0
4,Decision Tree Regressor,11.82,5,0
1,Decision Tree Regressor,11.84,2,0


Best decision tree model's RMSE: 6.715689152513674, Depth: 11.


### Random Forest Regression

In [5]:
# random forest regressor

rf_best_model = None
rf_best_depth = 0
rf_best_rmse = 1000
rf_best_trees = 0

for est in range(1, 20):
    for depth in range(1,20):
        model = RandomForestRegressor(random_state=12345, max_depth=depth, n_estimators=est)
        model.fit(features_train, target_train)
    
        predictions_valid = model.predict(features_valid)
        rmse = mean_squared_error(target_valid, predictions_valid) **0.5
    
        new_result = pd.Series(['Random Forest Regressor',rmse,depth,est], index=results.columns)
        results = results.append(new_result, ignore_index=True)
    
        if rmse < rf_best_rmse:
            rf_best_model = model
            rf_best_depth = depth
            rf_best_trees = est
            rf_best_rmse = rmse
    
print('Top-5 best models:')
display(results.sort_values(by='RMSE').head())
print("Best random forest model's RMSE: ",rf_best_rmse,", Trees: ",rf_best_trees, ", Depth: ", rf_best_depth,'.', sep='')

  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = results.append(new_result, ignore_index=True)
  results = re

Top-5 best models:


  results = results.append(new_result, ignore_index=True)


Unnamed: 0,Model,RMSE,Depth,Trees
16,Random Forest Regressor,6.26,2,1
17,Random Forest Regressor,6.55,3,1
21,Random Forest Regressor,6.7,7,1
10,Decision Tree Regressor,6.72,11,0
19,Random Forest Regressor,6.93,5,1


Best random forest model's RMSE: 6.264897663237037, Trees: 1, Depth: 2.


### Linear Regression

In [6]:
# linear regression

reg_model = LinearRegression()
reg_model.fit(features_train, target_train)
predictions_valid = reg_model.predict(features_valid)

rmse = mean_squared_error(target_valid, predictions_valid) **0.5

new_result = pd.Series(['Linear Regression',rmse,0,0], index=results.columns)
results = results.append(new_result, ignore_index=True)

print("Linear regression model's RMSE: ", rmse)

Linear regression model's RMSE:  6.7320435989244745


  results = results.append(new_result, ignore_index=True)


## Model testing

In [7]:
# compare model performance on test data

print('Model performance on test data:\n')
print('RMSE:')

predictions_test = dt_best_model.predict(features_test)
print('Decision Tree: ', mean_squared_error(target_test, predictions_test) **0.5)

predictions_test = rf_best_model.predict(features_test)
print('Random Forest: ', mean_squared_error(target_test, predictions_test) **0.5)

predictions_test = reg_model.predict(features_test)
print('Linear Regression: ', mean_squared_error(target_test, predictions_test) **0.5)

Model performance on test data:

RMSE:
Decision Tree:  9.852571128283012
Random Forest:  9.32232580601011
Linear Regression:  3.412113571043459


## Results

In [8]:
display(results.sort_values(by='RMSE').head(10))

Unnamed: 0,Model,RMSE,Depth,Trees
16,Random Forest Regressor,6.26,2,1
17,Random Forest Regressor,6.55,3,1
21,Random Forest Regressor,6.7,7,1
10,Decision Tree Regressor,6.72,11,0
376,Linear Regression,6.73,0,0
19,Random Forest Regressor,6.93,5,1
24,Random Forest Regressor,6.99,10,1
220,Random Forest Regressor,7.76,16,11
320,Random Forest Regressor,7.88,2,17
201,Random Forest Regressor,7.92,16,10


## Sanity check

In [9]:
dummy_clf = DummyRegressor(strategy="mean")
dummy_clf.fit(features_test, target_test)

print('RMSE:')
print('Dummy model:', mean_squared_error(target_test, dummy_clf.predict(features_test)) **0.5)


RMSE:
Dummy model: 3.3867639476555804
