In [1]:
# Import libraries 
import os
import pandas as pd
import numpy as np
import json

import mlflow

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce

from datetime import datetime
from hts import HTSRegressor
import hts.functions
import collections
from hts.hierarchy import HierarchyTree
from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter("ignore")

# settings
plt.style.use('seaborn')
plt.rcParams["figure.figsize"] = (20, 8)

## Utility functions 

In [2]:
# function to fix the ags 
def fix_ags5(x):
    if len((str(x))) == 4:
        return '0' + str(x)
    else: 
        return x

In [3]:
# Function to add the column to the main data
def add_column_to_main_data(data, cluster_data, col_name):
    
    cluster_data['ags5'] = cluster_data['ags5'].apply(fix_ags5)
    
    # Get the cluster and ags 5 and set ags 5 as index 
    cluster_info = cluster_data.set_index('ags5').to_dict()[col_name]
    
    data['cluster'] = '0'
    data['cluster'] = data['ags5'].map(cluster_info)

    # check if the cluster have been allotted correctly 
    print("original cluster data")
    print(cluster_data[col_name].value_counts())
    print("New data")
    print(data.drop_duplicates(subset=['ags5'])['cluster'].value_counts())
    
    return data 

## Read the data

In [4]:
# Read the data
df = pd.read_csv('data_from_2010_to_2019_unemployment_rate.csv', converters={'ags2': str, 'ags5': str})
df.shape

(48120, 3)

In [5]:
df.head()

Unnamed: 0,ags5,date,unemployment_rate
0,1001,2010-01-31,13.7
1,1001,2010-02-28,14.1
2,1001,2010-03-31,13.6
3,1001,2010-04-30,13.1
4,1001,2010-05-31,12.5


In [6]:
df.tail()

Unnamed: 0,ags5,date,unemployment_rate
48115,16077,2019-08-31,7.0
48116,16077,2019-09-30,6.5
48117,16077,2019-10-31,6.5
48118,16077,2019-11-30,6.3
48119,16077,2019-12-31,6.5


## Data Preparation

In [7]:
# Add AGS 2
def get_ags2(x):
    return x[0:2]

df['ags2'] = df['ags5'].apply(get_ags2)
df.head()

Unnamed: 0,ags5,date,unemployment_rate,ags2
0,1001,2010-01-31,13.7,1
1,1001,2010-02-28,14.1,1
2,1001,2010-03-31,13.6,1
3,1001,2010-04-30,13.1,1
4,1001,2010-05-31,12.5,1


## ML Flow Experiment Setup 

In [8]:
def train_heirarchical_cluster_model(data, agregate_col, params, cluster_type="ags2"):
    
    ''' Generate a run name '''
    run_name = 'hierarchical_' + '_'.join(list(params.values())[0:2])
    
    with mlflow.start_run(run_name=run_name):
        
        # Create a list of kreis
        kreis_list = list(data['ags5'].unique())
        
        ''' Generate the dataset from the cluster with the ags and total summation '''
        print("Generating the hierarchical dataset...")
    
        # Filter Data by relevant columns 
        relevant_cols = ['ags5', 'unemployment_rate', 'date']
        relevant_cols.append(agregate_col)
        df = data[relevant_cols]
    
        # Get bottom level data - ags5
        df_ags5 = df.pivot(index="date", columns="ags5", values="unemployment_rate")
        
        # Get middle level data - aggregate_col
        df_middle = df.groupby(["date", agregate_col]).sum().reset_index(drop=False).pivot(index="date", 
                                                                           columns=agregate_col, 
                                                                           values="unemployment_rate")
        
        print(f"Got {df_middle.shape[1]} clusters..")
        
        # Get the top level data
        df_total = df.groupby("date")["unemployment_rate"].sum().to_frame().rename(columns={"unemployment_rate": "total"})
        
        # Join the data frames
        hdf = df_ags5.join(df_middle).join(df_total)

        # Set the index in datetime format
        hdf.index = pd.to_datetime(hdf.index)
        
        print("The dataset size is", hdf.shape)
        
        # Create the hierarchical cluster set 
        cluster_set = df.groupby(agregate_col)['ags5'].apply(lambda x: list(set(x))).to_dict()
        
        # Add total to the dictionary
        cluster_set['total'] = list(cluster_set.keys())
    
        ''' Model Fitting '''
        
        # Get the params
        model_type = params['model']
        rev_type = params['revision_method']
        time_steps = params['time_steps']
        internal_params = params['model_params']
        
        # Divide the data into train and test sets
        train_hdf = hdf.head(len(hdf) - time_steps)
        test_hdf = hdf.tail(time_steps)
        
        print(f"Fitting the model {model_type} with revision method {rev_type}.")
        
        # Fit the model 
        hts_model = HTSRegressor(model=model_type, revision_method=rev_type, n_jobs=0, **internal_params)
        hts_model.fit(train_hdf, cluster_set)
        
        print(f"Predicting for the next {time_steps} time steps.")
        
        # Get the predictions 
        preds = hts_model.predict(steps_ahead=time_steps)
        
        ''' Model Evaluation '''
        
        # Get the predicted vales 
        actual_preds = preds.tail(time_steps)
        
        # Check if there are negative values in the predictions 
        negative_pred = (actual_preds < 0).values.any()
        if negative_pred:
            print("There are negative values in the predictions.")
        else: 
            print("No negative values found in the predictions")
            
        # Check if the prediction and test have the same size
        assert actual_preds.shape[0] == test_hdf.shape[0]
        
        # Calculate the mse for each kreis
        total_mse = 0
        total_rmse = 0
        for kreis in kreis_list: 
            total_mse  += mean_squared_error(y_pred=actual_preds[kreis].values, y_true=test_hdf[kreis].values, squared=True)
            total_rmse += mean_squared_error(y_pred=actual_preds[kreis].values, y_true=test_hdf[kreis].values, squared=False)
#             print(total_mse, total_rmse)
        
        # Calculate average mse 
        average_mse = total_mse/len(kreis_list)
        average_rmse = total_rmse/len(kreis_list)
        print("The average error is:", average_mse)
        
        
        ''' Log experiment details in ML Flow '''
        # Log params
        mlflow.log_params(params)
        mlflow.log_params(internal_params)
        mlflow.log_param("Cluster Type", cluster_type)
        mlflow.log_param("Cluster Set", cluster_set)
        
        # Log metrics
        mlflow.log_metric("mse", average_mse)
        mlflow.log_metric("rmse", average_rmse)
        
        negative_pred = 1 if negative_pred else 0 
        mlflow.log_metric("negative_preds", negative_pred)        
        
        return preds
        
        
        

## Model Testing and Parameter tuning

In [65]:
# Set the params 
params = {
    'model':'sarimax',
    'revision_method':'BU',
    'time_steps': 12,
    'model_params': {
        'order': (2, 2, 2)
    }
}

# Run the function 
predictions = train_heirarchical_cluster_model(data=df,
                                 agregate_col='ags2', 
                                 params=params,
                                 cluster_type="ags2")

Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [02:24<00:00,  2.90it/s]
Fitting models:   8%|████▉                                                           | 32/418 [00:00<00:02, 163.91it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:02<00:00, 183.24it/s]


No negative values found in the predictions
The average error is: 0.7136935475057907


Revision types to the model.

* **AHP** — average historical proportions (top-down approach),
* **PHA** — proportions of historical averages (top-down approach),
* **FP** — the forecasted proportions (top-down approach),
* **OLS** — the optimal combination using OLS,
* **WLSS** - optimal combination using structurally weighted OLS,
* **WLSV** - optimal combination using variance-weighted OLS.

### Run revision iterations

In [63]:
# Run all combinations for models 
model_types = ['sarimax']
revisions = ['BU', 'AHP', 'PHA', 'FP', 'OLS', 'WLSS', 'WLSV']

# Set the params 
params = {
    'model':'sarimax',
    'revision_method':'BU',
    'time_steps': 12,
    'model_params': {
        'order': (2, 1, 2)
    }
}

for m in model_types:
    for r in revisions:
        print(f"Model: {m} and Revision: {r}")
        
        # Change params 
        params['model'] = m
        params['revision_method'] = r
        
        # Run the prediction model  
        predictions = train_heirarchical_cluster_model(data=df,
                                         agregate_col='ags2', 
                                         params=params)

Model: sarimax and Revision: BU
Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [01:59<00:00,  3.49it/s]
Fitting models:   5%|███▏                                                            | 21/418 [00:00<00:02, 188.54it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:02<00:00, 148.26it/s]


No negative values found in the predictions
The average error is: 0.14641924858639638
Model: sarimax and Revision: AHP
Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method AHP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [02:35<00:00,  2.69it/s]
Fitting models:   3%|█▉                                                              | 13/418 [00:00<00:03, 118.30it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:03<00:00, 134.95it/s]


No negative values found in the predictions
The average error is: 0.4131294997850292
Model: sarimax and Revision: PHA
Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method PHA.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [02:48<00:00,  2.49it/s]
Fitting models:   6%|███▌                                                            | 23/418 [00:00<00:01, 223.80it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:02<00:00, 196.82it/s]


No negative values found in the predictions
The average error is: 0.4528342008754722
Model: sarimax and Revision: FP
Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method FP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [02:31<00:00,  2.76it/s]
Fitting models:   5%|███                                                             | 20/418 [00:00<00:02, 195.49it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:02<00:00, 158.64it/s]


(16, 120)
No negative values found in the predictions
The average error is: 26.527558187863697
Model: sarimax and Revision: OLS
Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method OLS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [02:51<00:00,  2.44it/s]
Fitting models:  11%|██████▋                                                         | 44/418 [00:00<00:01, 215.32it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:02<00:00, 202.81it/s]


There are negative values in the predictions.
The average error is: 87.64391731569015
Model: sarimax and Revision: WLSS
Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method WLSS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [03:00<00:00,  2.32it/s]
Fitting models:   5%|██▉                                                             | 19/418 [00:00<00:02, 184.46it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:02<00:00, 162.99it/s]


There are negative values in the predictions.
The average error is: 4.829053266776892
Model: sarimax and Revision: WLSV
Generating the hierarchical dataset...
Got 16 clusters..
The dataset size is (120, 418)
Fitting the model sarimax with revision method WLSV.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [02:28<00:00,  2.82it/s]
Fitting models:  10%|██████▌                                                         | 43/418 [00:00<00:01, 214.60it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 230.29it/s]


There are negative values in the predictions.
The average error is: 108.12431593973784


## Models with custom clusters

In [9]:
# read the pca clusters by Amit 
cluster1 = pd.read_csv('df_final_stationary.csv', converters={'ags5':str, 'cluster':str})
cluster1_input = add_column_to_main_data(df, cluster1, 'cluster')

original cluster data
0    332
2     67
1      2
Name: cluster, dtype: int64
New data
0    332
2     67
1      2
Name: cluster, dtype: int64


In [15]:
# Set the params 
params = {
    'model':'sarimax',
    'revision_method':'BU',
    'time_steps': 12,
    'model_params': {
        'order': (1, 1, 2),
        'Seasonal_order': (1, 0, 2, 12),
        'trend': 't'
    }
}

# Run the function 
predictions = train_heirarchical_cluster_model(data=cluster1_input,
                                 agregate_col='cluster', 
                                 params=params,
                                 cluster_type="pca clusters ")

Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [02:24<00:00,  2.80it/s]
Fitting models:   2%|█▌                                                               | 10/405 [00:00<00:04, 96.29it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:03<00:00, 108.05it/s]


No negative values found in the predictions
The average error is: 0.33549409016305043


In [12]:
predictions

Unnamed: 0,total,0,1,2,12072,09778,15084,09279,05512,07336,...,05566,05114,05119,05954,04011,05513,08311,06413,06412,05162
2010-01-31,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2010-02-28,3215.728277,2696.158623,45.068760,697.441771,10.182352,21.029312,14.885475,25.263996,9.188157,10.441419,...,11.097031,11.507215,11.975326,41.476504,11.242676,14.017330,-2942.708510,7.599474,-582.215430,-883.080275
2010-03-31,3226.877372,2714.953146,45.084134,698.529681,10.182333,20.900868,14.878815,25.414209,9.306414,10.709941,...,11.349092,11.499709,11.972892,41.479253,11.348171,14.370025,-2974.766544,7.441088,-582.829469,-883.886904
2010-04-30,3089.899632,2582.290753,43.701448,683.159944,10.025183,19.837872,14.451293,24.037783,9.146713,10.266229,...,10.714932,11.417884,11.597963,39.995974,11.310055,14.047799,-2839.609260,7.906260,-559.510517,-848.899184
2010-05-31,2895.340039,2385.678012,42.269617,669.221364,9.191637,18.232475,13.826904,22.192561,8.941844,9.698692,...,10.170271,11.386776,11.628856,38.293974,11.139743,13.786346,-2638.018427,8.014611,-523.003082,-793.764342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-01-08,1870.549771,1518.064902,26.153477,467.398122,4.303086,11.741827,6.451676,14.673506,6.539645,6.311926,...,7.372984,9.432014,9.764160,24.361509,8.980641,11.723917,-1685.192373,6.428440,-334.227292,-506.557753
2019-01-09,1820.539344,1469.532761,25.830131,460.427221,4.304600,11.361734,6.454733,14.238765,6.402435,6.122587,...,7.205529,9.442798,9.607117,23.808067,8.982978,11.613547,-1642.455563,6.496742,-327.472522,-496.372498
2019-01-10,1758.174544,1410.236353,25.456686,445.793594,4.306123,10.858396,6.465435,13.723825,6.317931,5.962557,...,6.942239,9.456235,9.437773,23.111879,8.985992,11.506725,-1586.426256,6.585278,-320.279588,-485.526819
2019-01-11,1745.483458,1398.980761,25.405595,438.105374,4.306810,10.733410,6.470204,13.641405,6.372121,5.992824,...,6.846372,9.459666,9.426463,22.965973,8.986722,11.510666,-1572.964348,6.607098,-319.834288,-484.855292


In [90]:
# Run the code for hts

# Run all combinations for models 
model_types = ['sarimax']
revisions = ['BU', 'AHP', 'PHA', 'FP', 'OLS', 'WLSS', 'WLSV']

# Set the params 
params = {
    'model':'auto_arima',
    'revision_method':'BU',
    'time_steps': 12,
    'model_params': {
        'order': (2, 1, 2)
    }
}

for m in model_types:
    for r in revisions:
        print(f"Model: {m} and Revision: {r}")
        
        # Change params 
        params['model'] = m
        params['revision_method'] = r
        
        # Run the prediction model  
        predictions = train_heirarchical_cluster_model(data=cluster1_input,
                                         agregate_col='cluster', 
                                         params=params,
                                         cluster_type="clusters by Amit")

Model: sarimax and Revision: BU
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [01:59<00:00,  3.40it/s]
Fitting models:   9%|██████                                                          | 38/405 [00:00<00:01, 189.69it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 209.39it/s]


No negative values found in the predictions
The average error is: 0.14641924858639638
Model: sarimax and Revision: AHP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method AHP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [02:12<00:00,  3.07it/s]
Fitting models:   6%|████                                                            | 26/405 [00:00<00:01, 252.15it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 225.90it/s]


No negative values found in the predictions
The average error is: 0.4131294997850292
Model: sarimax and Revision: PHA
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method PHA.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [02:07<00:00,  3.19it/s]
Fitting models:   4%|██▌                                                             | 16/405 [00:00<00:02, 150.10it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 194.15it/s]


No negative values found in the predictions
The average error is: 0.4528342008754722
Model: sarimax and Revision: FP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method FP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [02:08<00:00,  3.16it/s]
Fitting models:   2%|█▍                                                                | 9/405 [00:00<00:04, 87.03it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 153.85it/s]


(3, 120)
No negative values found in the predictions
The average error is: 26.527558187863697
Model: sarimax and Revision: OLS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method OLS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [01:54<00:00,  3.53it/s]
Fitting models:   6%|███▉                                                            | 25/405 [00:00<00:01, 249.18it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 262.21it/s]


There are negative values in the predictions.
The average error is: 373.3821404417998
Model: sarimax and Revision: WLSS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method WLSS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [02:38<00:00,  2.56it/s]
Fitting models:   6%|███▋                                                            | 23/405 [00:00<00:01, 224.74it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 186.33it/s]


There are negative values in the predictions.
The average error is: 9.014238244241934
Model: sarimax and Revision: WLSV
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model sarimax with revision method WLSV.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [02:37<00:00,  2.57it/s]
Fitting models:   5%|███▎                                                            | 21/405 [00:00<00:01, 204.62it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 210.70it/s]


There are negative values in the predictions.
The average error is: 9721.030155019933


### Cluster 2: k-Modes Clusters

In [17]:
# Read the data 
cluster2 = pd.read_csv('kmodes3.csv', converters={'ags5':str, 'cluster':str})
print(cluster2.shape)
cluster2.head()

(401, 3)


Unnamed: 0,kreis,ags5,cluster
0,"Flensburg, Stadt",1001,1
1,"Kiel, Landeshauptstadt",1002,2
2,"Lübeck, Hansestadt",1003,2
3,"Neumünster, Stadt",1004,0
4,Dithmarschen,1051,1


In [18]:
cluster2_input = add_column_to_main_data(df, cluster2, 'cluster')

original cluster data
2    190
0    114
1     97
Name: cluster, dtype: int64
New data
2    190
0    114
1     97
Name: cluster, dtype: int64


In [20]:
# Run the code for hts

# Run all combinations for models 
model_types = ['auto_arima']
revisions = ['BU', 'AHP', 'PHA', 'FP', 'OLS', 'WLSS', 'WLSV']

# Set the params 
params = {
    'model':'auto_arima',
    'revision_method':'BU',
    'time_steps': 12,
    'model_params': {
        'order': (2, 1, 2)
    }
}

for m in model_types:
    for r in revisions:
        print(f"Model: {m} and Revision: {r}")
        
        # Change params 
        params['model'] = m
        params['revision_method'] = r
        
        # Run the prediction model  
        predictions = train_heirarchical_cluster_model(data=cluster2_input,
                                         agregate_col='cluster', 
                                         params=params,
                                         cluster_type="clusters by Cinny kModes")

Model: auto_arima and Revision: BU
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [30:14<00:00,  4.48s/it]
Fitting models:   2%|█▌                                                               | 10/405 [00:00<00:04, 88.86it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:03<00:00, 119.27it/s]


No negative values found in the predictions
The average error is: 0.1707600377188778
Model: auto_arima and Revision: AHP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method AHP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [30:27<00:00,  4.51s/it]
Fitting models:   4%|██▋                                                             | 17/405 [00:00<00:02, 164.42it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 181.77it/s]


No negative values found in the predictions
The average error is: 0.41566813827985644
Model: auto_arima and Revision: PHA
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method PHA.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [45:34<00:00,  6.75s/it]
Fitting models:   2%|█▏                                                                | 7/405 [00:00<00:05, 67.37it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [00:04<00:00, 90.95it/s]


No negative values found in the predictions
The average error is: 0.45455251183218054
Model: auto_arima and Revision: FP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method FP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [46:48<00:00,  6.93s/it]
Fitting models:   1%|▉                                                                 | 6/405 [00:00<00:07, 53.74it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [00:04<00:00, 88.66it/s]


(3, 120)
There are negative values in the predictions.
The average error is: 162543.5457608644
Model: auto_arima and Revision: OLS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method OLS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [25:29<00:00,  3.78s/it]
Fitting models:  11%|██████▊                                                         | 43/405 [00:00<00:01, 210.30it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 220.91it/s]


There are negative values in the predictions.
The average error is: 143.8252612693931
Model: auto_arima and Revision: WLSS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method WLSS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [25:09<00:00,  3.73s/it]
Fitting models:   3%|██▏                                                             | 14/405 [00:00<00:02, 138.48it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 163.31it/s]


There are negative values in the predictions.
The average error is: 1.9039785850359117
Model: auto_arima and Revision: WLSV
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method WLSV.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [23:08<00:00,  3.43s/it]
Fitting models:   3%|██                                                              | 13/405 [00:00<00:03, 127.93it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 154.39it/s]


There are negative values in the predictions.
The average error is: 2344.1947095309984


## Cluster 3 - tsne

In [21]:
# Read the tsne data
cluster3 = pd.read_csv('df_tsne_cluster.csv', converters={'ags5':str, 'cluster':str})
print(cluster3.shape)
cluster3.head()

(401, 179)


Unnamed: 0,data_index,cluster,kreis,ags5,ags2,supermarkets_population,supermarkets_average_distance,public_transport_availability,average_distance_bus_stop,average_distance_train_station,...,room_type_location,district_settlement_structure,type_of_settlement_structure,urban_/_rural,metropolitan_region,metropolitan_area,east_west,border_proximity,support_area_status,eligible_area
0,0,2,"Flensburg, Stadt",1001,1,92,500,35,240,2901,...,2,4,3,2,99,99,1,1,C,1
1,1,2,"Kiel, Landeshauptstadt",1002,1,92,460,37,268,2037,...,2,1,2,1,99,99,1,0,C/D,1
2,2,2,"Lübeck, Hansestadt",1003,1,90,532,37,297,1927,...,1,1,2,1,5,99,1,0,C/D,1
3,3,2,"Neumünster, Stadt",1004,1,85,588,37,316,1648,...,2,3,2,2,5,99,1,0,D,1
4,4,0,Dithmarschen,1051,1,51,1864,35,448,3517,...,4,4,3,2,5,99,1,0,C,1


In [22]:
cluster3_input = add_column_to_main_data(df, cluster3, 'cluster')

original cluster data
0    179
1    129
2     93
Name: cluster, dtype: int64
New data
0    179
1    129
2     93
Name: cluster, dtype: int64


In [None]:
# Run the code for hts

# Run all combinations for models 
model_types = ['auto_arima']
revisions = ['BU', 'AHP', 'PHA', 'FP', 'OLS', 'WLSS', 'WLSV']

# Set the params 
params = {
    'model':'sarimax',
    'revision_method':'BU',
    'time_steps': 12,
    'model_params': {
        'order': (2, 1, 2)
    }
}

for m in model_types:
    for r in revisions:
        print(f"Model: {m} and Revision: {r}")
        
        # Change params 
        params['model'] = m
        params['revision_method'] = r
        
        # Run the prediction model  
        predictions = train_heirarchical_cluster_model(data=cluster3_input,
                                         agregate_col='cluster', 
                                         params=params,
                                         cluster_type="clusters by Prakhar tsne")

Model: auto_arima and Revision: BU
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [22:27<00:00,  3.33s/it]
Fitting models:  12%|███████▉                                                        | 50/405 [00:00<00:01, 242.58it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 203.72it/s]


No negative values found in the predictions
The average error is: 0.1707600377188778
Model: auto_arima and Revision: AHP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method AHP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [21:19<00:00,  3.16s/it]
Fitting models:  13%|████████                                                        | 51/405 [00:00<00:01, 251.88it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 218.85it/s]


No negative values found in the predictions
The average error is: 0.41566813827985644
Model: auto_arima and Revision: PHA
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method PHA.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [25:02<00:00,  3.71s/it]
Fitting models:   4%|██▌                                                             | 16/405 [00:00<00:02, 153.16it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 185.74it/s]


No negative values found in the predictions
The average error is: 0.45455251183218054
Model: auto_arima and Revision: FP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method FP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [23:18<00:00,  3.45s/it]
Fitting models:   0%|                                                                          | 0/405 [00:00<?, ?it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:02<00:00, 164.10it/s]


(3, 120)
There are negative values in the predictions.
The average error is: 2035.9767636240258
Model: auto_arima and Revision: OLS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method OLS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [27:36<00:00,  4.09s/it]
Fitting models:   5%|███▏                                                            | 20/405 [00:00<00:03, 100.22it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:03<00:00, 108.98it/s]


There are negative values in the predictions.
The average error is: 247.9078859229583
Model: auto_arima and Revision: WLSS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method WLSS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [39:22<00:00,  5.83s/it]
Fitting models:   3%|█▋                                                              | 11/405 [00:00<00:03, 107.20it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:03<00:00, 126.54it/s]


There are negative values in the predictions.
The average error is: 3.002982747207719
Model: auto_arima and Revision: WLSV
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method WLSV.


Fitting models:  28%|█████████████████▋                                              | 112/405 [09:16<21:21,  4.37s/it]

## Auto ARIMA Experiments

In [98]:
# Run the code for hts

# Run all combinations for models 
model_types = ['auto_arima']
revisions = ['BU', 'AHP', 'PHA', 'FP', 'OLS', 'WLSS', 'WLSV']

# Set the params 
params = {
    'model':'sarimax',
    'revision_method':'BU',
    'time_steps': 12,
    'model_params': {
        'order': (2, 1, 2)
    }
}

for m in model_types:
    for r in revisions:
        print(f"Model: {m} and Revision: {r}")
        
        # Change params 
        params['model'] = m
        params['revision_method'] = r
        
        # Run the prediction model  
        predictions = train_heirarchical_cluster_model(data=cluster1_input,
                                         agregate_col='cluster', 
                                         params=params,
                                         cluster_type="clusters by Amit")

Model: auto_arima and Revision: BU
Generating the hierarchical dataset...
Got 3 clusters..


Fitting models:   0%|                                                                          | 0/405 [00:00<?, ?it/s]

The dataset size is (120, 405)
Fitting the model auto_arima with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [17:15<00:00,  2.56s/it]
Fitting models:   4%|██▊                                                             | 18/405 [00:00<00:02, 167.06it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 252.45it/s]


No negative values found in the predictions
The average error is: 0.1707600377188778
Model: auto_arima and Revision: AHP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method AHP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [21:25<00:00,  3.17s/it]
Fitting models:  13%|████████▍                                                       | 53/405 [00:00<00:01, 262.47it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 278.75it/s]


No negative values found in the predictions
The average error is: 0.41566813827985644
Model: auto_arima and Revision: PHA
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method PHA.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [17:42<00:00,  2.62s/it]
Fitting models:   6%|███▋                                                            | 23/405 [00:00<00:01, 227.77it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 235.61it/s]


No negative values found in the predictions
The average error is: 0.45455251183218054
Model: auto_arima and Revision: FP
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method FP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [17:51<00:00,  2.65s/it]
Fitting models:   5%|███▍                                                            | 22/405 [00:00<00:01, 219.92it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 231.96it/s]


(3, 120)
There are negative values in the predictions.
The average error is: 306020.5769341778
Model: auto_arima and Revision: OLS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method OLS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [17:52<00:00,  2.65s/it]
Fitting models:  11%|███████▎                                                        | 46/405 [00:00<00:01, 231.64it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 231.66it/s]


There are negative values in the predictions.
The average error is: 247.90788592295766
Model: auto_arima and Revision: WLSS
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method WLSS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [17:49<00:00,  2.64s/it]
Fitting models:  12%|███████▉                                                        | 50/405 [00:00<00:01, 251.16it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 233.90it/s]


There are negative values in the predictions.
The average error is: 3.002982747207723
Model: auto_arima and Revision: WLSV
Generating the hierarchical dataset...
Got 3 clusters..
The dataset size is (120, 405)
Fitting the model auto_arima with revision method WLSV.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 405/405 [18:09<00:00,  2.69s/it]
Fitting models:   5%|███▎                                                            | 21/405 [00:00<00:01, 204.53it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 405/405 [00:01<00:00, 231.61it/s]


There are negative values in the predictions.
The average error is: 4028.080432061559
