In [1]:
import pandas as pd

# 1. Data Class

In [2]:
class Data():
    
    def __init__(self, dataFrame, file_format):
        '''
        INPUTS:
        ------
        dataFrame: pandas dataframe containing no null values in either wide or long format
        file_format: either 'wide' or 'long'
            'Wide': must have dates as columns, ags5s as index
            'Long': Must have 3 columns: ['ags5', 'date', 'unemployment_rate']
        
        Constraints:
        ------------
        input csv file must have a column called ags5 for both long and wide
        if the input csv is wide: # TODO
        if the input csv is long, there must be a column called 'unemployment_rate' and a column called 'date'      
        '''
        
        ## imports
        import pandas as pd
        
        # convert to lowercase
        file_format = file_format.lower()
        assert file_format == 'wide' or file_format == 'long'
        
        if file_format == 'long':
            
            data_long = dataFrame
            
            # do all the checks
            assert all(necessary_col in data_long.columns for necessary_col in ['ags5', 'date', 'unemployment_rate'])
            
            # process:
            data_long = data_long[['ags5', 'date', 'unemployment_rate']] # throw away other columns
            data_long['ags5'] = data_long['ags5'].apply(self.fix_ags5)
        
            # save:
            self.data_long = data_long
            
        elif file_format == 'wide':
            # TODO
            data_wide = dataFrame
            
            # process
            data_wide.index = [self.fix_ags5(ags5) for ags5 in data_wide.index]
            
            # save
            self.data_wide = data_wide            
            
        
        
    def long(self):
        '''
        Returns:
        --------
        A dataframe with columns: 'ags5', 'date', 'unemployment_rate'
        '''
        try:
            return self.data_long
        except AttributeError:
            # create the long version
            data_long = pd.DataFrame(   
                [(wide_df.index[i], wide_df.columns[j], wide_df.iloc[i, j]) 
                for i in range(len(wide_df.index)) 
                for j in range(len(wide_df.columns))])
            
            data_long.columns = ['ags5', 'date', 'unemployment_rate']
            
            self.data_long = data_long # store
            
            return self.data_long
        
    def wide(self):
        '''
        Returns:
        --------
        A dataframe with ags5 as index and dates as columns
        '''
        
        try:
            return self.data_wide
        
        except AttributeError:
            # create the wide version
            all_dates = self.data_long['date'].unique()
            all_ags5 = self.data_long['ags5'].unique()
            all_dates.sort()
            all_ags5.sort()
            grouped_data_long = self.data_long.groupby('date')
            
            data_wide = pd.DataFrame(index=all_ags5)
            for date in all_dates:
                df_of_date = grouped_data_long.get_group(date)
                df_of_date.set_index('ags5', inplace=True, drop=True)
                df_of_date = df_of_date.drop('date', axis=1)
                data_wide = data_wide.merge(df_of_date, left_index=True, right_index=True)
            
            data_wide.columns = all_dates
            
            self.data_wide = data_wide
            
            return data_wide
        
        
    ## util functions: 
    def fix_ags5(self, x):
        """Function to fix the format of ag5 """
        if len(str(x))==4:
            return '0'+str(x)
        else:
            return str(x)

In [3]:
wide_df = pd.read_csv('data/Alo_Quote.csv')
wide_df.columns = ['ags5'] + list(wide_df.columns[1:])
wide_df.set_index('ags5', inplace=True)
wide_df

Unnamed: 0_level_0,2007-05-31,2007-06-30,2007-07-31,2007-08-31,2007-09-30,2007-10-31,2007-11-30,2007-12-31,2008-01-31,2008-02-29,...,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,12.7,12.2,12.5,12.3,11.7,11.2,10.8,10.7,11.6,11.5,...,9.6,9.1,8.7,8.7,8.8,9.4,9.2,9.1,9.1,8.9
1002,12.4,12.1,12.5,12.8,12.1,11.7,11.4,11.4,11.9,12.0,...,9.2,8.9,8.7,8.4,8.3,8.7,8.7,8.6,8.7,8.3
1003,12.8,12.7,12.9,12.9,12.7,12.4,12.3,12.4,12.8,12.7,...,8.6,8.3,8.2,8.1,8.2,8.7,8.8,8.9,8.9,8.6
1004,11.1,11.2,11.6,11.5,10.9,10.9,10.7,10.8,11.4,11.1,...,9.4,9.3,9.2,9.1,9.1,9.5,9.8,9.6,9.4,9.1
1051,9.9,9.5,9.7,9.7,9.0,8.8,9.2,9.6,10.7,10.6,...,6.2,5.8,5.7,5.8,6.2,6.8,6.9,6.6,6.4,6.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16073,13.1,12.6,12.2,12.1,11.3,11.0,11.1,11.5,12.7,12.6,...,6.0,5.8,5.8,5.7,5.9,6.5,6.6,6.5,6.3,6.0
16074,11.2,10.6,10.3,10.4,10.1,9.8,9.3,9.9,11.6,11.3,...,4.8,4.5,4.5,4.3,4.5,5.0,5.1,5.0,4.7,4.6
16075,10.9,9.9,10.0,10.0,9.3,9.1,9.1,9.9,11.4,11.3,...,5.1,4.8,4.6,4.7,4.9,5.5,5.5,5.3,5.0,4.8
16076,13.1,12.5,12.5,12.5,11.7,11.2,11.0,11.8,13.4,13.1,...,5.5,5.3,5.1,5.0,5.0,5.6,5.6,5.4,5.1,4.9


In [4]:
data1 = Data(wide_df, 'wide')
data1.long()

Unnamed: 0,ags5,date,unemployment_rate
0,01001,2007-05-31,12.7
1,01001,2007-06-30,12.2
2,01001,2007-07-31,12.5
3,01001,2007-08-31,12.3
4,01001,2007-09-30,11.7
...,...,...,...
67764,16077,2021-01-31,7.3
67765,16077,2021-02-28,7.3
67766,16077,2021-03-31,7.2
67767,16077,2021-04-30,6.9


In [5]:
long_df = pd.read_csv('data/data_from_2010_to_2021_unemployment_rate.csv')
long_df

Unnamed: 0,ags5,date,unemployment_rate
0,1001,2010-01-31,13.7
1,1001,2010-02-28,14.1
2,1001,2010-03-31,13.6
3,1001,2010-04-30,13.1
4,1001,2010-05-31,12.5
...,...,...,...
54932,16077,2021-01-31,7.3
54933,16077,2021-02-28,7.3
54934,16077,2021-03-31,7.2
54935,16077,2021-04-30,6.9


In [6]:
data2 = Data(long_df, 'long')
data2.wide()

Unnamed: 0,2010-01-31,2010-02-28,2010-03-31,2010-04-30,2010-05-31,2010-06-30,2010-07-31,2010-08-31,2010-09-30,2010-10-31,...,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31
01001,13.7,14.1,13.6,13.1,12.5,12.1,12.5,12.7,11.9,11.9,...,9.6,9.1,8.7,8.7,8.8,9.4,9.2,9.1,9.1,8.9
01002,11.2,11.1,10.9,11.0,10.6,10.4,10.7,11.0,10.4,10.3,...,9.2,8.9,8.7,8.4,8.3,8.7,8.7,8.6,8.7,8.3
01003,12.1,12.2,12.1,11.9,11.3,10.9,11.2,11.0,10.5,10.3,...,8.6,8.3,8.2,8.1,8.2,8.7,8.8,8.9,8.9,8.6
01004,11.9,11.9,11.8,11.5,11.4,11.2,11.9,11.7,11.3,10.8,...,9.4,9.3,9.2,9.1,9.1,9.5,9.8,9.6,9.4,9.1
01051,10.2,10.2,9.8,8.7,8.0,7.7,8.1,8.1,7.5,7.4,...,6.2,5.8,5.7,5.8,6.2,6.8,6.9,6.6,6.4,6.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16073,11.5,11.9,11.8,10.9,10.4,10.1,10.0,9.4,8.8,8.4,...,6.0,5.8,5.8,5.7,5.9,6.5,6.6,6.5,6.3,6.0
16074,10.1,10.2,9.6,8.9,8.1,8.0,8.1,7.9,7.7,7.6,...,4.8,4.5,4.5,4.3,4.5,5.0,5.1,5.0,4.7,4.6
16075,10.5,10.7,10.4,9.2,8.4,8.0,8.0,7.6,7.5,7.1,...,5.1,4.8,4.6,4.7,4.9,5.5,5.5,5.3,5.0,4.8
16076,12.0,12.2,11.7,10.6,9.9,9.4,9.7,9.2,8.8,8.5,...,5.5,5.3,5.1,5.0,5.0,5.6,5.6,5.4,5.1,4.9


# 2. Model

## 2.1 The Abstract Class:

In [7]:
class AbstractModel():
    
    def __init__(self, output_save_location, params=None, unemploymentRateData=None, otherData=None, model_save_location=None):
        '''
        Inputs:
        -------
        pretrained: set True if you want to load a model, set false of you want to retrain a model
        output_save_location: location, including the csv file where the outputs are stored
        model_save_location: currently only used by neural network, i.e. used for all models which you don't do walk forward validation or any kind of retraining
        if model_save_location = None:
            i.e. we do walk forward validation
            params: params of the model, a dictionary
            unemploymentRateData: an instance of the Data class containing the unemployment rate, use it to train a model
            otherData: a dictionary containing other data, e.g. cluster values for VAR model
        '''
        
        import numpy as np
        import pandas as pd
        
        self.output_save_location = output_save_location
        self.params = params
        
        if model_save_location is None:
            ## we use walk forward validation i.e. retraining happens
            assert params is not None
            assert unemploymentRateData is not None
            
            self.unemploymentRateData = unemploymentRateData
            self.otherData = otherData
        else:
            ## a fixed network without retraining, e.g a NN 
            self.model_save_location = model_save_location
            self.unemploymentRateData = unemploymentRateData
            self.otherData = otherData
    
    
    ## abstractmethod
    def wf_pred(self, n):
        '''
        Inputs:
        -------
        self: contains all the training data, params etc
        n: number of timestamps to pred, with walk forward validation
        '''
        pass
    
    
    def getWalkForwardPred(self, n):
        '''
        Inputs:
        -------
        n: number of timestamps to pred, with walk forward validation
        '''
        
        ur_data_long = self.unemploymentRateData.long()
        last_date_train = max(ur_data_long['date'])
        
        '''1. Check if it is already calculated'''
        ## read the file
        try:
            output_csv = pd.read_csv(self.output_save_location, index_col=0)
            output_csv.index = [fix_ags5(ags5)  for ags5 in output_csv.index]
            
            ## check if the calculation has already been done
            columns_needed= [str(last_date_train)+'_'+str(i) for i in range(n)]
            
            if all(col_needed in output_csv.columns for col_needed in columns_needed):
                # this data has already been generated
                return_df = output_csv[columns_needed]
                return_df = return_df.sort_index(axis=1) # sort columns
                return_df.columns = list(range(n)) # rename columns
                print("directly from csv")
                return return_df

            else:
                predictions = self.wf_pred(self.unemploymentRateData, self.otherData, n, self.params) # get the preds
                predictions.columns = [str(last_date_train)+'_'+str(i) for i in predictions.columns] #rename the columns
                # save the predictions
                output_csv.drop([col for col in predictions.columns if col in output_csv.columns], axis=1, inplace=True) # drop columns that already exist
                output_csv = output_csv.merge(predictions, left_index=True, right_index=True)
                output_csv.to_csv(self.output_save_location)
                print('Appending to csv')
                return predictions
                
            
        except FileNotFoundError:
            # make the file
            predictions = self.wf_pred(self.unemploymentRateData, self.otherData, n, self.params) # get the preds
            predictions.columns = [str(last_date_train)+'_'+str(i) for i in predictions.columns] #rename the columns
            
            predictions.to_csv(self.output_save_location)   
            print('making csv')
            return predictions
    
    def getWalkForwardPred_3Months(self):
        return self.getWalkForwardPred(3)
    
    ## Utils
    def fix_ags5(self, x):
        """Function to fix the format of ag5 """
        if len(str(x))==4:
            return '0'+str(x)
        else:
            return str(x)

In [8]:
class VARModel(AbstractModel):
    
    def __init__(self, output_save_location, params, unemploymentRateData, cluster_df):
        '''
        Inputs:
        -------
        pretrained: set True if you want to load a model, set false of you want to retrain a model
        output_save_location: location, including the csv file where the outputs are stored
        params: a dictionary containing
            required keys: 'second_diff' and 'lag_value'
            optional keys: 'start_date', 'end_date'
        unemploymentRateData: an instance of the class Data()
        cluster_df: a df with two columns:
            'ags5': ags5 of a kreis
            'cluster': the cluster id
        '''
        
        self.cluster_df = cluster_df
        
        super().__init__(output_save_location, params, unemploymentRateData, otherData=cluster_df, model_save_location=None)     
            


    def wf_pred(self, data, cluster_df, n, params):
        '''
        Inputs:
        -------
        input_df: a dataframe containing unemployment data in LONG format, required columns:
            'ags5' as a string of len 5,
            'date' as a numpy.datetime64
            'unemployment_rate'
        cluster_df: a df with two columns:
            'ags5': ags5 of a kreis
            'cluster': the cluster id
        n: number of timestamps od prediction
        params: a dictionary containing
            required keys: 'second_diff' and 'lag_value'
            optional keys: 'start_date', 'end_date'
            
        Returns:
        --------
        a pd DataFrame with:
            0, 1, 2, ... n: as columns
            ags5 as rows
        '''



        ''' Utility Functions '''

        # Invert the transformation to get the real forecast
        def invert_transformation(df_train, df_forecast, second_diff=False):

            """Revert back the differencing to get the forecast to original scale."""
            df_fc = df_forecast.copy()
            columns = df_train.columns
            for col in columns:        
                # Roll back 2nd Diff
                if second_diff:
                    df_fc[str(col)+'_diff'] = (df_train[col].iloc[-1]-df_train[col].iloc[-2]) + df_fc[str(col)+'_diff'].cumsum()
                # Roll back 1st Diff
                df_fc[str(col)+'_forecast'] = df_train[col].iloc[-1] + df_fc[str(col)+'_diff'].cumsum()
            return df_fc

        '''Get the long format from the Data class'''
        input_df = data.long()
        
        ''' Data Preparation '''
        ## cropping dates
        if 'start_date' in params.keys() or 'end_date' in params.keys():
            ## cropping required:
            if 'start_date' not in params.keys():
                params['start_date'] = min(input_df['date'])
            if 'end_date' not in params.keys():
                params['end_date'] = max(input_df['date'])
            # clip the data
            start_date = params['start_date']
            end_date = params['end_date']
            mask = (input_df['date'] >= start_date) & (input_df['date'] < end_date)
            input_df = input_df[(mask)].reset_index(drop=True)

        # Create unemployment dataframe
        unemployment_rate_df = pd.DataFrame(input_df['date'].unique())
        unemployment_rate_df.columns = ['date']

        kreis_list = list(input_df['ags5'].unique())
        for kreis in kreis_list:
            unemployment_rate_df[kreis] = input_df.loc[input_df['ags5']==kreis, 'unemployment_rate'].reset_index(drop=True)

        # set date as index
        unemployment_rate_df.set_index('date', inplace=True)

        ''' Cluster iterations '''

        # output df:
        output_df = pd.DataFrame(index=range(n))

        # Get cluster list 
        cluster_list = list((cluster_df['cluster'].unique()))

        for cluster in cluster_list:
            cluster_kreis_list = list(cluster_df.loc[(cluster_df['cluster']==cluster), 'ags5'].unique())

            # Check if the kreis has a single item
            if len(cluster_kreis_list) == 1:
                print("Single item in the cluster. Hence skipping.")
                continue

            # get cluster data 
            cluster_data = unemployment_rate_df[cluster_kreis_list]

            num_steps = 1 # number of pred per wf step
            for i in range(n):

                # Differentiate the data twice if asked else once
                if_second_diff = params['second_diff']

                df_differenced = cluster_data.diff().dropna()
                if if_second_diff:
                    df_differenced = df_differenced.diff().dropna()

                # Get the lag value 
                lag_value = params['lag_value']

                # Feed the model data
                from statsmodels.tsa.api import VAR
                import warnings
                warnings.filterwarnings("ignore")
                model = VAR(df_differenced)

                # Fit the model to lag
                model_fitted = model.fit(lag_value)

                '''Forecasting section'''
                # Get the lag order
                lag_order = model_fitted.k_ar

                # Input data for forecasting
                forecast_input = df_differenced.values[-lag_order:]

                # Get the forecasts            
                fc = model_fitted.forecast(y=forecast_input, steps=num_steps)
                df_forecast = pd.DataFrame(fc, index=[i], columns=cluster_data.columns + '_diff')

                # Invert the forecast results
                df_results = invert_transformation(cluster_data, df_forecast, second_diff=if_second_diff)
                final_results = df_results.filter(regex='_forecast')

                # add the preds to training data
                final_results.columns = [col_name[:5] for col_name in final_results.columns] ## remove _forecast
                cluster_data = cluster_data.append(final_results) # add to the training data

            output_df = output_df.join(cluster_data.iloc[-n:]) # append the preds

        return output_df.T

In [9]:
output_save_location = 'cache/VAR/output.csv'
params = {
    'lag_value': 9, 
    'second_diff': False,
    'start_date': '2007-05-01' # b4 that nan values
}
unemploymentRateData = data1 # using data 2 will work just as fine

In [10]:
def fix_ags5(x):
    """Function to fix the format of ag5 """
    if len(str(x))==4:
        return '0'+str(x)
    else:
        return str(x)

cluster_data = pd.read_csv('data/cluster_data.csv')
cluster_data['ags5'] = cluster_data['ags5'].apply(fix_ags5)
cluster_data

Unnamed: 0,ags5,PCA_Cluster,Kmodes_cluster,tsne_cluster,ags2
0,01001,0,1,2,1
1,01002,2,2,2,1
2,01003,2,2,2,1
3,01004,0,0,2,1
4,01051,0,1,0,1
...,...,...,...,...,...
396,16073,0,1,0,16
397,16074,0,1,0,16
398,16075,0,1,0,16
399,16076,0,1,0,16


In [11]:
CLUSTER_TO_USE = 'PCA_Cluster'
cluster_df = cluster_data[['ags5', CLUSTER_TO_USE]]
cluster_df.columns = ['ags5', 'cluster']
cluster_df

Unnamed: 0,ags5,cluster
0,01001,0
1,01002,2
2,01003,2
3,01004,0
4,01051,0
...,...,...
396,16073,0
397,16074,0
398,16075,0
399,16076,0


In [12]:
VARObject = VARModel(output_save_location, params, unemploymentRateData, cluster_df)
VARObject

<__main__.VARModel at 0x2776512ae48>

In [13]:
VARObject.getWalkForwardPred_3Months()

making csv


Unnamed: 0,2021-05-31_0,2021-05-31_1,2021-05-31_2
01001,9.104045,8.927351,8.817409
01004,9.318700,9.267051,9.298819
01051,6.221203,6.058978,5.870172
01053,5.544228,5.494591,5.537900
01054,5.384197,5.172882,4.897772
...,...,...,...
15002,9.715284,9.810737,9.781917
15003,8.881911,9.204855,9.426835
16051,6.879414,6.894062,6.967905
02000,7.840170,7.874041,8.053467


In [14]:
VARObject.getWalkForwardPred_3Months()

directly from csv


Unnamed: 0,0,1,2
01001,9.104045,8.927351,8.817409
01004,9.318700,9.267051,9.298819
01051,6.221203,6.058978,5.870172
01053,5.544228,5.494591,5.537900
01054,5.384197,5.172882,4.897772
...,...,...,...
15002,9.715284,9.810737,9.781917
15003,8.881911,9.204855,9.426835
16051,6.879414,6.894062,6.967905
02000,7.840170,7.874041,8.053467


In [15]:
VARObject.getWalkForwardPred(2)

directly from csv


Unnamed: 0,0,1
01001,9.104045,8.927351
01004,9.318700,9.267051
01051,6.221203,6.058978
01053,5.544228,5.494591
01054,5.384197,5.172882
...,...,...
15002,9.715284,9.810737
15003,8.881911,9.204855
16051,6.879414,6.894062
02000,7.840170,7.874041


In [16]:
VARObject.getWalkForwardPred(6)

Appending to csv


Unnamed: 0,2021-05-31_0,2021-05-31_1,2021-05-31_2,2021-05-31_3,2021-05-31_4,2021-05-31_5
01001,9.104045,8.927351,8.817409,8.677896,8.318210,8.272175
01004,9.318700,9.267051,9.298819,9.076865,8.692929,8.613833
01051,6.221203,6.058978,5.870172,5.600756,5.258174,5.284033
01053,5.544228,5.494591,5.537900,5.444529,5.244123,5.162517
01054,5.384197,5.172882,4.897772,4.410093,4.019445,4.111618
...,...,...,...,...,...,...
15002,9.715284,9.810737,9.781917,9.683438,8.950043,8.515502
15003,8.881911,9.204855,9.426835,9.386193,8.711290,8.221398
16051,6.879414,6.894062,6.967905,6.707537,6.174690,5.887035
02000,7.840170,7.874041,8.053467,8.041640,7.892496,7.885934


In [17]:
VARObject.getWalkForwardPred(5)

directly from csv


Unnamed: 0,0,1,2,3,4
01001,9.104045,8.927351,8.817409,8.677896,8.318210
01004,9.318700,9.267051,9.298819,9.076865,8.692929
01051,6.221203,6.058978,5.870172,5.600756,5.258174
01053,5.544228,5.494591,5.537900,5.444529,5.244123
01054,5.384197,5.172882,4.897772,4.410093,4.019445
...,...,...,...,...,...
15002,9.715284,9.810737,9.781917,9.683438,8.950043
15003,8.881911,9.204855,9.426835,9.386193,8.711290
16051,6.879414,6.894062,6.967905,6.707537,6.174690
02000,7.840170,7.874041,8.053467,8.041640,7.892496
