In [52]:
import pandas as pd
import numpy as np
import os

#from skforecast.ForecasterAutoreg import ForecasterAutoregMultiSeries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error 
from sklearn.preprocessing import StandardScaler 


# Input and output directories, resolved relative to the current working directory.
# Each path component is passed separately so os.path.join inserts the platform's
# own separator; a backslash hard-coded inside a component only works on Windows.
PATH = os.path.join(os.getcwd(), 'data', 'final_datasets')
SAVE_PATH = os.path.join(os.getcwd(), 'data', 'train_datasets')

In [44]:
def create_lagged_features(df, lag):
    """
    Creates a dataframe with history for all variables except date

    For every non-date column `col` and every i in 1..lag a column
    `{col}_lag{i}` is added, holding the value of `col` from i rows earlier.
    Lags are positional (row shifts), so the rows are expected to be already
    ordered in time; this function does not sort.

    Parameters:
    df (pd.DataFrame): Initial dataframe with time series and date as column
    lag (int): Number of lags in history.

    Returns:
    pd.DataFrame: New dataframe with history. Columns are the original
    non-date columns, then the lag columns grouped by lag, then 'date' as the
    last column. Every row containing a missing value is dropped: this removes
    the first `lag` rows, and also any row with a gap in the original data.

    Raises:
    KeyError: if df has no 'date' column
    """
    # Only the variables are lagged; 'date' is excluded so that no
    # 'date_lag{i}' columns end up among the features.
    features = df.drop('date', axis=1)

    # Collect all pieces and concatenate once instead of growing the frame
    # inside the loop, which re-copies it on every iteration.
    pieces = [features]
    for i in range(1, lag + 1):
        shifted = features.shift(i)
        shifted.columns = [f'{col}_lag{i}' for col in features.columns]
        pieces.append(shifted)

    # Date goes last
    pieces.append(df['date'])
    lagged_df = pd.concat(pieces, axis=1)

    # The first `lag` rows have incomplete history and therefore contain NaN
    return lagged_df.dropna()

def make_split(df, test_size):
    '''
    Makes a time series split for hold-out validation

    The last int(len(df) * test_size) rows form the test part and all rows
    before them form the train part. Rows are taken in their current order;
    this function does not sort or shuffle.

    Parameters:
    df (pd.DataFrame): dataframe to split
    test_size (float): a fraction of dataframe to split, between 0 and 1

    Returns:
    x_train, x_test: train, test parts

    Raises:
    ValueError: if test_size is outside the [0, 1] range
    '''
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be between 0 and 1, got {test_size}')

    test_len = int(len(df) * test_size)

    # Split on a non-negative position instead of slicing with -test_len:
    # when test_len is 0, -0 == 0, so iloc[:-0] would give an empty train part
    # and iloc[-0:] would put the whole dataframe into the test part.
    split_at = len(df) - test_len
    x_train, x_test = df.iloc[:split_at, :], df.iloc[split_at:, :]

    return x_train, x_test

In [53]:
# Fintech: read the dataset, order it by date, build lag features,
# then save the hold-out split.
df = (
    pd.read_csv(os.path.join(PATH, 'final_fintech.csv'))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date')
)

# 50 lags per variable; the last 10% of the rows become the test part
lagged_df = create_lagged_features(df, lag=50)
x_train, x_test = make_split(lagged_df, 0.1)

x_train.to_csv(os.path.join(SAVE_PATH, 'fintech_train.csv'), index=False)
x_test.to_csv(os.path.join(SAVE_PATH, 'fintech_test.csv'), index=False)

In [54]:
# Renewable energy: read the dataset, order it by date, build lag features,
# then save the hold-out split.
df = (
    pd.read_csv(os.path.join(PATH, 'final_renewable_energy.csv'))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date')
)

# 50 lags per variable; the last 10% of the rows become the test part
lagged_df = create_lagged_features(df, lag=50)
x_train, x_test = make_split(lagged_df, 0.1)

x_train.to_csv(os.path.join(SAVE_PATH, 'renewable_energy_train.csv'), index=False)
x_test.to_csv(os.path.join(SAVE_PATH, 'renewable_energy_test.csv'), index=False)

In [55]:
# Healthcare services: read the dataset, order it by date, build lag features,
# then save the hold-out split.
df = (
    pd.read_csv(os.path.join(PATH, 'final_healthcare_services.csv'))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date')
)

# 50 lags per variable; the last 10% of the rows become the test part
lagged_df = create_lagged_features(df, lag=50)
x_train, x_test = make_split(lagged_df, 0.1)

x_train.to_csv(os.path.join(SAVE_PATH, 'healthcare_services_train.csv'), index=False)
x_test.to_csv(os.path.join(SAVE_PATH, 'healthcare_services_test.csv'), index=False)

In [56]:
# Industrial goods: read the dataset, order it by date, build lag features,
# then save the hold-out split.
df = (
    pd.read_csv(os.path.join(PATH, 'final_industrial_goods.csv'))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date')
)

# 50 lags per variable; the last 10% of the rows become the test part
lagged_df = create_lagged_features(df, lag=50)
x_train, x_test = make_split(lagged_df, 0.1)

x_train.to_csv(os.path.join(SAVE_PATH, 'industrial_goods_train.csv'), index=False)
x_test.to_csv(os.path.join(SAVE_PATH, 'industrial_goods_test.csv'), index=False)