In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

sns.set(style='darkgrid', palette='muted', font_scale=1.5, rc={'figure.figsize':(20,10)})

RANDOM_SEED = 40
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

### Load Data

In [2]:
def load_data(path, columns, start=None, stop=None):
    df = pd.read_pickle(path)
    return df.loc[start:stop, columns]

In [3]:
df = load_data('../../data/ica_summary.pkl', columns=['artiklar'], stop='2021-03-28')

In [4]:
df.tail()

Unnamed: 0,artiklar
2021-03-24,1949.0
2021-03-25,3807.0
2021-03-26,4856.0
2021-03-27,884.0
2021-03-28,0.0


### Create all features

In [5]:
def icas_egna_features(df):
    features = {}

    def add_weekday(features):
        features['weekday'] = pd.Series(df.index.weekday, index=df.index)
    
    def add_day_of_month(features):
        features['day_of_month'] = pd.Series(df.index.day, index=df.index)
    
    def add_holiday(features):
        holidays = ['2021-01-01', '2021-01-06', '2021-04-02', '2021-04-04', '2021-04-05', '2020-05-21', '2020-05-31', '2020-06-06', '2020-06-20', '2020-10-31', '2020-12-25', '2020-12-26']
        features['holiday'] = pd.Series(np.where(df.index.isin(holidays), 1, 0), index=df.index)
    
    def add_before_holiday(features):
        features['before_holiday'] = pd.Series(0, index=df.index)
        before_holidays = ['2021-01-01', '2021-04-02', '2020-06-06', '2020-12-24']
        for holiday in before_holidays:
            for i in range(1, 4):
                features['before_holiday'][pd.to_datetime(holiday) - pd.DateOffset(i)] = i
    
    def add_payday(features, payday, pension=False):
        if pension:
            if 'pension' not in features:
                features['pension'] = pd.Series(0, index=df.index) 
                series = features['pension']
            else:
                series = features['pension']
                
        if not pension:
            features['payday'] = pd.Series(0, index=df.index)
            series = features['payday']
        
        for i, day in enumerate(features['day_of_month']):
            if day == payday:
                if features['weekday'].iloc[i] in [5, 6] or features['holiday'].iloc[i]:
                    paydays = i - 1 if features['weekday'].iloc[i-1] not in [5, 6] or features['holiday'].iloc[i -1] else i - 2
                    series.iloc[paydays] = 1 
                else:
                    series.iloc[i] = 1
    
    add_weekday(features)
    add_day_of_month(features)
    add_holiday(features)
    add_before_holiday(features)
    add_payday(features, 25)
    add_payday(features, 18, pension=True)
    add_payday(features, 17, pension=True)

    return features
    

In [6]:
features = icas_egna_features(df)
for k, v in features.items():
    print(type(v))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


### Feature engineering

In [7]:
from itertools import combinations, chain

def feature_combinations(df, features):
    features_combinations = list(chain.from_iterable([combinations(features, i) for i in range(1, len(features))]))

    df_list = [df]
    for f_comb in features_combinations:
        ts = df.copy()
        for feat in f_comb:
            ts[feat] = features[feat]
        df_list.append(ts)
    
    return df_list

In [8]:
df_list = feature_combinations(df, features)
df_list = df_list

In [9]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

def scale_data(df_list):
    scaled_df_list = []
    for df in df_list:
        min_max_scaler_x = MinMaxScaler(feature_range=(0, 1))
        min_max_scaler_y = MinMaxScaler(feature_range=(0, 1))

        robust_scaler_x = RobustScaler()
        robust_scaler_y = RobustScaler()

        standard_scaler_x = StandardScaler()
        standard_scaler_y = StandardScaler()
        
        scalers = [[min_max_scaler_x, min_max_scaler_y], [robust_scaler_x, robust_scaler_y], [standard_scaler_x, standard_scaler_y]]
#     Unscaled doesn't work here!

#         if len(df.columns) == 1:
#             scaled_df_list.append({'X': df.iloc[:].to_numpy().reshape(-1, 1), 
#                                    'y': df.iloc[:].to_numpy().reshape(-1, 1), 
#                                    'scaler_x': None, 
#                                    'scaler_y': None,
#                                    'features': df.columns})
#         else:
#             scaled_df_list.append({'X': df.iloc[:, :].to_numpy(), 
#                                    'y': df.iloc[:, 0].to_numpy().reshape(-1, 1), 
#                                    'scaler_x': None, 
#                                    'scaler_y': None,
#                                    'features': df.columns})

        for scaler in scalers:
            if len(df.columns) == 1:
                scaled_df_list.append({'X': scaler[0].fit_transform(df.iloc[:].to_numpy().reshape(-1, 1)), 
                                       'y': scaler[1].fit_transform(df.iloc[:].to_numpy().reshape(-1, 1)), 
                                       'scaler_x': scaler[0], 
                                       'scaler_y': scaler[1],
                                       'features': df.columns})
            else:
                scaled_df_list.append({'X': scaler[0].fit_transform(df.iloc[:, :].to_numpy()), 
                                       'y': scaler[1].fit_transform(df.iloc[:, 0].to_numpy().reshape(-1, 1)), 
                                       'scaler_x': scaler[0], 
                                       'scaler_y': scaler[1],
                                       'features': df.columns})
    
    return scaled_df_list

In [10]:
scaled_data = scale_data(df_list)

In [11]:
def create_time_step(train_test, time_steps):
    time_step_df = []
    for time_step in time_steps:
        for df in train_test:
            new_df = {key: df[key] for key in ['scaler_y', 'scaler_x', 'features']}
            
            X_y = [[df['X'][i: (i + time_step)], df['y'][i + time_step]] for i in range(len(df['X']) - time_step)]
            
            new_df['X'], new_df['y'] = np.array([x for x, _ in X_y]), np.array([y for _, y in X_y])
            new_df['time_steps'] = time_step

            time_step_df.append(new_df)
            
    return time_step_df

In [12]:
time_steps = [7, 14]

time_step_df = create_time_step(scaled_data, time_steps)

In [13]:
def split_train_test(scaled_data, test_size):
    
    return [{'X_train': df['X'][:len(df['X']) - test_size], 
             'y_train': df['y'][:len(df['X']) - test_size],
             'X_test': df['X'][len(df['X']) - test_size:],
             'y_test': df['y'][len(df['X']) - test_size:],
             'scaler_x': df['scaler_x'],
             'scaler_y': df['scaler_y'],
             'time_steps': df['time_steps'],
             'features': df['features']} 
             for df in scaled_data]

In [14]:
train_test = split_train_test(time_step_df, 7)
print(len(train_test))

378


In [15]:
def run_model(list_df):
    for i, df in enumerate(list_df, 1):        
        model = Sequential()
        model.add(Bidirectional(
                  LSTM(units=128, input_shape=(df['X_train'].shape[1], df['X_train'].shape[2])),
                  merge_mode='sum'))
        model.add(Dense(units=1))
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
        
        print(f'Training {i}/{len(list_df)}'.center(50, '-'))
                
        history = model.fit(
        df['X_train'], df['y_train'],
        epochs=50,
        batch_size=32,
        validation_data=(df['X_test'], df['y_test']),
        shuffle=False,
        verbose=0
        )

        df['history'] = {'loss': history.history['loss'], 'val_loss': history.history['val_loss']}

        predictions = model.predict(df['X_test'])
        if df['scaler_y']:
            df['predictions'] = df['scaler_y'].inverse_transform(predictions)
            df['y_test'] = df['scaler_y'].inverse_transform(df['y_test'])
        else:
            df['predictions'] = predictions
        
        df['score'] = {'RMSE': np.sqrt(mean_squared_error(y_true=df['y_test'], y_pred=df['predictions'])),
                       'MAE': mean_absolute_error(y_true=df['y_test'], y_pred=df['predictions']),
                       'r2': r2_score(y_true=df['y_test'], y_pred=df['predictions'])}

In [16]:
run_model([train_test[0]])

-------------------Training 1/1-------------------


In [18]:
import pickle
def save_results(df_list):
    with open('fitted_df_new.pkl', 'wb') as f:
        pickle.dump(df_list, f)
        


In [19]:
def open_results():
    with open('./fitted_data/trained_data.pkl', 'rb') as f:
        return pickle.load(f)


In [20]:
# save_results(train_test)
# fitted_dfs = open_results()