## Modeling and predictions

In this notebook we show how we build the models used for the predictions.

Techniques: normalization, PCA, LSTM (with Keras).

In [None]:
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from collections import OrderedDict
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras import optimizers
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
# matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Column-name -> dtype table used to cast the event columns while reading the CSVs.
dataframe_dict = pd.read_csv('dict_to_cast_events_when_loading.csv', delimiter=";")
dictionary = {
    row['EVENTO']: row['TIPO']
    for _, row in dataframe_dict.iterrows()
}
print(dictionary)

# Both datasets are read with the same dtype mapping.
train = pd.read_csv('train_final_after_weather_encoding.csv', dtype=dictionary)
test = pd.read_csv('test_final_after_weather_encoding.csv', dtype=dictionary)

We decided to train one model per sensor, so we first build the list of all KEY_2 values that appear in both the train and the test dataset.

In [None]:
# One sub-dataframe per sensor key, for each dataset.
train_dfs = {sensor: group for sensor, group in train.groupby('KEY_2')}
test_dfs = {sensor: group for sensor, group in test.groupby('KEY_2')}

train_keys = sorted(train_dfs)
test_keys = sorted(test_dfs)

# Only keys present in both the train and the test set can be trained and
# evaluated; a set intersection is already duplicate-free, so sorting it is
# all that is needed.
to_be_trained_keys = sorted(set(train_keys).intersection(test_keys))

# Filled by the training loop with the keys that were actually modelled.
trained_keys = []

print(to_be_trained_keys)
print("Number of keys to be trained:" + str(len(to_be_trained_keys)))

Then we prepare a dataframe that will hold the results in the format required for the submission.

In [None]:
# Empty frame with the submission layout; prediction rows are added to it later.
submission_columns = ['KEY', 'KM', 'DATETIME_UTC', 'PREDICTION_STEP', 'SPEED_AVG']
result_df = pd.DataFrame(columns=submission_columns)

Then, for every key, we normalize the data, apply PCA, train a model and predict the results, saving them directly in a dataframe in the correct format.

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a sliding-window supervised-learning table.

    Args:
        data: a list (treated as a single variable) or a 2-D array with one
            column per variable and one row per time step.
        n_in: number of lagged observations (t-n_in ... t-1) used as input.
        n_out: number of observations (t ... t+n_out-1) used as output.
        dropnan: when true, rows made incomplete by the shifting are removed.

    Returns:
        A DataFrame whose columns are named ``var<k>(t-<i>)``, ``var<k>(t)``
        and ``var<k>(t+<i>)``, lags first and forecasts last.
    """
    var_count = 1 if type(data) is list else data.shape[1]
    frame = pd.DataFrame(data)

    # Positive offsets look into the past, zero is "now", negative offsets
    # look into the future: n_in, ..., 1, 0, -1, ..., -(n_out - 1).
    offsets = list(range(n_in, 0, -1)) + [-step for step in range(n_out)]

    shifted_frames = [frame.shift(offset) for offset in offsets]

    labels = []
    for offset in offsets:
        if offset > 0:
            suffix = '(t-%d)' % offset
        elif offset == 0:
            suffix = '(t)'
        else:
            suffix = '(t+%d)' % -offset
        labels.extend('var%d%s' % (var + 1, suffix) for var in range(var_count))

    table = pd.concat(shifted_frames, axis=1)
    table.columns = labels
    return table.dropna() if dropnan else table


def select_subdataframe(key, dataframe):
    """Return the frame stored under ``key``, indexed by time and without the key column.

    Args:
        key: sensor key to look up.
        dataframe: mapping from sensor key to that sensor's DataFrame; each
            frame must contain the columns ``DATETIME_UTC`` and ``KEY_2``.

    Returns:
        A new DataFrame indexed by ``DATETIME_UTC`` with ``KEY_2`` removed.
        The frame stored in ``dataframe`` is left untouched, so the function
        can be called repeatedly (e.g. when the cell is re-run) for the same key.
    """
    # Non-inplace calls: mutating the stored group would make a second call
    # fail with KeyError because 'DATETIME_UTC' would already be the index.
    return dataframe[key].set_index('DATETIME_UTC').drop(columns='KEY_2')

def normalization(dataframe):
    """Standardise the values of ``dataframe`` with a freshly fitted StandardScaler.

    Args:
        dataframe: object exposing ``.values`` (a 2-D array of features).

    Returns:
        The array produced by ``StandardScaler().fit_transform``; the fitted
        scaler itself is not returned.
    """
    return StandardScaler().fit_transform(dataframe.values)

def PCA(data, n_components):
    """Reduce ``data`` to ``n_components`` principal components.

    This helper has the same name as the scikit-learn class imported at the
    top of the notebook and therefore shadows it at module level. The
    estimator is imported locally under an alias so that the function uses
    the real class instead of recursing into itself.

    Args:
        data: 2-D array of shape (n_samples, n_features).
        n_components: number of components to keep; scikit-learn requires it
            to be at most ``min(n_samples, n_features)``.

    Returns:
        The array returned by ``fit_transform``, with ``n_components`` columns.
    """
    from sklearn.decomposition import PCA as _SklearnPCA

    return _SklearnPCA(n_components=n_components).fit_transform(data)
    
def prepare_data_for_prediction(dataframe, timeSteps, features):
    """Split a supervised (sliding-window) table into network inputs and targets.

    The table is expected to hold ``timeSteps`` blocks of ``features`` lagged
    columns followed by one block of ``features`` columns per forecast step
    (the layout produced by ``series_to_supervised``). Every lagged column is
    an input; from each forecast block only the first variable is kept as a
    target.

    Args:
        dataframe: supervised table (DataFrame or 2-D array). It is not modified.
        timeSteps: number of lagged time steps used as input.
        features: number of variables per time step.

    Returns:
        ``[X, Y]`` where ``X`` has shape (rows, timeSteps, features) and ``Y``
        has shape (rows, number_of_forecast_steps).

    Raises:
        ValueError: if ``timeSteps`` or ``features`` is not positive, or if the
            column count does not match the expected layout.
    """
    if timeSteps < 1 or features < 1:
        raise ValueError("timeSteps and features must both be >= 1")

    values = np.asarray(dataframe)
    n_inputs = features * timeSteps
    n_forecast_columns = values.shape[1] - n_inputs
    if n_forecast_columns <= 0 or n_forecast_columns % features != 0:
        raise ValueError(
            "expected %d input columns followed by whole blocks of %d forecast "
            "columns, got %d columns in total" % (n_inputs, features, values.shape[1])
        )

    X = values[:, :n_inputs].reshape(values.shape[0], timeSteps, features)
    # Stepping by `features` picks the first variable of every forecast block,
    # whatever the number of forecast steps is.
    Y = values[:, n_inputs::features]
    return [X, Y]

def print_results(y_pred,X_test,_n_pca_components):
    """Plot and score every forecast step and return the rescaled predictions.

    For each forecast step the predicted and the true target column are padded
    with the trailing ``_n_pca_components - 1`` columns of ``X_test``, passed
    through ``scaler.inverse_transform`` and the first column of the result is
    kept. The two series are plotted together and their MAE / MSE are printed.

    The function reads two module-level names that must exist when it is
    called: ``y_test`` (2-D array of true targets, one column per forecast
    step) and ``scaler`` (object exposing ``inverse_transform``).
    NOTE(review): no visible cell assigns ``scaler``. It has to be fitted on a
    matrix with ``_n_pca_components`` columns for ``inverse_transform`` to
    accept the padded array - confirm which transform is meant to be undone.

    Args:
        y_pred: 2-D array of predictions, one column per forecast step.
        X_test: 2-D (flattened) model input matching the rows of ``y_pred``.
        _n_pca_components: number of feature columns the scaler expects.

    Returns:
        A list with one 1-D array of rescaled predictions per forecast step.
    """
    n_pca_components = _n_pca_components
    # Computed from the shape instead of slicing with -(n - 1): that slice
    # would select every column when n_pca_components == 1.
    padding = X_test[:, X_test.shape[1] - (n_pca_components - 1):]

    y_true = []
    y_predicted = []
    # One column of y_pred per forecast step.
    for i in range(y_pred.shape[1]):
        y_pred_i = y_pred[:, i].reshape(-1, 1)
        inv_new = np.concatenate((y_pred_i, padding), axis=1)
        inv_new = scaler.inverse_transform(inv_new)
        final_pred = inv_new[:, 0]
        y_predicted.append(final_pred)

        y_test_i = y_test[:, i].reshape(-1, 1)
        inv_new = np.concatenate((y_test_i, padding), axis=1)
        inv_new = scaler.inverse_transform(inv_new)
        actual_pred = inv_new[:, 0]
        y_true.append(actual_pred)

        plt.plot(final_pred[:200], label="prediction", c="b")
        plt.plot(actual_pred[:200], label="actual data", c="r")
        plt.xlim(0, 100)
        plt.ylim(0, 300)
        plt.yticks([])
        plt.xticks([])
        plt.title("comparison between prediction and actual data")
        plt.legend()
        plt.show()

        # scikit-learn metrics take (y_true, y_pred); both metrics are symmetric.
        print("mean absolute error:")
        print(mean_absolute_error(actual_pred, final_pred))
        print("mean squared error:")
        print(mean_squared_error(actual_pred, final_pred))

    return y_predicted

def compute_results_dataframe(results_df,test_df, ahead, key, y_predicted):
    """Append the predictions of one sensor to the submission table.

    The first ``ahead`` rows of ``test_df`` are skipped; every remaining row
    produces ``ahead`` output rows, one per prediction step.

    Args:
        results_df: table accumulated so far (may be empty or None).
        test_df: test frame of the sensor; its index supplies DATETIME_UTC.
        ahead: number of prediction steps per timestamp.
        key: sensor key of the form ``"<KEY>_<KM>"``.
        y_predicted: ``y_predicted[step][row]`` is the prediction for the
            ``row``-th kept timestamp at prediction step ``step + 1``.

    Returns:
        A DataFrame with the columns KEY, KM, DATETIME_UTC, PREDICTION_STEP and
        SPEED_AVG (all values stored as strings) holding the previous rows
        followed by the new ones.
    """
    columns = ['KEY', 'KM', 'DATETIME_UTC', 'PREDICTION_STEP', 'SPEED_AVG']
    key_parts = key.split('_')
    sensor_key, sensor_km = key_parts[0], key_parts[1]

    # enumerate() gives the positional row directly; looking the position up
    # with Index.get_loc breaks when a timestamp occurs more than once.
    timestamps = test_df.tail(-ahead).index
    records = [
        [sensor_key, sensor_km, str(timestamp), str(step + 1), str(y_predicted[step][row])]
        for row, timestamp in enumerate(timestamps)
        for step in range(ahead)
    ]
    # Build the frame once instead of inserting row by row.
    new_rows = pd.DataFrame(records, columns=columns)

    # Skip empty operands so pd.concat never has to guess dtypes from them.
    if results_df is None or results_df.empty:
        return new_rows
    if new_rows.empty:
        return results_df
    return pd.concat([results_df, new_rows], ignore_index=True)

    
# Settings shared by every per-sensor model (loop-invariant, so set once).
timeSteps = 1                    # past time steps fed to the network
ahead = 4                        # number of future time steps to predict
MIN_SAMPLES = timeSteps + ahead  # fewer rows cannot form a single supervised sample
MAX_PCA_COMPONENTS = 20
NUM_NEURONS_FirstLayer = 80
NUM_NEURONS_SecondLayer = 50
EPOCHS = 30

for key in to_be_trained_keys:
    # A model is only built for sensors with at least MIN_SAMPLES rows in both
    # datasets: a 4-step-ahead prediction needs one lag plus four targets.
    n_samples_train = len(train_dfs[key].index)
    n_samples_test = len(test_dfs[key].index)
    if n_samples_train < MIN_SAMPLES or n_samples_test < MIN_SAMPLES:
        continue

    # Keep only the rows of the current key and drop the key column.
    trained_keys.append(key)
    train_df = select_subdataframe(key, train_dfs)
    test_df = select_subdataframe(key, test_dfs)

    # Normalise the data.
    # NOTE(review): train and test are standardised (and, below, PCA-reduced)
    # independently of each other, so their components are not guaranteed to
    # be comparable - confirm whether the transforms should be fitted on the
    # training data only.
    scaled_train = normalization(train_df)
    scaled_test = normalization(test_df)

    # PCA cannot keep more components than min(n_samples, n_features).
    n_components_for_pca = min(MAX_PCA_COMPONENTS,
                               n_samples_train, n_samples_test,
                               train_df.shape[1], test_df.shape[1])

    scaled_train = PCA(scaled_train, n_components_for_pca)
    scaled_test = PCA(scaled_test, n_components_for_pca)

    # Turn the series into a supervised problem with a sliding window.
    supervised_train = series_to_supervised(scaled_train, n_in=timeSteps, n_out=ahead)
    supervised_test = series_to_supervised(scaled_test, n_in=timeSteps, n_out=ahead)

    X_train, y_train = prepare_data_for_prediction(supervised_train, timeSteps, n_components_for_pca)
    X_test, y_test = prepare_data_for_prediction(supervised_test, timeSteps, n_components_for_pca)

    # Build the model with Keras: two stacked LSTM layers and a dense output
    # with one unit per forecast step. Only the first layer needs input_shape.
    model = Sequential()
    model.add(LSTM(NUM_NEURONS_FirstLayer, input_shape=(timeSteps, X_train.shape[2]), return_sequences=True))
    model.add(LSTM(NUM_NEURONS_SecondLayer))
    model.add(Dense(ahead))
    sgd = optimizers.SGD(lr=0.1, decay = 1e-6, momentum = 0.9, nesterov=True)
    model.compile(loss='mean_absolute_error', optimizer=sgd)

    history = model.fit(X_train, y_train, epochs=EPOCHS, shuffle=True, batch_size=24,
                        verbose=2)
    model.save('model_' + str(key) + ".h5")

    y_pred = model.predict(X_test)
    # Flatten (rows, timeSteps, features) to 2-D for the reporting helper.
    X_test_flat = X_test.reshape(X_test.shape[0], X_test.shape[2] * X_test.shape[1])

    # print_results plots and scores each forecast step and is expected to
    # hand back the rescaled per-step predictions. It reads the module-level
    # names `y_test` and `scaler`.
    # NOTE(review): `scaler` is not assigned anywhere in this cell - confirm
    # where it is meant to be fitted. The model's target is the first
    # principal component, not necessarily SPEED_AVG.
    y_predicted = print_results(y_pred, X_test_flat, n_components_for_pca)

    # Accumulate into `result_df`, the frame that is written to disk at the end.
    result_df = compute_results_dataframe(result_df, test_df, ahead, key, y_predicted)

At the end we save the results.

In [None]:
# Write the accumulated submission rows to disk, without the row index.
result_df.to_csv('results.csv', index=False, encoding='utf-8')