In [1]:
# import optuna
# from optuna.trial import Trial
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from typing import List, Any, Tuple
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras.metrics import AUC
from matplotlib import pyplot
from tqdm import tqdm

In [3]:
# Data Loading
print("Loading data...")
X_model = pd.read_csv('../data/X_model.csv')
Y_model = pd.read_csv('../data/Y_model.csv')

Loading data...


In [28]:
# Data Preprocessing
print("Preprocessing data...")
def as_timeseries_df(X: pd.DataFrame) -> np.ndarray:
    """Converts a DataFrame with `['gender', 'age_code', 'region_code', 'c', 't', 's']` columns and additional time column with timestamps to a time series DataFrame.

    Args:
        X (pd.DataFrame): DataFrame. The data should match the following format:
            gender: 1 (if man) | 2 (if woman)
            age_code: Integer
            region_code: Integer
            r"c2022[0-9]{2}": Number of logins. Integer
            r"t2022[0-9]{2}": Number of logins with money transfer. Integer
            r"s2022[0-9]{2}": Duration of logins. Float
    
    Returns:
        np.ndarray: Time series DataFrame. Each column's description (in order):
            id: Integer. Same as initial DataFrame's index.
            time: Timestamp.
            gender: 1 (if man) | 2 (if woman)
            age_code: Integer
            region_code: Integer
            c: Number of logins. Integer
            t: Number of logins with money transfer. Integer
            s: Duration of logins. Float
    """
    # Data cleaning
    print("Cleaning data...")
    X = X.fillna(0)

    # Data transformation
    print("Transforming data...")
    n = len(X)

    X['id'] = X.index
    X_timeseries = pd.wide_to_long(X, ["c", "t", "s"], i="id", j="time").reset_index()
    return np.array([X_timeseries[X_timeseries['id'] == i] for i in range(len(X_model))])


Preprocessing data...


In [15]:
# Columns: id, time, region_code, gender, age_code, c, t, s
X_timeseries = as_timeseries_df(X_model)

Cleaning data...
Transforming data...


In [24]:
X_timeseries.head()
# (800000, 238, 8)

Unnamed: 0,id,time,region_code,gender,age_code,c,t,s
0,0,20220101,7,1,13,0.0,0.0,0.0
1,1,20220101,1,1,5,0.0,0.0,0.0
2,2,20220101,2,2,6,0.0,0.0,0.0
3,3,20220101,1,2,1,0.0,0.0,0.0
4,4,20220101,1,2,5,0.0,0.0,0.0


In [44]:
X_timeseries = X_timeseries.values

In [45]:
X_timeseries

array([[0.0000000e+00, 2.0220101e+07, 7.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 2.0220101e+07, 1.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.0000000e+00, 2.0220101e+07, 2.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [7.9999700e+05, 2.0220826e+07, 7.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 6.0000000e+00],
       [7.9999800e+05, 2.0220826e+07, 1.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [7.9999900e+05, 2.0220826e+07, 1.0000000e+00, ..., 1.0000000e+00,
        1.0000000e+00, 7.2000000e+01]])

In [None]:
X_augmented.shape

In [None]:
# Train Test Split
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X_augmented, Y_model, test_size=0.2, random_state=42) 

In [None]:
# design network
# References: 
# - https://towardsdatascience.com/choosing-the-right-hyperparameters-for-a-simple-lstm-using-keras-f8e9ed76f046
# - https://stats.stackexchange.com/questions/242238/what-is-considered-a-normal-quantity-of-outliers
# - https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
hidden_nodes = int(2/3 * (X_train.shape[1] * X_train.shape[2]))
model = Sequential()
model.add(LSTM(hidden_nodes, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('softmax'))
model.compile(
    loss='mae',
    optimizer='adam',
    metrics=[AUC()]
)

In [None]:
print("Training model...")
history = model.fit(
    X_augmented,
    y_train,
    epochs=50,
    batch_size=100,
    validation_data=(X_test, y_test), 
    verbose=2,
    shuffle=False
)

In [None]:
# plot history
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()
