In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import MinMaxScaler

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

**1. Data Preprocessing Based on Luke's Result**

In [3]:
f_data = pd.read_csv('no_dup_data.csv')

In [4]:
f_data.head(10)

Unnamed: 0.1,Unnamed: 0,customer_id,account_id,ed_id,event_name,Date,Time,journey_steps_until_end
0,0,-784961211,1773350293,12,application_web_approved,2023-03-22,08:45:22,1
1,1,-784961211,1773350293,19,application_web_view,2023-03-22,13:32:10,2
2,14,-784961211,1773350293,3,application_web_submit,2023-03-22,13:32:10,3
3,15,-784961211,1773350293,2,campaign_click,2023-03-22,14:45:22,4
4,16,-784961211,1773350293,19,application_web_view,2023-07-27,14:57:56,5
5,21,-784961211,1773350293,19,application_web_view,2023-08-29,16:01:06,6
6,24,15849251,383997507,4,browse_products,2021-11-04,14:11:15,1
7,25,15849251,383997507,4,browse_products,2021-11-04,14:11:29,2
8,26,15849251,383997507,4,browse_products,2021-11-04,14:12:10,3
9,27,15849251,383997507,4,browse_products,2021-11-04,14:12:21,4


In [5]:
f_data.shape

(55853910, 8)

In [6]:
# a helper function to randomly sample from the original data for further RNN 
def bagging_func(n, whole_data):
    account_ids = whole_data['account_id'].unique()
    
    sub_account_id = np.random.choice(account_ids, size=n, replace=False)
    sample_f_data = whole_data[whole_data['account_id'].isin(sub_account_id)]

    return sample_f_data

In [7]:
test = bagging_func(1000, f_data)

In [8]:
# getting all journeys for each account
def journey_list(sample_f_data):
    ed_id_lists = sample_f_data.groupby(['account_id'])['ed_id'].apply(list).reset_index()
    
    max_length = ed_id_lists['ed_id'].apply(len).max()
    ed_id_lists['ed_id'] = ed_id_lists['ed_id'].apply(lambda x: x + [0] * (max_length - len(x)))
    
    return ed_id_lists

In [9]:
# the code of Luke, possible future modifications, which defines what is the ultimate result
def event_label(x):
    act_list = np.array([29,12,15])
    ord_list = np.array([7,18])

    is_act = False
    is_ord = False

    # parse and get condition checks
    # if any activated, 
    if any(np.in1d(x,act_list)):
        is_act = True
    if any(np.in1d(x,ord_list)):
        is_ord = True

    # activated only
    if is_act == True and is_ord == False:
        return "Activated, No Order"

    # ordered only
    elif is_act == False and is_ord == True:
        return "Ordered, Not Activated"

    # activated and ordered
    elif is_act == True and is_ord == True:
        return "Activated and Ordered"
    
    # accounts neither fit
    else:
        return "Neither"

# vectorize function
event_label_vec = np.vectorize(event_label)

In [10]:
# discard useless labels, getting only the necessary labels for RNN model
def binary_labels(ed_id_lists):
    necessary_labels = ["Activated, No Order", "Activated and Ordered"]
    ed_id_lists = ed_id_lists[ed_id_lists['customer_label'].isin(necessary_labels)]

    return ed_id_lists

In [11]:
# turning `customer_label` to true labels
def true_labels(ed_id_lists):
    codes, uniques = pd.factorize(ed_id_lists['customer_label'])
    ed_id_lists['customer_label'] = codes

    return ed_id_lists

In [12]:
# combining the previous functions together
def preprocessing(n, f_data):
    sample_f_data = bagging_func(n, f_data)
    sample_f_data = sample_f_data[['customer_id', 'account_id', 'ed_id', 'journey_steps_until_end']]
    
    ed_id_lists = journey_list(sample_f_data)
    ed_id_lists["customer_label"] = event_label_vec(ed_id_lists["ed_id"])

    ed_id_lists = binary_labels(ed_id_lists)
    ed_id_lists = true_labels(ed_id_lists)

    return ed_id_lists

In [13]:
# take part of the dataset
ed_id_lists = preprocessing(1000, f_data)
ed_id_lists.head(10)

Unnamed: 0,account_id,ed_id,customer_label
0,-2126367060,"[2, 12, 5, 4, 5, 4, 4, 4, 4, 4, 4, 11, 4, 4, 1...",0
1,-2115929492,"[2, 12, 11, 1, 5, 6, 5, 5, 1, 0, 0, 0, 0, 0, 0...",0
2,-2114878072,"[19, 19, 19, 3, 19, 12, 19, 4, 4, 4, 4, 4, 4, ...",0
3,-2093667389,"[2, 12, 1, 1, 1, 24, 1, 24, 24, 24, 1, 1, 5, 6...",0
4,-2090522827,"[2, 19, 3, 19, 19, 3, 19, 19, 3, 19, 3, 19, 3,...",0
5,-2082736687,"[12, 4, 1, 6, 4, 11, 5, 6, 4, 11, 5, 0, 0, 0, ...",0
6,-2062755641,"[1, 1, 1, 19, 19, 19, 19, 19, 19, 3, 19, 12, 1...",0
7,-2059967457,"[19, 19, 19, 19, 19, 19, 3, 12, 4, 4, 4, 4, 4,...",0
8,-2059070041,"[21, 22, 2, 12, 19, 19, 0, 0, 0, 0, 0, 0, 0, 0...",0
9,-2057671282,"[29, 12, 6, 11, 7, 5, 4, 8, 11, 5, 4, 19, 5, 4...",1


In [14]:
ed_id_lists.customer_label.unique()

array([0, 1])

In [15]:
ed_id_lists['customer_label'].describe()

count    1000.000000
mean        0.198000
std         0.398692
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: customer_label, dtype: float64

**2. Building RNN Model**

In [16]:
def data_loader(input_lists, split = 0.2, b_size = 128):
    # adding masks for each observation
    X = input_lists['ed_id']  # Features
    X = X.values.tolist()
    mask = [[1 if x > 0 else x for x in sublist] for sublist in X]
    y = input_lists['customer_label']  # Target variable
    y = y.values.tolist()
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test, mask_train, mask_test = train_test_split(X, y, mask, test_size=split, random_state=42)

    train_dset = torch.utils.data.TensorDataset(torch.tensor(X_train,
                                                         dtype=torch.long),
                                            torch.tensor(y_train,
                                                         dtype=torch.long),
                                            torch.tensor(mask_train,
                                                         dtype=torch.float))

    test_dset = torch.utils.data.TensorDataset(torch.tensor(X_test,
                                                            dtype=torch.long),
                                              torch.tensor(y_test,
                                                            dtype=torch.long),
                                              torch.tensor(mask_test,
                                                            dtype=torch.float))
    
    # getting data loader for training and predicting process
    batch_size = b_size
    
    train_loader = torch.utils.data.DataLoader(train_dset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=2,
                              drop_last=True
                             )
    
    test_loader = torch.utils.data.DataLoader(test_dset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=2,
                              drop_last=True
                             )

    return train_loader, test_loader

In [17]:
train_loader, test_loader = data_loader(ed_id_lists, split = 0.2, b_size = 128)

In [18]:
# small test of correctness
x, y, m = next(iter(train_loader))
print(x.shape, y.shape, m.shape)

print(x)
print(x.shape)
print(y)
print(y.shape)
print(m)
print(m.shape)

torch.Size([128, 307]) torch.Size([128]) torch.Size([128, 307])
tensor([[ 2,  3,  3,  ...,  0,  0,  0],
        [29, 12,  4,  ...,  0,  0,  0],
        [12,  4,  4,  ...,  0,  0,  0],
        ...,
        [12,  1,  6,  ...,  0,  0,  0],
        [15, 18,  4,  ...,  0,  0,  0],
        [12, 24,  1,  ...,  0,  0,  0]])
torch.Size([128, 307])
tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 1, 0])
torch.Size([128])
tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 

In [19]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, \
                 num_rec_layers=1, rec_layer=nn.LSTM):
        super(LSTMClassifier, self).__init__()
        # define all layers we need,
        # their parameters will be initialized automatically

        # nn.Embedding layer turns input sentences into word embeddings
        # with input and output dimension given by vocab_size and embedding_dim
        # self.word_embeddings = nn.Embedding(embedding_dim)

        # depending on the value of num_rec_layers, the corresponding number
        # of rec_layers (either RNN or LSTM) with batch_first=True and hidden
        # dimension given by hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.num_rec_layers = num_rec_layers
        self.rnn1 = rec_layer(embedding_dim, hidden_dim, batch_first=True)
        if self.num_rec_layers == 2:
            self.rnn2 = rec_layer(hidden_dim, hidden_dim, batch_first=True)

        # a final linear layer with sigmoid activation with input and output
        # dimension given by hidden_dim and 1.
        self.sigmoid = nn.Sigmoid()
        self.hidden2label = nn.Linear(hidden_dim, 1)

    def forward(self, sentences, mask):
        # sentences shape: [B, L], mask shape: [B, L]
        # embedding = word_embeddings(embedding_dim)# call your embedding layer, output shape: [B, L, DE]
        embedding = self.word_embeddings(sentences)
        out, hidden = self.rnn1(embedding)# call your rnn1, output shape: [B, L, DH]
        if self.num_rec_layers == 2:
            out, hidden = self.rnn2(out, hidden)# call your rnn2, output shape: [B, L, DH]
        out = (out*mask[:, :, None]).mean(dim=1) # shape: [B, DH]
        res = self.sigmoid(self.hidden2label(out))# call your hidden2label, output shape: [B, 1]
        #print(res)
        #print(res.shape)
        return res

In [20]:
class Ensemble(nn.Module):
  def __init__(self, net, num_ensemble=5, seed_val=42):
      super(Ensemble, self).__init__()
      self.ensembles = nn.ModuleList()
      self.sigmoid = nn.Sigmoid()

      for i in range(num_ensemble):
          torch.manual_seed(seed_val*i+1)
          if torch.cuda.is_available(): # To randomize init of NNs for Ensembles
              torch.cuda.manual_seed(seed_val*i+1)
          self.ensembles.append(net[i])

      self.final = nn.Linear(num_ensemble, num_ensemble)

  def forward(self, X_in_list, m_in_list):
      #a = [net(X_in_list[i], m_in_list[i]) for i, net in enumerate(self.ensembles)]
      #print(a[0].shape)
      #print(a[1].shape)

      x = torch.cat([net(X_in_list[i], m_in_list[i]) for i, net in enumerate(self.ensembles)])
      x = x.reshape(-1, len(self.ensembles))
      x = self.sigmoid(self.final(x))
      
      return x

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


**3. Demo RNN classifier for correctness**

In [22]:
lstm1 = LSTMClassifier(128, 128, 500)
lstm2 = LSTMClassifier(128, 64, 500)
lstm3 = LSTMClassifier(128, 256, 500)

lstm1.to(device)
lstm2.to(device)
lstm3.to(device)
num_of_bagging = 3

lstm_list = [lstm1, lstm2, lstm3]

lstm = Ensemble(lstm_list, num_of_bagging)
lstm

Ensemble(
  (ensembles): ModuleList(
    (0): LSTMClassifier(
      (word_embeddings): Embedding(500, 128)
      (rnn1): LSTM(128, 128, batch_first=True)
      (sigmoid): Sigmoid()
      (hidden2label): Linear(in_features=128, out_features=1, bias=True)
    )
    (1): LSTMClassifier(
      (word_embeddings): Embedding(500, 128)
      (rnn1): LSTM(128, 64, batch_first=True)
      (sigmoid): Sigmoid()
      (hidden2label): Linear(in_features=64, out_features=1, bias=True)
    )
    (2): LSTMClassifier(
      (word_embeddings): Embedding(500, 128)
      (rnn1): LSTM(128, 256, batch_first=True)
      (sigmoid): Sigmoid()
      (hidden2label): Linear(in_features=256, out_features=1, bias=True)
    )
  )
  (sigmoid): Sigmoid()
  (final): Linear(in_features=3, out_features=3, bias=True)
)

In [26]:
# demo forward pass with the mini-batch that we generated above
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x1, y1, m1 = next(iter(train_loader))
x1 = x1.to(device)
y1 = y1.to(device)
m1 = m1.to(device)

x2, y2, m2 = next(iter(train_loader))
x2 = x2.to(device)
y2 = y2.to(device)
m2 = m2.to(device)

x3, y3, m3 = next(iter(train_loader))
x3 = x3.to(device)
y3 = y3.to(device)
m3 = m3.to(device)

x = [x1, x2, x3]
y = [y1, y2, y3]
m = [m1, m2, m3]
     
y_pred = lstm(x, m)
print(y_pred.shape)
print(y_pred[:10])

torch.Size([128, 3])
tensor([[0.4398, 0.5536, 0.3872],
        [0.4401, 0.5533, 0.3868],
        [0.4389, 0.5531, 0.3850],
        [0.4396, 0.5534, 0.3865],
        [0.4390, 0.5531, 0.3852],
        [0.4390, 0.5531, 0.3847],
        [0.4405, 0.5530, 0.3868],
        [0.4400, 0.5534, 0.3869],
        [0.4396, 0.5535, 0.3868],
        [0.4390, 0.5526, 0.3846]], grad_fn=<SliceBackward0>)


**4. The final training / testing loop**

In [27]:
def train_epoch(bagging_data, model, lossfun, optimizer, device, n_bagging):
    inputs, labels, mask = cook_data(bagging_data, n_bagging)
    
    model.train()
    
    # # iterate over mini-batches
    # for it, (inputs, labels, mask) in enumerate(bagging_data[i]):
    #     # move everything to the device
    inputs = [i.to(device) for i in inputs]
    labels = [i.to(device) for i in labels]
    mask = [i.to(device) for i in mask]

    model.zero_grad()
    # forward pass
    output = model(inputs, mask)

    labels = torch.stack(labels, dim=0)
    labels = labels.float()

    loss = lossfun(output.view(-1), labels.view(-1))
    # backward pass
    loss.backward()

    # update model parameters
    optimizer.step()

def evaluate(bagging_data, model, lossfun, device, n_bagging, outer_eval=False):
    inputs, labels, mask = cook_data(bagging_data, n_bagging, outer_eval)
    
    model.eval()
    total_acc = 0.0
    total_loss = 0.0
    # iterate over mini-batches

    inputs = [i.to(device) for i in inputs]
    labels = [i.to(device) for i in labels]
    mask = [i.to(device) for i in mask]

    model.zero_grad()
    # forward pass
    output = model(inputs, mask)

    labels = torch.stack(labels, dim=0)
    labels = labels.float()

    loss = lossfun(output.view(-1), labels.view(-1))
    total_loss += loss.item()

    # calculate test accuracy
    pred = output.view(-1) > 0.5
    correct = (pred == labels.view(-1).bool())
    total_acc += torch.sum(correct).item() / len(correct)

    return total_loss, total_acc

def cook_data(bagging_data_list, num_of_bagging, outer_eval=False):     
    if outer_eval == False:
        X = [[] for _ in range(num_of_bagging)]
        y = [[] for _ in range(num_of_bagging)]
        m = [[] for _ in range(num_of_bagging)]
    
        count = 0
        
        for d_l in bagging_data_list:
            for it, (inputs, labels, mask) in enumerate(d_l):
                X[count].append(inputs)
                y[count].append(labels)
                m[count].append(mask)
            count += 1
    
        #print(X)
        #print(len(X))
    
        X = [torch.cat(i, dim=0) for i in X]
        y = [torch.cat(i, dim=0) for i in y]
        m = [torch.cat(i, dim=0) for i in m]
        
        return X, y, m
        
    else:
        X = [[]]
        y = [[]]
        m = [[]]
    
        count = 0
        
        for d_l in bagging_data_list:
            for it, (inputs, labels, mask) in enumerate(d_l):
                X[count].append(inputs)
                y[count].append(labels)
                m[count].append(mask)
            count += 1
    
        #print(X)
        #print(len(X))
    
        X = [torch.cat(i, dim=0) for i in X]
        y = [torch.cat(i, dim=0) for i in y]
        m = [torch.cat(i, dim=0) for i in m]
        
        return X, y, m

def train(n, split, n_bagging, whole_data, model, lossfun, optimizer, \
          device, num_epochs, batch_size):
    train_loss_ = []
    test_loss_ = []
    train_acc_ = []
    test_acc_ = []

    bagging_train_data_list = []
    bagging_test_data_list = []

    # for training, we need to train on different dataset
    for i in range(n_bagging):
        train_loader, test_loader = data_loader(preprocessing(n, whole_data), split, batch_size)
        bagging_train_data_list.append(train_loader)
        bagging_test_data_list.append(test_loader)

    #print(bagging_train_data_list[0])
    #print(len(bagging_train_data_list[1]))
    
    # X, y, m = cook_data(bagging_train_data_list, n_bagging)
    # print(len(X))
    # print((X[0]))
    # print((X[0].shape))
    # print((X[1].shape))
    # print(len(X[1]))
    
    # irerate over training epochs
    for epoch in range(num_epochs):
        # at each step, we do a training epoch and evaluate on train and test data
        train_epoch(bagging_train_data_list, model, lossfun, optimizer, device, n_bagging)
        train_loss, train_acc = evaluate(bagging_train_data_list, model, lossfun, device, n_bagging, outer_eval=False)
        train_loss_.append(train_loss)
        train_acc_.append(train_acc)
        test_loss, test_acc = evaluate(bagging_test_data_list, model, lossfun, device, n_bagging, outer_eval=False)
        test_loss_.append(test_loss)
        test_acc_.append(test_acc)

        print(f'Epoch: {epoch+1:3d}/{num_epochs:3d} '
              f'Training Loss: {train_loss_[epoch]:.3f}, Testing Loss: {test_loss_[epoch]:.3f}, '
              f'Training Acc: {train_acc_[epoch]:.3f}, Testing Acc: {test_acc_[epoch]:.3f}')

    return train_loss_, train_acc_, test_loss_, test_acc_

In [28]:
lstm1 = LSTMClassifier(128, 128, 500)
lstm2 = LSTMClassifier(128, 64, 500, num_rec_layers=2)
lstm3 = LSTMClassifier(128, 256, 500)

lstm1.to(device)
lstm2.to(device)
lstm3.to(device)
num_of_bagging = 3

lstm_list = [lstm1, lstm2, lstm3]

lstm = Ensemble(lstm_list, num_of_bagging)
lstm

Ensemble(
  (ensembles): ModuleList(
    (0): LSTMClassifier(
      (word_embeddings): Embedding(500, 128)
      (rnn1): LSTM(128, 128, batch_first=True)
      (sigmoid): Sigmoid()
      (hidden2label): Linear(in_features=128, out_features=1, bias=True)
    )
    (1): LSTMClassifier(
      (word_embeddings): Embedding(500, 128)
      (rnn1): LSTM(128, 64, batch_first=True)
      (rnn2): LSTM(64, 64, batch_first=True)
      (sigmoid): Sigmoid()
      (hidden2label): Linear(in_features=64, out_features=1, bias=True)
    )
    (2): LSTMClassifier(
      (word_embeddings): Embedding(500, 128)
      (rnn1): LSTM(128, 256, batch_first=True)
      (sigmoid): Sigmoid()
      (hidden2label): Linear(in_features=256, out_features=1, bias=True)
    )
  )
  (sigmoid): Sigmoid()
  (final): Linear(in_features=3, out_features=3, bias=True)
)

In [31]:
learning_rate = 0.01
optimizer = optim.RMSprop(lstm.parameters(), lr=learning_rate)

lossfun = nn.BCELoss(reduction='mean')

In [32]:
# let's first train a vanilla RNN
batch_size = 64
a, b, c, d = train(3000, 0.2, num_of_bagging, f_data, lstm, lossfun, \
                   optimizer, device, num_epochs=15, batch_size = batch_size)

Epoch:   1/ 15 Training Loss: 0.572, Testing Loss: 0.569, Training Acc: 0.760, Testing Acc: 0.770
Epoch:   2/ 15 Training Loss: 0.555, Testing Loss: 0.552, Training Acc: 0.778, Testing Acc: 0.784
Epoch:   3/ 15 Training Loss: 0.544, Testing Loss: 0.538, Training Acc: 0.778, Testing Acc: 0.784
Epoch:   4/ 15 Training Loss: 0.538, Testing Loss: 0.528, Training Acc: 0.776, Testing Acc: 0.784
Epoch:   5/ 15 Training Loss: 0.534, Testing Loss: 0.526, Training Acc: 0.777, Testing Acc: 0.784
Epoch:   6/ 15 Training Loss: 0.530, Testing Loss: 0.523, Training Acc: 0.777, Testing Acc: 0.784
Epoch:   7/ 15 Training Loss: 0.531, Testing Loss: 0.526, Training Acc: 0.777, Testing Acc: 0.784
Epoch:   8/ 15 Training Loss: 0.528, Testing Loss: 0.521, Training Acc: 0.777, Testing Acc: 0.784
Epoch:   9/ 15 Training Loss: 0.524, Testing Loss: 0.519, Training Acc: 0.777, Testing Acc: 0.784
Epoch:  10/ 15 Training Loss: 0.524, Testing Loss: 0.517, Training Acc: 0.777, Testing Acc: 0.784


KeyboardInterrupt: 

In [32]:
# new_bagging_train_data_list = []
# new_bagging_test_data_list = []

# new_train_loader, new_test_loader = data_loader(preprocessing(3000, f_data), 0.2, batch_size)

# for i in range(num_of_bagging):
#     new_bagging_train_data_list.append(new_train_loader)
#     new_bagging_test_data_list.append(new_test_loader)

In [33]:
# test_loss, test_acc = evaluate(new_bagging_train_data_list, rnn, lossfun, device, num_of_bagging, outer_eval=False)
# print(test_loss)
# print(test_acc)