In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import MinMaxScaler

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

**Data Preprocessing Based on Luke's Result**

In [3]:
f_data = pd.read_csv('no_dup_data.csv')

In [4]:
f_data.head(10)

Unnamed: 0.1,Unnamed: 0,customer_id,account_id,ed_id,event_name,Date,Time,journey_steps_until_end
0,0,-784961211,1773350293,12,application_web_approved,2023-03-22,08:45:22,1
1,1,-784961211,1773350293,19,application_web_view,2023-03-22,13:32:10,2
2,14,-784961211,1773350293,3,application_web_submit,2023-03-22,13:32:10,3
3,15,-784961211,1773350293,2,campaign_click,2023-03-22,14:45:22,4
4,16,-784961211,1773350293,19,application_web_view,2023-07-27,14:57:56,5
5,21,-784961211,1773350293,19,application_web_view,2023-08-29,16:01:06,6
6,24,15849251,383997507,4,browse_products,2021-11-04,14:11:15,1
7,25,15849251,383997507,4,browse_products,2021-11-04,14:11:29,2
8,26,15849251,383997507,4,browse_products,2021-11-04,14:12:10,3
9,27,15849251,383997507,4,browse_products,2021-11-04,14:12:21,4


In [5]:
f_data.shape

(55853910, 8)

In [308]:
# a helper function to randomly sample from the original data for further RNN 
def bagging_func(n, whole_data):
    account_ids = whole_data['account_id'].unique()
    
    sub_account_id = np.random.choice(account_ids, size=n, replace=False)
    sample_f_data = whole_data[whole_data['account_id'].isin(sub_account_id)]

    return sample_f_data

In [309]:
# getting all journeys for each account
def journey_list(sample_f_data):
    ed_id_lists = sample_f_data.groupby(['account_id'])['ed_id'].apply(list).reset_index()
    
    max_length = ed_id_lists['ed_id'].apply(len).max()
    ed_id_lists['ed_id'] = ed_id_lists['ed_id'].apply(lambda x: x + [0] * (max_length - len(x)))
    
    return ed_id_lists

In [310]:
# the code of Luke, possible future modifications, which defines what is the ultimate result
def event_label(x):
    act_list = np.array([29,12,15])
    ord_list = np.array([7,18])

    is_act = False
    is_ord = False

    # parse and get condition checks
    # if any activated, 
    if any(np.in1d(x,act_list)):
        is_act = True
    if any(np.in1d(x,ord_list)):
        is_ord = True

    # activated only
    if is_act == True and is_ord == False:
        return "Activated, No Order"

    # ordered only
    elif is_act == False and is_ord == True:
        return "Ordered, Not Activated"

    # activated and ordered
    elif is_act == True and is_ord == True:
        return "Activated and Ordered"
    
    # accounts neither fit
    else:
        return "Neither"

# vectorize function
event_label_vec = np.vectorize(event_label)

In [311]:
# discard useless labels, getting only the necessary labels for RNN model
def binary_labels(ed_id_lists):
    necessary_labels = ["Activated, No Order", "Activated and Ordered"]
    ed_id_lists = ed_id_lists[ed_id_lists['customer_label'].isin(necessary_labels)]

    return ed_id_lists

In [312]:
# turning `customer_label` to true labels
def true_labels(ed_id_lists):
    codes, uniques = pd.factorize(ed_id_lists['customer_label'])
    ed_id_lists['customer_label'] = codes

    return ed_id_lists

In [313]:
# combining the previous functions together
def preprocessing(n, f_data):
    sample_f_data = bagging_func(n, f_data)
    sample_f_data = sample_f_data[['customer_id', 'account_id', 'ed_id', 'journey_steps_until_end']]
    
    ed_id_lists = journey_list(sample_f_data)
    ed_id_lists["customer_label"] = event_label_vec(ed_id_lists["ed_id"])

    ed_id_lists = binary_labels(ed_id_lists)
    ed_id_lists = true_labels(ed_id_lists)

    return ed_id_lists

In [314]:
# take part of the dataset
ed_id_lists = preprocessing(1000, f_data)
ed_id_lists.head(10)

Unnamed: 0,account_id,ed_id,customer_label
0,-2142472542,"[1, 21, 22, 2, 12, 1, 4, 11, 1, 5, 4, 11, 5, 6...",0
1,-2137698057,"[12, 4, 4, 24, 24, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...",1
2,-2135733428,"[12, 19, 24, 1, 21, 1, 1, 21, 1, 21, 1, 1, 0, ...",1
3,-2130983468,"[2, 4, 11, 5, 6, 19, 19, 19, 3, 19, 12, 19, 6,...",0
4,-2124229342,"[12, 4, 24, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",1
5,-2119376334,"[21, 2, 22, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
6,-2113160832,"[29, 12, 4, 11, 5, 6, 7, 1, 2, 4, 0, 0, 0, 0, ...",0
7,-2111359975,"[1, 19, 19, 19, 19, 12, 4, 4, 4, 4, 29, 4, 4, ...",0
8,-2106361931,"[2, 6, 19, 19, 19, 19, 19, 19, 3, 12, 6, 19, 1...",1
9,-2097829422,"[12, 21, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",1


In [315]:
ed_id_lists.customer_label.unique()

array([0, 1])

In [316]:
ed_id_lists['customer_label'].describe()

count    1000.000000
mean        0.774000
std         0.418448
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: customer_label, dtype: float64

**Building RNN Model**

In [295]:
# adding masks for each observation
X = ed_id_lists['ed_id']  # Features
X = X.values.tolist()
mask = [[1 if x > 0 else x for x in sublist] for sublist in X]
y = ed_id_lists['customer_label']  # Target variable
y = y.values.tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, mask_train, mask_test = train_test_split(X, y, mask, test_size=0.2, random_state=42)

In [296]:
train_dset = torch.utils.data.TensorDataset(torch.tensor(X_train,
                                                         dtype=torch.long),
                                            torch.tensor(y_train,
                                                         dtype=torch.long),
                                            torch.tensor(mask_train,
                                                         dtype=torch.float))

test_dset = torch.utils.data.TensorDataset(torch.tensor(X_test,
                                                        dtype=torch.long),
                                          torch.tensor(y_test,
                                                        dtype=torch.long),
                                          torch.tensor(mask_test,
                                                        dtype=torch.float))

In [297]:
# getting data loader for training and predicting process
batch_size = 128

train_loader = torch.utils.data.DataLoader(train_dset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2
                         )

test_loader = torch.utils.data.DataLoader(test_dset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=2
                         )

In [298]:
# small test of correctness
x, y, m = next(iter(train_loader))
print(x.shape, y.shape, m.shape)

print(x)
print(x.shape)
print(y)
print(y.shape)
print(m)
print(m.shape)

torch.Size([128, 345]) torch.Size([128]) torch.Size([128, 345])
tensor([[ 2, 12,  4,  ...,  0,  0,  0],
        [12,  2,  1,  ...,  0,  0,  0],
        [21, 15,  1,  ...,  0,  0,  0],
        ...,
        [ 2,  5,  4,  ...,  0,  0,  0],
        [ 2, 12,  4,  ...,  0,  0,  0],
        [ 4, 11,  5,  ...,  0,  0,  0]])
torch.Size([128, 345])
tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
        0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0])
torch.Size([128])
tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 

In [299]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [300]:
class RNNClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, \
                 num_rec_layers=1, rec_layer=nn.RNN):
        super(RNNClassifier, self).__init__()
        # define all layers we need,
        # their parameters will be initialized automatically

        # nn.Embedding layer turns input sentences into word embeddings
        # with input and output dimension given by vocab_size and embedding_dim
        # self.word_embeddings = nn.Embedding(embedding_dim)

        # depending on the value of num_rec_layers, the corresponding number
        # of rec_layers (either RNN or LSTM) with batch_first=True and hidden
        # dimension given by hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.num_rec_layers = num_rec_layers
        self.rnn1 = rec_layer(embedding_dim, hidden_dim, batch_first=True)
        if self.num_rec_layers == 2:
            self.rnn2 = rec_layer(hidden_dim, hidden_dim, batch_first=True)

        # a final linear layer with sigmoid activation with input and output
        # dimension given by hidden_dim and 1.
        self.sigmoid = nn.Sigmoid()
        self.hidden2label = nn.Linear(hidden_dim, 1)

    def forward(self, sentences, mask):
        # sentences shape: [B, L], mask shape: [B, L]
        # embedding = word_embeddings(embedding_dim)# call your embedding layer, output shape: [B, L, DE]
        embedding = self.word_embeddings(sentences)
        out, hidden = self.rnn1(embedding)# call your rnn1, output shape: [B, L, DH]
        if self.num_rec_layers == 2:
            out, hidden = self.rnn2(out, hidden)# call your rnn2, output shape: [B, L, DH]
        out = (out*mask[:, :, None]).mean(dim=1) # shape: [B, DH]
        res = self.sigmoid(self.hidden2label(out))# call your hidden2label, output shape: [B, 1]
        #print(res)
        #print(res.shape)
        return res

In [244]:
class Ensemble(nn.Module):
    def __init__(self, modelA, modelB):
        super(Ensemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.classifier = nn.Linear(2, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x1, x2):
        A = self.modelA(x1, x2)
        B = self.modelB(x1, x2)
        #print(A)
        #print(B)
        x = torch.cat((A, B), dim=1)
        x = self.classifier(self.sigmoid(x))
        return x

In [245]:
# create a particular instance of the model, do a 1 layer vaniila RNN with embedding_dim=hidden_dim=128
rnn1 = RNNClassifier(128, 128, 500)
rnn1.to(device)

rnn2 = RNNClassifier(128, 128, 500)
rnn2.to(device)

rnn = Ensemble(rnn1, rnn2)

In [246]:
# an EmbeddingBag module containing 10 tensors of size 3
embedding_sum = nn.EmbeddingBag(2, 3, mode='sum')
# a batch of 2 samples of 4 indices each
input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long)
offsets = torch.tensor([1, 4, 4, 5, 6, 4, 3, 1], dtype=torch.long)
embedding_sum(input, offsets)

RuntimeError: offsets[0] has to be 0, i.e., the first sequence in the mini-batch has to start from position 0. However, got 1
[ torch.LongTensor{} ]

In [216]:
embedding_sum

EmbeddingBag(10, 3, mode='sum', padding_idx=2)

In [247]:
# demo forward pass with the mini-batch that we generated above
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.to(device)
y = y.to(device)
m = m.to(device)

y_pred = rnn(x, m)
print(y_pred.shape)
print(y_pred[:10])

torch.Size([128, 1])
tensor([[-0.1780],
        [-0.1796],
        [-0.1783],
        [-0.1787],
        [-0.1784],
        [-0.1799],
        [-0.1787],
        [-0.1785],
        [-0.1781],
        [-0.1832]], grad_fn=<SliceBackward0>)


In [210]:
learning_rate = 0.01
optimizer =optim.RMSprop(rnn.parameters(), lr=learning_rate)

lossfun = nn.BCELoss(reduction='mean')

In [211]:
def train_epoch(train_loader, model, lossfun, optimizer, device):
    model.train()
    # iterate over mini-batches
    for it, (inputs, labels, mask) in enumerate(train_loader):
        # move everything to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)

        model.zero_grad()
        # forward pass
        output = model(inputs, mask)
        #print(output)
        #print(output.size())
        #print(it)

        loss = lossfun(output.view(-1), labels.float())
        # backward pass
        loss.backward()

        # update model parameters
        optimizer.step()

def evaluate(loader, model, lossfun, device):
    model.eval()
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    # iterate over mini-batches
    for it, (inputs, labels, mask) in enumerate(loader):
        # move everything to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)

        # forward pass
        output = model(inputs, mask)

        # calculate loss value
        loss = lossfun(output.view(-1), labels.float())
        total_loss += loss.item()

        # calculate test accuracy
        pred = output.view(-1) > 0.5
        correct = (pred == labels.bool())
        total_acc += torch.sum(correct).item() / len(correct)

    total = it + 1
    return total_loss / total, total_acc / total


def train(train_loader, test_loader, model, lossfun, optimizer, \
          device, num_epochs):
    train_loss_ = []
    test_loss_ = []
    train_acc_ = []
    test_acc_ = []
    # irerate over training epochs
    for epoch in range(num_epochs):
        # at each step, we do a training epoch and evaluate on train and test data
        train_epoch(train_loader, model, lossfun, optimizer, device)
        train_loss, train_acc = evaluate(train_loader, model, lossfun, device)
        train_loss_.append(train_loss)
        train_acc_.append(train_acc)
        test_loss, test_acc = evaluate(test_loader, model, lossfun, device)
        test_loss_.append(test_loss)
        test_acc_.append(test_acc)

        print(f'Epoch: {epoch+1:3d}/{num_epochs:3d} '
              f'Training Loss: {train_loss_[epoch]:.3f}, Testing Loss: {test_loss_[epoch]:.3f}, '
              f'Training Acc: {train_acc_[epoch]:.3f}, Testing Acc: {test_acc_[epoch]:.3f}')

    return train_loss_, train_acc_, test_loss_, test_acc_

In [212]:
# let's first train a vanilla RNN
a, b, c, d = train(train_loader, test_loader, rnn, lossfun, \
                   optimizer, device, num_epochs=20)

RuntimeError: all elements of input should be between 0 and 1

In [91]:
new_sample_f_data = bagging_func(1000, account_ids)

In [92]:
new_sample_f_data = new_sample_f_data[['customer_id', 'account_id', 'ed_id', 'journey_steps_until_end']]

In [93]:
# getting all journeys for each account
new_ed_id_lists = new_sample_f_data.groupby(['account_id'])['ed_id'].apply(list).reset_index()

max_length = new_ed_id_lists['ed_id'].apply(len).max()
new_ed_id_lists['ed_id'] = new_ed_id_lists['ed_id'].apply(lambda x: x + [0] * (max_length - len(x)))

new_ed_id_lists

Unnamed: 0,account_id,ed_id
0,-2142286233,"[4, 4, 19, 19, 3, 19, 19, 12, 3, 19, 3, 19, 12..."
1,-2140681834,"[2, 12, 1, 4, 4, 4, 4, 4, 5, 4, 11, 5, 5, 6, 5..."
2,-2136749248,"[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,-2128701787,"[12, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,-2121857580,"[19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 3, 12..."
...,...,...
995,2120394917,"[2, 12, 24, 24, 1, 21, 1, 21, 1, 1, 1, 1, 1, 1..."
996,2122558653,"[12, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 21, 21, ..."
997,2127192426,"[29, 12, 7, 5, 11, 8, 4, 6, 2, 4, 27, 28, 0, 0..."
998,2138084987,"[12, 18, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0, 0,..."


In [94]:
new_event_label_vec = np.vectorize(event_label)
new_ed_id_lists["customer_label"] = new_event_label_vec(new_ed_id_lists["ed_id"])

In [95]:
ed_id_lists.head()

Unnamed: 0,account_id,ed_id,customer_label
0,-2147441542,"[22, 2, 19, 19, 19, 19, 19, 3, 12, 4, 1, 1, 1,...",0
1,-2146729686,"[2, 19, 19, 19, 3, 19, 12, 19, 4, 4, 4, 4, 4, ...",1
2,-2146530361,"[29, 2, 19, 19, 19, 19, 19, 19, 3, 19, 3, 12, ...",1
3,-2146371138,"[29, 2, 12, 4, 4, 11, 5, 4, 11, 5, 6, 7, 8, 27...",1
4,-2145897751,"[21, 12, 2, 29, 4, 4, 4, 4, 4, 4, 4, 11, 1, 5,...",1


In [96]:
necessary_labels = ["Activated, No Order", "Activated and Ordered"]

In [97]:
new_ed_id_lists = new_ed_id_lists[new_ed_id_lists['customer_label'].isin(necessary_labels)]

In [98]:
codes, uniques = pd.factorize(new_ed_id_lists['customer_label'])
new_ed_id_lists['customer_label'] = codes

In [99]:
new_ed_id_lists.head(10)

Unnamed: 0,account_id,ed_id,customer_label
0,-2142286233,"[4, 4, 19, 19, 3, 19, 19, 12, 3, 19, 3, 19, 12...",0
1,-2140681834,"[2, 12, 1, 4, 4, 4, 4, 4, 5, 4, 11, 5, 5, 6, 5...",0
2,-2136749248,"[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
3,-2128701787,"[12, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
4,-2121857580,"[19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 3, 12...",0
5,-2118091065,"[19, 19, 3, 19, 19, 3, 19, 19, 3, 19, 12, 1, 1...",0
6,-2115722991,"[2, 12, 24, 1, 21, 1, 5, 5, 4, 4, 4, 21, 24, 2...",0
7,-2108323954,"[21, 2, 12, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 11, ...",0
8,-2106285406,"[29, 15, 18, 27, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0...",1
9,-2103642482,"[2, 22, 12, 1, 1, 21, 1, 1, 1, 1, 21, 1, 1, 0,...",0


In [100]:
new_X = new_ed_id_lists['ed_id']  # Features
new_X = new_X.values.tolist()
new_mask = [[1 if x > 0 else x for x in sublist] for sublist in new_X]
new_y = new_ed_id_lists['customer_label']  # Target variable
new_y = new_y.values.tolist()

In [101]:
# need to get the data from Luke and then perform testing part
test_dataset = torch.utils.data.TensorDataset(torch.tensor(new_X,
                                                        dtype=torch.long),
                                          torch.tensor(new_y,
                                                        dtype=torch.long),
                                          torch.tensor(new_mask,
                                                        dtype=torch.float))

In [102]:
new_import_test_loader = torch.utils.data.DataLoader(test_dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=2
                         )

In [103]:
test_loss, test_acc = evaluate(new_import_test_loader, rnn, lossfun, device)
print(test_loss)
print(test_acc)

0.1267022704705596
0.9788161057692307
