In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import MinMaxScaler

#from scikeras.wrappers import KerasRegressor

#import tensorflow as tf
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense,Dropout,LSTM,GRU,SimpleRNN

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

**Data Preprocessing Based on Luke's Result**

In [5]:
f_data = pd.read_csv('no_dup_data.csv')

In [92]:
# take part of the dataset
sample_f_data = f_data[:1000]

In [7]:
sample_f_data = sample_f_data[['customer_id', 'account_id', 'ed_id', 'journey_steps_until_end']]

In [8]:
#sample_f_data.to_csv('sample_data.csv')

In [9]:
sample_f_data[sample_f_data['account_id'] == -1199609206].head(5)

Unnamed: 0,customer_id,account_id,ed_id,journey_steps_until_end
576,-1554036291,-1199609206,15,1


In [10]:
# getting all journeys for each account
ed_id_lists = sample_f_data.groupby(['customer_id', 'account_id'])['ed_id'].apply(list).reset_index()

max_length = ed_id_lists['ed_id'].apply(len).max()
ed_id_lists['ed_id'] = ed_id_lists['ed_id'].apply(lambda x: x + [0] * (max_length - len(x)))

ed_id_lists

Unnamed: 0,customer_id,account_id,ed_id
0,-2133016776,1442088117,"[12, 4, 3, 19, 24, 4, 1, 4, 11, 4, 5, 11, 6, 4..."
1,-2124872601,1448775641,"[29, 19, 19, 19, 19, 19, 19, 3, 12, 5, 5, 4, 4..."
2,-2124427253,-1861676870,"[19, 19, 19, 3, 19, 3, 19, 19, 12, 19, 2, 4, 4..."
3,-2103242255,2034732712,"[19, 19, 3, 19, 19, 3, 19, 19, 19, 3, 12, 4, 1..."
4,-2101937295,958613450,"[2, 6, 19, 19, 3, 19, 19, 3, 19, 19, 3, 19, 19..."
...,...,...,...
309,2093003863,1949987557,"[19, 19, 19, 3, 12, 19, 4, 11, 4, 4, 4, 11, 4,..."
310,2106847638,1673406310,"[1, 19, 19, 3, 19, 19, 12, 19, 4, 4, 4, 1, 4, ..."
311,2117096751,-1885164450,"[12, 3, 4, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
312,2143090910,-297097202,"[19, 19, 19, 19, 19, 19, 19, 19, 3, 19, 12, 4,..."


In [11]:
# the code of Luke, possible future modifications
def event_label(x):
    act_list = np.array([29,12,15])
    ord_list = np.array([7,18])

    is_act = False
    is_ord = False

    # parse and get condition checks
    # if any activated, 
    if any(np.in1d(x,act_list)):
        is_act = True
    if any(np.in1d(x,ord_list)):
        is_ord = True

    # activated only
    if is_act == True and is_ord == False:
        return "Activated, No Order"

    # ordered only
    elif is_act == False and is_ord == True:
        return "Ordered, Not Activated"

    # activated and ordered
    elif is_act == True and is_ord == True:
        return "Activated and Ordered"
    
    # accounts neither fit
    else:
        return "Neither"

# vectorize function
event_label_vec = np.vectorize(event_label)

In [12]:
ed_id_lists["customer_label"] = event_label_vec(ed_id_lists["ed_id"])

In [13]:
ed_id_lists.head()

Unnamed: 0,customer_id,account_id,ed_id,customer_label
0,-2133016776,1442088117,"[12, 4, 3, 19, 24, 4, 1, 4, 11, 4, 5, 11, 6, 4...","Activated, No Order"
1,-2124872601,1448775641,"[29, 19, 19, 19, 19, 19, 19, 3, 12, 5, 5, 4, 4...",Activated and Ordered
2,-2124427253,-1861676870,"[19, 19, 19, 3, 19, 3, 19, 19, 12, 19, 2, 4, 4...",Activated and Ordered
3,-2103242255,2034732712,"[19, 19, 3, 19, 19, 3, 19, 19, 19, 3, 12, 4, 1...",Activated and Ordered
4,-2101937295,958613450,"[2, 6, 19, 19, 3, 19, 19, 3, 19, 19, 3, 19, 19...","Activated, No Order"


In [14]:
ed_id_lists['customer_label'].describe()

count                     314
unique                      2
top       Activated, No Order
freq                      224
Name: customer_label, dtype: object

In [40]:
# turning `customer_label` to true labels
codes, uniques = pd.factorize(ed_id_lists['customer_label'])
ed_id_lists['customer_label'] = codes
ed_id_lists.shape

(314, 4)

In [16]:
# activated & ordered = 0, activated, no order = 1
ed_id_lists.head(10)

Unnamed: 0,customer_id,account_id,ed_id,customer_label
0,-2133016776,1442088117,"[12, 4, 3, 19, 24, 4, 1, 4, 11, 4, 5, 11, 6, 4...",0
1,-2124872601,1448775641,"[29, 19, 19, 19, 19, 19, 19, 3, 12, 5, 5, 4, 4...",1
2,-2124427253,-1861676870,"[19, 19, 19, 3, 19, 3, 19, 19, 12, 19, 2, 4, 4...",1
3,-2103242255,2034732712,"[19, 19, 3, 19, 19, 3, 19, 19, 19, 3, 12, 4, 1...",1
4,-2101937295,958613450,"[2, 6, 19, 19, 3, 19, 19, 3, 19, 19, 3, 19, 19...",0
5,-2026925668,-1100126228,"[19, 19, 19, 19, 19, 19, 3, 12, 4, 4, 4, 4, 4,...",0
6,-2020511375,-1948657330,"[15, 1, 1, 21, 1, 1, 1, 21, 1, 21, 0, 0, 0, 0,...",0
7,-2020232094,2097268175,"[4, 11, 1, 4, 4, 4, 5, 4, 2, 5, 5, 5, 4, 19, 1...",1
8,-1999776631,-141087342,"[29, 15, 18, 24, 27, 28, 0, 0, 0, 0, 0, 0, 0, ...",1
9,-1990855982,-1617121722,"[1, 29, 15, 18, 27, 28, 0, 0, 0, 0, 0, 0, 0, 0...",1


**Building RNN Model**

In [75]:
# adding masks for each observation
X = ed_id_lists['ed_id']  # Features
X = X.values.tolist()
mask = [[1 if x > 0 else x for x in sublist] for sublist in X]
y = ed_id_lists['customer_label']  # Target variable
y = y.values.tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, mask_train, mask_test = train_test_split(X, y, mask, test_size=0.2, random_state=42)

In [78]:
train_dset = torch.utils.data.TensorDataset(torch.tensor(X_train,
                                                         dtype=torch.long),
                                            torch.tensor(y_train,
                                                         dtype=torch.long),
                                            torch.tensor(mask_train,
                                                         dtype=torch.float))

test_dset = torch.utils.data.TensorDataset(torch.tensor(X_test,
                                                        dtype=torch.long),
                                          torch.tensor(y_test,
                                                        dtype=torch.long),
                                          torch.tensor(mask_test,
                                                        dtype=torch.float))

In [79]:
# getting data loader for training and predicting process
batch_size = 128

train_loader = torch.utils.data.DataLoader(train_dset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2
                         )

test_loader = torch.utils.data.DataLoader(test_dset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=2
                         )

In [84]:
# small test of correctness
x, y, m = next(iter(train_loader))
print(x.shape, y.shape, m.shape)

print(x)
print(x.shape)
print(y)
print(y.shape)
print(m)
print(m.shape)

torch.Size([128, 197]) torch.Size([128]) torch.Size([128, 197])
tensor([[29, 12,  1,  ...,  0,  0,  0],
        [15,  1,  1,  ...,  0,  0,  0],
        [19, 19, 19,  ...,  0,  0,  0],
        ...,
        [19, 19, 19,  ...,  0,  0,  0],
        [22,  2, 12,  ...,  0,  0,  0],
        [ 1,  1,  1,  ...,  0,  0,  0]])
torch.Size([128, 197])
tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 1])
torch.Size([128])
tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 

In [83]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [85]:
class RNNClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, \
                 num_rec_layers=1, rec_layer=nn.RNN):
        super(RNNClassifier, self).__init__()
        # define all layers we need,
        # their parameters will be initialized automatically

        # nn.Embedding layer turns input sentences into word embeddings
        # with input and output dimension given by vocab_size and embedding_dim
        # self.word_embeddings = nn.Embedding(embedding_dim)

        # depending on the value of num_rec_layers, the corresponding number
        # of rec_layers (either RNN or LSTM) with batch_first=True and hidden
        # dimension given by hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.num_rec_layers = num_rec_layers
        self.rnn1 = rec_layer(embedding_dim, hidden_dim, batch_first=True)
        if self.num_rec_layers == 2:
            self.rnn2 = rec_layer(hidden_dim, hidden_dim, batch_first=True)

        # a final linear layer with sigmoid activation with input and output
        # dimension given by hidden_dim and 1.
        self.sigmoid = nn.Sigmoid()
        self.hidden2label = nn.Linear(hidden_dim, 1)

    def forward(self, sentences, mask):
        # sentences shape: [B, L], mask shape: [B, L]
        # embedding = word_embeddings(embedding_dim)# call your embedding layer, output shape: [B, L, DE]
        embedding = self.word_embeddings(sentences)
        out, hidden = self.rnn1(embedding)# call your rnn1, output shape: [B, L, DH]
        if self.num_rec_layers == 2:
            out, hidden = self.rnn2(out, hidden)# call your rnn2, output shape: [B, L, DH]
        out = (out*mask[:, :, None]).mean(dim=1) # shape: [B, DH]
        res = self.sigmoid(self.hidden2label(out))# call your hidden2label, output shape: [B, 1]
        return res

In [86]:
# create a particular instance of the model, do a 1 layer vaniila RNN with embedding_dim=hidden_dim=128
rnn = RNNClassifier(128, 128, 500)
rnn.to(device)

RNNClassifier(
  (word_embeddings): Embedding(500, 128)
  (rnn1): RNN(128, 128, batch_first=True)
  (sigmoid): Sigmoid()
  (hidden2label): Linear(in_features=128, out_features=1, bias=True)
)

In [88]:
# demo forward pass with the mini-batch that we generated above
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.to(device)
y = y.to(device)
m = m.to(device)

y_pred = rnn(x, m)
print(y_pred.shape)
print(y_pred[:10])

torch.Size([128, 1])
tensor([[0.5108],
        [0.5127],
        [0.5154],
        [0.5106],
        [0.5128],
        [0.5177],
        [0.5006],
        [0.4672],
        [0.5112],
        [0.5112]], grad_fn=<SliceBackward0>)


In [89]:
learning_rate = 0.001
optimizer =optim.RMSprop(rnn.parameters(), lr=learning_rate)

lossfun = nn.BCELoss(reduction='mean')

In [90]:
def train_epoch(train_loader, model, lossfun, optimizer, device):
    model.train()
    # iterate over mini-batches
    for it, (inputs, labels, mask) in enumerate(train_loader):
        # move everything to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)

        model.zero_grad()
        # forward pass
        output = model(inputs, mask)

        loss = lossfun(output.view(-1), labels.float())
        # backward pass
        loss.backward()

        # update model parameters
        optimizer.step()

def evaluate(loader, model, lossfun, device):
    model.eval()
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    # iterate over mini-batches
    for it, (inputs, labels, mask) in enumerate(loader):
        # move everything to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)

        # forward pass
        output = model(inputs, mask)

        # calculate loss value
        loss = lossfun(output.view(-1), labels.float())
        total_loss += loss.item()

        # calculate test accuracy
        pred = output.view(-1) > 0.5
        correct = (pred == labels.bool())
        total_acc += torch.sum(correct).item() / len(correct)

    total = it + 1
    return total_loss / total, total_acc / total


def train(train_loader, test_loader, model, lossfun, optimizer, \
          device, num_epochs):
    train_loss_ = []
    test_loss_ = []
    train_acc_ = []
    test_acc_ = []
    # irerate over training epochs
    for epoch in range(num_epochs):
        # at each step, we do a training epoch and evaluate on train and test data
        train_epoch(train_loader, model, lossfun, optimizer, device)
        train_loss, train_acc = evaluate(train_loader, model, lossfun, device)
        train_loss_.append(train_loss)
        train_acc_.append(train_acc)
        test_loss, test_acc = evaluate(test_loader, model, lossfun, device)
        test_loss_.append(test_loss)
        test_acc_.append(test_acc)

        print(f'Epoch: {epoch+1:3d}/{num_epochs:3d} '
              f'Training Loss: {train_loss_[epoch]:.3f}, Testing Loss: {test_loss_[epoch]:.3f}, '
              f'Training Acc: {train_acc_[epoch]:.3f}, Testing Acc: {test_acc_[epoch]:.3f}')

    return train_loss_, train_acc_, test_loss_, test_acc_

In [91]:
# let's first train a vanilla RNN
a, b, c, d = train(train_loader, test_loader, rnn, lossfun, \
                   optimizer, device, num_epochs=30)

Epoch:   1/ 30 Training Loss: 0.670, Testing Loss: 0.630, Training Acc: 0.737, Testing Acc: 0.762
Epoch:   2/ 30 Training Loss: 0.634, Testing Loss: 0.606, Training Acc: 0.746, Testing Acc: 0.762
Epoch:   3/ 30 Training Loss: 0.612, Testing Loss: 0.587, Training Acc: 0.777, Testing Acc: 0.810
Epoch:   4/ 30 Training Loss: 0.606, Testing Loss: 0.578, Training Acc: 0.773, Testing Acc: 0.794
Epoch:   5/ 30 Training Loss: 0.599, Testing Loss: 0.586, Training Acc: 0.805, Testing Acc: 0.841
Epoch:   6/ 30 Training Loss: 0.605, Testing Loss: 0.564, Training Acc: 0.757, Testing Acc: 0.810
Epoch:   7/ 30 Training Loss: 0.586, Testing Loss: 0.582, Training Acc: 0.824, Testing Acc: 0.857
Epoch:   8/ 30 Training Loss: 0.579, Testing Loss: 0.548, Training Acc: 0.813, Testing Acc: 0.825
Epoch:   9/ 30 Training Loss: 0.568, Testing Loss: 0.565, Training Acc: 0.829, Testing Acc: 0.857
Epoch:  10/ 30 Training Loss: 0.572, Testing Loss: 0.534, Training Acc: 0.812, Testing Acc: 0.810
Epoch:  11/ 30 Train