In [1]:
# !python -m spacy download ru_core_news_lg

In [2]:
# !pip install performer_pytorch

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from tqdm import tqdm
from sklearn.model_selection import GroupKFold

import pytorch_lightning as pl
from performer_pytorch import PerformerLM

import torch
import torch.nn as nn
from torch.profiler import profile, record_function, ProfilerActivity

import os
import datetime

import utils

%load_ext autoreload
%autoreload 1
%aimport utils

In [4]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [5]:
data_folder = '../data/'

In [6]:
df = utils.read_preprocessed_financial_data(data_folder, enc_cols=['mcc_description', 'tr_description'])
df.shape 

(3751083, 10)

In [7]:
df.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id,gender,mcc_description,tr_description,week
0,39026145.0,0,4814.0,1030,-2245.92,311690,1.0,"[-0.102810316, -0.12563597, -0.11075681, -0.06...","[0.07799721, 0.21862344, 0.1882922, -0.1851609...",0
1,39026145.0,13328582400,4814.0,1030,-5614.79,311690,1.0,"[-0.102810316, -0.12563597, -0.11075681, -0.06...","[0.07799721, 0.21862344, 0.1882922, -0.1851609...",0
2,39026145.0,17673984000,4814.0,1030,-1122.96,311690,1.0,"[-0.102810316, -0.12563597, -0.11075681, -0.06...","[0.07799721, 0.21862344, 0.1882922, -0.1851609...",1
3,39026145.0,30168547200,4814.0,1030,-2245.92,311690,1.0,"[-0.102810316, -0.12563597, -0.11075681, -0.06...","[0.07799721, 0.21862344, 0.1882922, -0.1851609...",1
4,39026145.0,48592051200,4814.0,1030,-2245.92,311690,1.0,"[-0.102810316, -0.12563597, -0.11075681, -0.06...","[0.07799721, 0.21862344, 0.1882922, -0.1851609...",1


In [8]:
clients = df['customer_id'].unique()
clients = clients[:100] # drop this row (I just tested the code with small amount of data)

df_week = []

for client_id in tqdm(clients):
    client_data = df[df['customer_id'] == client_id]
    client_weeks = client_data.week.unique()
    
    for week in client_weeks:
        client_data_week = client_data[client_data['week'] == week]
        df_week.append(
            {
                'transactions': client_data_week['mcc_code'].tolist(),
                'amounts': client_data_week['amount'].tolist(),
                'tr_datetime': client_data_week['tr_datetime'].tolist(),
                'tr_type': client_data_week['tr_type'].tolist(),
                'mcc_description': client_data_week['mcc_description'].tolist(), # add mcc_description embeddings
                'tr_description': client_data_week['tr_description'].tolist(), # add tr_description embeddings
                'term_id': client_data_week['term_id'].tolist(),
                'customer_id': client_id, 
                'week': week,
                'gender': client_data_week['gender'].tolist()
            }
        )

100%|██████████| 100/100 [00:06<00:00, 16.58it/s]


In [9]:
df_week = pd.DataFrame(df_week)
df_week.shape

(5990, 10)

In [10]:
df_week.head(3)

Unnamed: 0,transactions,amounts,tr_datetime,tr_type,mcc_description,tr_description,term_id,customer_id,week,gender
0,"[4814.0, 4814.0, 5499.0, 5499.0, 5499.0, 5499....","[-2245.92, -5614.79, -1392.47, -920.83, -1010....","[0, 13328582400, 3317241600, 9680601600, 14449...","[1030, 1030, 1010, 1010, 1010, 1010, 1010, 101...","[[-0.102810316, -0.12563597, -0.11075681, -0.0...","[[0.07799721, 0.21862344, 0.1882922, -0.185160...","[311690, 311690, 311690, 311690, 311690, 31169...",39026145.0,0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,"[4814.0, 4814.0, 4814.0, 4814.0, 4814.0, 5499....","[-1122.96, -2245.92, -2245.92, -2245.92, -2245...","[17673984000, 30168547200, 48592051200, 487840...","[1030, 1030, 1030, 1030, 1030, 1010, 1010, 101...","[[-0.102810316, -0.12563597, -0.11075681, -0.0...","[[0.07799721, 0.21862344, 0.1882922, -0.185160...","[311690, 311690, 311690, 311690, 311690, 31169...",39026145.0,1,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,"[4814.0, 4814.0, 4814.0, 5331.0, 5331.0, 5331....","[-2245.92, -449.18, -1122.96, -6288.56, -1122....","[42837984000, 79248240000, 79340169600, 0, 0, ...","[1030, 1030, 1030, 1110, 1110, 1110, 1110, 101...","[[-0.102810316, -0.12563597, -0.11075681, -0.0...","[[0.07799721, 0.21862344, 0.1882922, -0.185160...","[311690, 311690, 311690, 311690, 311690, 31169...",39026145.0,2,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [11]:
MIN_LEN = 20
MAX_LEN = 50

lens = df_week.transactions.apply(lambda x: len(x))
df_week = df_week[(lens >= MIN_LEN) & (lens <= MAX_LEN)]
df_week.shape

(598, 10)

In [12]:
features = ['transactions', 'amounts', 'tr_datetime', 'tr_type', 'term_id', 'mcc_description'] #, 'tr_description'] 
desc_cols = ['mcc_description', 'tr_description']

INPUT_SIZE = len(list(set(features) - set(desc_cols)))
if 'mcc_description' in features:
    INPUT_SIZE += len(df['mcc_description'].iloc[0])
if 'tr_description' in features:
    INPUT_SIZE += len(df['tr_description'].iloc[0])
    
EPOCHS = 10
N_SPLITS = 5

In [13]:
INPUT_SIZE

305

# Performer

In [14]:
class ClassificationModel(pl.LightningModule):
    def __init__(self, model, train_data, test_data, batch_size=64, learning_rate=1e-3):
        super(ClassificationModel, self).__init__()
        self.model = model

        self.batch_size = batch_size
        self.loss_function = nn.BCELoss()

        self.train_data = train_data
        self.val_data = test_data

        self.learning_rate = learning_rate

    def forward(self, inputs):
        print('in forward')
        print('without resize')
        try:
            a = self.model(inputs)
            print(a)
        except:
            print('a = self.model(inputs) does not work')
            try:
                a = self.model(inputs.float())
                print(a)
            except:
                print('a = self.model(inputs.float()) does not work')
                try:
                    a = self.model(inputs.long())
                    print(a)
                except:
                    print('a = self.model(inputs.long()) does not work')
                    print('\n\ntry with resize')
                    inputs = inputs.resize(inputs.size()[0]*inputs.size()[1], inputs.size()[-1])
                    print('new inputs size', inputs.size())
                    try:
                        a = self.model(inputs)
                        print(a)
                    except:
                        print('a = self.model(inputs) does not work')
                        try:
                            a = self.model(inputs.float())
                            print(a)
                        except:
                            print('a = self.model(inputs.float()) does not work')
                            try:
                                a = self.model(inputs.long())
                                print(a)
                            except:
                                print('a = self.model(inputs.long()) does not work')
        inputs = inputs.resize(inputs.size()[0]*inputs.size()[1], inputs.size()[-1])
        return self.model(inputs)

    @staticmethod
    def calculate_metrics(target, y_pred):
        target = target.detach().cpu().numpy()
        y_pred = y_pred.detach().cpu().numpy()
        acc = accuracy_score(target, y_pred > 0.5)

        try:
            roc_auc = roc_auc_score(target, y_pred)
        except ValueError:
            roc_auc = acc
        pr_auc = average_precision_score(target, y_pred)

        return acc, roc_auc, pr_auc

    def training_step(self, batch, batch_idx):
        print('in training step')
        sample, target = batch
        print(sample, '\n------------------------------------------------------------------\n', target)
        pred = self.forward(sample.long())
        print('pred\n', pred)

        train_loss = self.loss_function(pred.squeeze(), target.float())
        train_accuracy = (target == (pred.squeeze() > 0.5)).float().mean()

        self.log("train_loss", train_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log(
            "train_acc", train_accuracy, on_step=False, on_epoch=True, prog_bar=True
        )

        return train_loss

    def validation_step(self, batch, batch_idx):
        print('in validation step')
        sample, target = batch
        print(sample, '\n------------------------------------------------------------------\n', target)
        #pred = self.forward(sample.long())
        pred = self.forward(sample)

        val_loss = self.loss_function(pred.squeeze(), target.float())
        val_accuracy = (target == (pred.squeeze() > 0.5)).float().mean()

        self.log("val_loss", val_loss, prog_bar=True)
        self.log("val_acc", val_accuracy, prog_bar=True)

        return {
            "val_loss": val_loss,
            "val_acc": val_accuracy,
            "val_target": target,
            "val_predictions": pred,
        }

    def validation_epoch_end(self, outputs):
        predictions = torch.cat([x["val_predictions"] for x in outputs])
        target = torch.cat([x["val_target"] for x in outputs])

        accuracy, roc_auc, pr_auc = self.calculate_metrics(
            target.squeeze(), predictions.squeeze()
        )

        log_dict = {
            "mean_accuracy": accuracy,
            "mean_roc_auc": roc_auc,
            "mean_pr_auc": pr_auc,
        }

        for k, v in log_dict.items():
            self.log(k, v, prog_bar=True)

    def configure_optimizers(self):
        opt = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        return opt

    def train_dataloader(self):
        train_dataloader = torch.utils.data.DataLoader(
            self.train_data, batch_size=self.batch_size, shuffle=True, num_workers=4
        )
        return train_dataloader

    def val_dataloader(self):
        val_dataloader = torch.utils.data.DataLoader(
            self.val_data, batch_size=self.batch_size, shuffle=False, num_workers=4
        )
        return val_dataloader

In [15]:
base_model = PerformerLM(
    num_tokens = INPUT_SIZE,
    max_seq_len = 2048,
    dim = 1,
    heads = 1, 
    depth = 1
)

In [16]:
group_kfold = GroupKFold(n_splits=N_SPLITS)
metrics = {'Accuracy': [], 'ROC AUC': [], 'PR AUC': []}
for i, (train_index, test_index) in enumerate(group_kfold.split(X=df_week, groups=df_week['customer_id'])):
    train_data = df_week.iloc[train_index]
    test_data = df_week.iloc[test_index]

    random.seed(123)
    val_customer_id = random.sample(train_data['customer_id'].unique().tolist(), int(train_data['customer_id'].nunique() * 0.3))
    val_data = train_data[train_data['customer_id'].isin(val_customer_id)]
    train_data = train_data[~train_data['customer_id'].isin(val_customer_id)]

    train_dataset = utils.create_dataset(train_data, features, batch_size=64, shuffle=True)
    val_dataset = utils.create_dataset(val_data, features, batch_size=64, shuffle=False)
    test_dataset = utils.create_dataset(test_data, features, batch_size=64, shuffle=False)    

    model = ClassificationModel(
        model=base_model,
        train_data=train_dataset,
        test_data=val_dataset
    )

    current_time = datetime.datetime.now().strftime("%m%d%Y_%H:%M:%S")
    experiment_name = "Performer_" + str(EPOCHS) + "_" + current_time

    logger = pl.loggers.TensorBoardLogger(save_dir='../logs/', name=experiment_name)

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        monitor='val_loss',
        dirpath=f'../logs/{experiment_name}',
        filename='{epoch:02d}-{val_loss:.3f}',
        mode='min')

    early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(
        monitor="val_loss", 
        min_delta=0.00, 
        patience=4, 
        verbose=False, 
        mode="min"
    )

    trainer = pl.Trainer(
        max_epochs=EPOCHS, 
        gpus=[1],  
        benchmark=True, 
        check_val_every_n_epoch=1, 
        logger=logger,
        callbacks=[checkpoint_callback, early_stop_callback])

    """if i == 0:
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True
        ) as prof:
        with torch.cuda.profiler.profile() as prof:
            with record_function("model_training"):
                trainer.fit(model)
        time_memory_consumption(prof.key_averages().table(), '../models/training_{}.txt'.format(experiment_name))
    else:"""
    trainer.fit(model)
    torch.save(model.model.state_dict(), '../models/{}.pth'.format(experiment_name))

    dict_logs = utils.plot_train_process(logger.log_dir)
    plt.show()

    """if i == 0:
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True
        ) as prof:
        with torch.cuda.profiler.profile() as prof:
            with record_function("model_testing"):
                test_predictions, test_targets, metric = utils.test_model(model, test_dataset)
        time_memory_consumption(prof.key_averages().table(), '../models/testing_{}.txt'.format(experiment_name))
    else:"""
    test_predictions, test_targets, metric = utils.test_model(model, test_dataset)

    metrics['Accuracy'].append(metric[0])
    metrics['ROC AUC'].append(metric[1])
    metrics['PR AUC'].append(metric[2])

  torch.tensor(df1[col].iloc[i]) for col in desc_cols_feat
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Missing logger folder: ../logs/Performer_10_12232021_09:14:51

  | Name          | Type        | Params
----------------------------------------------
0 | model         | PerformerLM | 1.2 K 
1 | loss_function | BCELoss     | 0     
----------------------------------------------
1.2 K     Trainable params
0         Non-trainable params
1.2 K     Total params
0.005     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]in validation step
tensor([[[ 4.8140e+03, -3.3689e+03,  4.0435e+11,  ..., -1.6361e-01,
           1.6467e-01, -2.9726e-01],
         [ 4.8140e+03, -3.3689e+03,  1.3151e+12,  ..., -1.6361e-01,
           1.6467e-01, -2.9726e-01],
         [ 4.8140e+03, -3.3689e+03,  4.3602e+11,  ..., -1.6361e-01,
           1.6467e-01, -2.9726e-01],
         ...,
         [ 4.8140e+03, -3.3689e+03,  3.1209e+10,  ..., -1.6361e-01,
           1.6467e-01, -2.9726e-01],
         [ 4.8140e+03, -4.4918e+03,  4.4605e+10,  ..., -1.6361e-01,
           1.6467e-01, -2.9726e-01],
         [ 4.8140e+03, -3.3689e+03,  6.6146e+11,  ..., -1.6361e-01,
           1.6467e-01, -2.9726e-01]],

        [[ 5.5410e+03, -2.8477e+04,  2.1724e+11,  ..., -2.1053e-01,
           3.4581e-01, -4.1103e-01],
         [ 4.1120e+03, -5.4508e+04,  6.3698e+11,  ...,  1.2251e-01,
           1.6645e-01, -1.8916e-01],
         [ 5.5410e+03, -3.0455e+04,  7.2277e+11,  ..., -2.1053e



a = self.model(inputs.long()) does not work


RuntimeError: requested resize to 2250595x305 (686431475 elements in total), but the given tensor has a size of 7379x305 (2250595 elements). autograd's resize can only change the shape of a given tensor, while preserving the number of elements. 

In [None]:
for metric_name, metric_val in metrics.items():
    print('{}: {} ± {}'.format(metric_name, np.array(metric_val).mean(), np.array(metric_val).std()))