In [1]:
# Record the version of the `python` executable found on the shell PATH for this run.
!python --version


Python 3.9.13


In [2]:
!python -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 --quiet
!python -m pip install pytorch-lightning==1.8.6 fsspec["s3"] python-dotenv boto3 s3fs==2022.11.0 tensorboard tensorflow --quiet

[0m

In [3]:
# Load the dotenv IPython extension and read variables from the file named `env`.
# Presumably this is where the AWS credentials read from os.environ in main() come from — confirm.
%load_ext dotenv
%dotenv env


In [4]:
# S3 URI of the project bucket; the same constant is assigned again in the run_lstm_model cell.
S3_BUCKET = 's3://dissertation-data-dmiller'

In [5]:
# %load npz_extractor.py
import os
import zipfile

import numpy as np

class NPZExtractor:
    """Download per-sequence-index ``.npz`` archives and expose them as memory-mapped arrays.

    For every sequence index ``0, 10, 20, ... <= n_sequences`` the extractor expects
    ``<input_path>/files_used_<n_files>/sequence_index_<i>.npz``. The same relative
    path is used as the S3 object key and as the local download target. Each archive
    is unpacked into a sibling directory ``sequence_index_<i>/`` holding ``arr_0.npy``.

    Args:
        input_path: Root directory (and S3 key prefix) of the dataset.
        n_files: Value interpolated into the ``files_used_<n_files>`` directory name.
        n_sequences: Largest sequence index to load (inclusive).
        s3_client: Object exposing ``download_file(bucket, key, filename)``.
        data_partition: If truthy, each returned array is truncated to this many rows.
        bucket: Name of the S3 bucket to download from.
    """

    # Archives exist for every tenth sequence index.
    SEQUENCE_STEP = 10
    # Archive member that is extracted and later memory-mapped.
    ARRAY_MEMBER = 'arr_0.npy'

    def __init__(self, input_path, n_files, n_sequences, s3_client, data_partition,
                 bucket='dissertation-data-dmiller') -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.data_partition = data_partition
        self.bucket = bucket

    def _read_path(self):
        """Directory that holds every archive and extracted array for this run."""
        return os.path.join(self.input_path, f'files_used_{self.n_files}')

    def _sequence_indices(self):
        """Sequence indices to load: 0, 10, ..., up to and including ``n_sequences``."""
        return range(0, self.n_sequences + 1, self.SEQUENCE_STEP)

    def get_dataset_pointer(self):
        """Make sure every array is available locally and return them memory-mapped.

        Returns:
            A list with one read-only memory-mapped array per sequence index, each
            truncated to ``data_partition`` rows when that value is truthy.
        """
        read_path = self._read_path()
        if not os.path.exists(read_path):
            print(f'Creating directory: {read_path}')
            # exist_ok guards against another process creating it in the meantime.
            os.makedirs(read_path, exist_ok=True)

        for index in self._sequence_indices():
            key_zip = os.path.join(read_path, f'sequence_index_{index}.npz')
            key_npy = os.path.join(read_path, f'sequence_index_{index}')

            print(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')

            # Test for the extracted file itself rather than just its directory, so
            # an interrupted extraction is repaired on the next run.
            if not os.path.exists(os.path.join(key_npy, self.ARRAY_MEMBER)):
                print(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
                self.s3_client.download_file(self.bucket, key_zip, key_zip)
                print(f'Zip file downloaded: {key_zip}')
                self._zip_extract(key_zip, key_npy)

        pointers = self._lazy_concatenate()

        if self.data_partition:
            return [pointer[:self.data_partition] for pointer in pointers]
        return pointers

    def _zip_extract(self, key_zip, key_npy):
        """Extract ``arr_0.npy`` from the archive ``key_zip`` into the directory ``key_npy``."""
        print(f'Extracting file: {key_zip} -> {key_npy}')

        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=key_npy, members=[self.ARRAY_MEMBER])

        print(f'Zip file extracted: {key_zip} -> {key_npy}/arr_0.npy')

    def _lazy_concatenate(self):
        """Open every extracted array with ``mmap_mode='r'`` and return them in index order."""
        read_path = self._read_path()
        lz_concat = []
        for index in self._sequence_indices():
            path_to_load = os.path.join(read_path, f'sequence_index_{index}', self.ARRAY_MEMBER)
            print(f'Loading: {path_to_load}')
            lz_concat.append(np.load(path_to_load, mmap_mode='r'))
        return lz_concat

In [6]:
# %load data_module.py
import pdb

import numpy as np
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# NOTE(review): none of these three constants is referenced by the code visible in this notebook.
# ModelBase._extract_features reads the label from column 0 and the total-event count from
# column 1, which does not match the indices below — confirm before relying on them.
LABEL_INDEX = 1
TOTAL_EVENTS_INDEX = 2
BATCHES = 1000000

class ClickstreamDataModule(LightningDataModule):
    """Splits the row-aligned arrays into train / validation / test and serves DataLoaders.

    The split is positional: the first 70% of rows are used for training, the next
    15% for validation and the final 15% for testing. Row counts are taken from the
    first array in ``pointer_list``.

    Args:
        pointer_list: Row-aligned, sliceable arrays (e.g. memory-mapped ``.npy`` files).
        batch_size: Batch size for every DataLoader.
        n_sequences: Passed through to ``ClickstreamDataset``.
        n_features: Passed through to ``ClickstreamDataset``.
        n_workers: Number of DataLoader worker processes.
    """

    def __init__(self, pointer_list, batch_size, n_sequences, n_features, n_workers=8) -> None:
        super().__init__()
        self.batch_size = batch_size
        self.n_workers = n_workers

        total_rows = pointer_list[0].shape[0]
        train_index = int(total_rows * 0.7)
        val_index = int(total_rows * 0.85)

        self.training_data = [p[:train_index] for p in pointer_list]
        self.validation_data = [p[train_index:val_index] for p in pointer_list]
        self.test_data = [p[val_index:] for p in pointer_list]
        self.n_sequences = n_sequences
        self.n_features = n_features

    def _build_loader(self, partitions):
        """Wrap one split in a ClickstreamDataset and return its DataLoader."""
        dataset = ClickstreamDataset(partitions, self.n_sequences, self.n_features)
        # NOTE(review): shuffle stays off for every split, training included — confirm
        # that the on-disk row order is the intended training order.
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.n_workers,
            pin_memory=True,
        )

    def train_dataloader(self):
        """DataLoader over the training split."""
        return self._build_loader(self.training_data)

    def val_dataloader(self):
        """DataLoader over the validation split."""
        return self._build_loader(self.validation_data)

    def test_dataloader(self):
        """DataLoader over the test split."""
        return self._build_loader(self.test_data)
    
class ClickstreamDataset(Dataset):
    """Map-style dataset over several row-aligned arrays.

    Item ``idx`` is row ``idx`` of every array in ``dataset_pointer_list``,
    concatenated end to end into a single 1-D NumPy array. The dataset length is
    the row count of the first array.

    Args:
        dataset_pointer_list: Row-aligned, indexable arrays.
        n_sequences: Stored on the instance; not used by the dataset itself.
        n_features: Stored on the instance; not used by the dataset itself.
    """

    def __init__(self, dataset_pointer_list, n_sequences, n_features) -> None:
        super().__init__()
        self.pointer_list = dataset_pointer_list
        self.n_sequences = n_sequences
        self.n_features = n_features
        self.total_events = self.pointer_list[0].shape[0]

    def __getitem__(self, idx):
        """Return row ``idx`` of every array, concatenated in list order."""
        # np.array copies each row out of its (possibly memory-mapped) source first.
        rows = [np.array(pointer[idx]) for pointer in self.pointer_list]
        return np.concatenate(rows)

    def __len__(self):
        """Number of rows in the first array."""
        return self.total_events


In [7]:
# %load model_base.py
import pdb


import torch
import torch.nn as nn
from pytorch_lightning import LightningModule
from torchmetrics import Accuracy
from torchmetrics import Precision
from torchmetrics import Recall
# When zero_heuristic is enabled, rows whose total-event value is <= this threshold get their logit replaced by 0.
ZERO_HEURISTIC_RATE = 10
# Column at which the embedded models' forward() splits the ordinal features from the categorical ids.
ORDINAL_FEATURE_INDEX = 17





class ModelBase(LightningModule):
    """Shared training / validation logic for the binary LSTM classifiers in this notebook.

    Subclasses must implement ``forward`` and set the attributes read here:
    ``model_name`` (must contain ``'ordinal'`` or ``'embedded'``), ``n_sequences``,
    ``zero_heuristic`` and ``learning_rate``.
    """

    def __init__(self) -> None:
        super().__init__()

        self.loss = nn.BCEWithLogitsLoss()

        self.runtime_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.train_accuracy = Accuracy(task='binary', threshold=0.5)
        self.valid_accuracy = Accuracy(task='binary', threshold=0.5)

        self.train_precision = Precision(task='binary', threshold=0.5)
        self.valid_precision = Precision(task='binary', threshold=0.5)

        self.train_recall = Recall(task='binary', threshold=0.5)
        self.valid_recall = Recall(task='binary', threshold=0.5)

        # Module.to() moves the module in place; rebinding `self` had no effect.
        self.to(self.runtime_device)

    def training_step(self, batch, batch_idx):
        """Run one training batch; returns the loss plus the batch metrics."""
        loss, acc, prec, rec = self._run_step(batch, 'train')
        self._log_step_loss(loss, 'train')
        return {"loss": loss, "acc": acc, "prec": prec, "rec": rec}

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; returns the loss plus the batch metrics."""
        loss, acc, prec, rec = self._run_step(batch, 'valid')
        self._log_step_loss(loss, 'valid')
        return {"loss": loss, "acc": acc, "prec": prec, "rec": rec}

    def _log_step_loss(self, loss, split):
        """Log the batch loss under ``loss/<split>`` and ``loss_<split>``.

        ``loss/<split>`` goes to the logger and progress bar per step only.
        ``loss_<split>`` is kept off the logger and also aggregated per epoch;
        ``loss_valid`` is the key ModelCheckpoint monitors in main().
        """
        self.log(
            f'loss/{split}',
            loss,
            logger=True,
            prog_bar=True,
            on_step=True,
            on_epoch=False,
        )
        self.log(
            f'loss_{split}',
            loss,
            logger=False,
            prog_bar=False,
            on_step=True,
            on_epoch=True,
            sync_dist=True
        )

    def _run_step(self, batch, type):
        """Forward one batch and compute loss, accuracy, precision and recall.

        Args:
            batch: 2-D tensor laid out as described in ``_extract_features``.
            type: ``'train'`` selects the training metrics; anything else the
                validation metrics.

        Returns:
            Tuple ``(loss, accuracy, precision, recall)`` for this batch.

        Raises:
            ValueError: If ``model_name`` contains neither ``'embedded'`` nor ``'ordinal'``.
        """
        y, total_events, categorical_features, ordinal_features = self._extract_features(batch)

        if 'embedded' in self.model_name:
            # Embedded models take ordinal features and ids in one tensor and split it
            # again inside forward().
            concatenated = torch.cat((ordinal_features, categorical_features), dim=2)
            expected_shape = (batch.shape[0], self.n_sequences, ORDINAL_FEATURE_INDEX + 2)
            assert concatenated.shape == expected_shape, 'concatenated shape is wrong'
            y_hat = self(concatenated)
        elif 'ordinal' in self.model_name:
            y_hat = self(ordinal_features)
        else:
            raise ValueError(
                f"model_name must contain 'ordinal' or 'embedded', got {self.model_name!r}"
            )

        if self.zero_heuristic:
            # Replace the logit by 0 for rows with very few recorded events.
            y_hat = torch.where(total_events <= ZERO_HEURISTIC_RATE, torch.zeros_like(y_hat), y_hat)

        loss = self.loss(y_hat, y)
        y = y.int()

        # The metrics threshold at 0.5, so give them probabilities explicitly
        # instead of relying on per-batch logit auto-detection.
        probabilities = torch.sigmoid(y_hat)

        if 'train' in type:
            acc = self.train_accuracy(probabilities, y)
            prec = self.train_precision(probabilities, y)
            rec = self.train_recall(probabilities, y)
        else:
            acc = self.valid_accuracy(probabilities, y)
            prec = self.valid_precision(probabilities, y)
            rec = self.valid_recall(probabilities, y)

        return loss, acc, prec, rec

    def _extract_features(self, tensor):
        """Split a flat batch into label, event count, id pairs and per-step features.

        Column layout read here, per row: 0 label, 1 total events, 2 user id,
        3 project id, 4 not read, 5..21 the 17 features of the current step, then
        ``n_sequences - 1`` groups of 18 columns (project id followed by 17 features).

        Returns:
            Tuple ``(label, total_events, user_project, features)`` with shapes
            ``(B, 1)``, ``(B, 1)``, ``(B, n_sequences, 2)`` and ``(B, n_sequences, 17)``.
            The step axis is flipped so the current step comes last.
        """
        n_step_features = 17
        first_feature = 5
        history_start = first_feature + n_step_features

        label, total_events = tensor[:, 0], tensor[:, 1]
        user_id, project_id = tensor[:, 2], tensor[:, 3]
        features = tensor[:, first_feature:history_start]
        shifters = tensor[:, history_start:]

        shifters = torch.reshape(
            shifters, (shifters.shape[0], self.n_sequences - 1, n_step_features + 1)
        )
        shifter_project_id, shifter_features = shifters[:, :, 0], shifters[:, :, 1:]

        project_id = torch.flip(
            torch.cat((project_id.unsqueeze(1), shifter_project_id), dim=1), dims=[1]
        ).long()
        features = torch.flip(
            torch.cat((features.unsqueeze(1), shifter_features), dim=1), dims=[1]
        )

        user_id = user_id.unsqueeze(1).repeat(1, self.n_sequences).long()
        # Project id 0 zeroes the user id for that step; 0 is also the embeddings'
        # padding_idx in the subclasses, so these look like padded steps — confirm.
        user_id = torch.where(project_id == 0, 0, user_id)

        user_id, project_id = user_id.unsqueeze(2), project_id.unsqueeze(2)

        user_project_concat = torch.cat((user_id, project_id), dim=2)

        assert user_project_concat.shape == (user_id.shape[0], self.n_sequences, 2), 'user_project_concat shape is wrong'
        return (
            label.unsqueeze(1),
            total_events.unsqueeze(1),
            user_project_concat,
            features
        )

    def _log_epoch_means(self, outputs, split):
        """Average the per-step outputs of an epoch and write them to the experiment logger."""
        tags = (('acc', 'acc'), ('prec', 'prec'), ('rec', 'rec'), ('loss', 'loss_e'))
        for key, tag in tags:
            mean_value = torch.mean(torch.stack([out[key] for out in outputs]))
            self.logger.experiment.add_scalar(f'{tag}/{split}', mean_value, self.current_epoch)

    def training_epoch_end(self, outputs):
        """Log the epoch means of accuracy, precision, recall and loss for training."""
        self._log_epoch_means(outputs, 'train')

    def validation_epoch_end(self, outputs):
        """Log the epoch means of accuracy, precision, recall and loss for validation."""
        self._log_epoch_means(outputs, 'valid')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with ``self.learning_rate``.

        Adam update rule, for reference:
            m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t
            v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2
            m_cap = m_t / (1 - beta_1^t)
            v_cap = v_t / (1 - beta_2^t)
            w_t = w_{t-1} - lr * m_cap / (sqrt(v_cap) + eps)
        """
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [8]:
# %load model_protos.py
# %load model_protos.py
import pdb

import torch
import torch.nn as nn
# Column at which forward() in the embedded models splits ordinal features (before) from user/project ids (after).
ORDINAL_FEATURE_INDEX = 17


class LSTMOrdinal(ModelBase):
    """Two-layer LSTM over the ordinal features with a single-logit linear head.

    Args:
        n_features: Total feature count; the LSTM input width is ``n_features - 3``.
        n_seqeuences: Number of time steps per sample (parameter name kept as-is,
            because it is recorded by ``save_hyperparameters``).
        hidden_size: LSTM hidden width.
        dropout: Dropout between the two LSTM layers.
        lr: Learning rate read by ``configure_optimizers``.
        batch_size: Stored on the instance.
        zero_heuristic: Enables the zero-logit heuristic in ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_seqeuences, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        # Plain attributes read by ModelBase.
        self.model_name = 'ordinal'
        self.learning_rate = lr
        self.batch_size = batch_size
        self.n_sequences = n_seqeuences
        self.zero_heuristic = zero_heuristic

        # Layers are created in the same order as before (recurrent core, then head).
        self.lstm = nn.LSTM(
            n_features - 3,
            hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )
        self.output = nn.Linear(hidden_size, 1)

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per sample, computed from the last LSTM time step."""
        sequence_out, _ = self.lstm(x)
        last_step = sequence_out[:, -1]
        return self.output(last_step)

class LSTMEmbedUserProject(ModelBase):
    """Two-branch model: one LSTM over ordinal features, one over user+project embeddings.

    The last time step of both LSTMs is concatenated and passed through a small
    feed-forward trunk that emits a single logit.

    Args:
        n_features: Total feature count; the ordinal LSTM input width is ``n_features - 3``.
        n_sequences: Number of time steps per sample.
        embedding_matrix: Mapping with keys ``'user_id'`` and ``'project_id'``, each a
            ``(max_id, embedding_dim)`` pair.
        hidden_size: Hidden width of the ordinal LSTM.
        dropout: Dropout between LSTM layers.
        lr: Learning rate read by ``configure_optimizers``.
        batch_size: Stored on the instance.
        zero_heuristic: Enables the zero-logit heuristic in ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_sequences, embedding_matrix, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        user_count, user_dim = embedding_matrix['user_id']
        project_count, project_dim = embedding_matrix['project_id']
        embedded_dim = user_dim + project_dim
        # Single source of truth for the embedding LSTM width. The trunk previously
        # assumed `embedded_dim // 3` while the LSTM produced `embedded_dim // 2`,
        # which made forward() fail whenever the two values differed.
        embedding_hidden = embedded_dim // 2

        self.lstm_ordinal = nn.LSTM(
            input_size=n_features - 3,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )

        # +1 row because id 0 is reserved as padding; int() tolerates counts stored as floats.
        self.user_embedding = nn.Embedding(
            num_embeddings=int(user_count) + 1,
            embedding_dim=user_dim,
            padding_idx=0
        )

        self.project_embedding = nn.Embedding(
            num_embeddings=int(project_count) + 1,
            embedding_dim=project_dim,
            padding_idx=0
        )

        self.lstm_embedding = nn.LSTM(
            input_size=embedded_dim,
            hidden_size=embedding_hidden,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )

        self.learning_rate = lr
        self.batch_size = batch_size
        self.model_name = 'embedded'

        # Trunk input = last step of the ordinal LSTM + last step of the embedding LSTM.
        trunk_linear = hidden_size + embedding_hidden

        self.out_trunk = nn.Sequential(
            nn.Linear(trunk_linear, 20),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(20, 1)
        )

        self.n_sequences = n_sequences
        self.zero_heuristic = zero_heuristic

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per sample.

        ``x`` carries the ordinal features in its first ``ORDINAL_FEATURE_INDEX``
        columns of the last axis, followed by the user id and the project id.
        """
        ordinal_features = x[:, :, :ORDINAL_FEATURE_INDEX]
        categorical_features = x[:, :, ORDINAL_FEATURE_INDEX:].int()

        user, project = categorical_features[:, :, 0], categorical_features[:, :, 1]

        # Original author's reminder: a sample should contain at most two distinct
        # user ids, the padding id and the real one. This is not enforced here.
        user, project = self.user_embedding(user), self.project_embedding(project)
        categorical_features = torch.cat((user, project), dim=2)

        ordinal_out, _ = self.lstm_ordinal(ordinal_features)
        categorical_out, _ = self.lstm_embedding(categorical_features)
        return self.out_trunk(torch.cat((ordinal_out[:, -1], categorical_out[:, -1]), dim=1))
    

class LSTMEmbedUser(ModelBase):
    """Two-branch model: one LSTM over ordinal features, one over the user embedding.

    The last time step of both LSTMs is concatenated and fed to a feed-forward
    trunk that emits a single logit.

    Args:
        n_features: Total feature count; the ordinal LSTM input width is ``n_features - 3``.
        n_sequences: Number of time steps per sample.
        embedding_matrix: Mapping whose ``'user_id'`` entry is ``(max_id, embedding_dim)``.
        hidden_size: Hidden width of the ordinal LSTM.
        dropout: Dropout between LSTM layers.
        lr: Learning rate read by ``configure_optimizers``.
        batch_size: Stored on the instance.
        zero_heuristic: Enables the zero-logit heuristic in ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_sequences, embedding_matrix, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        user_count, user_dim = embedding_matrix['user_id'][0], embedding_matrix['user_id'][1]
        # Width of the embedding LSTM: a third of the embedding size, but never below 3.
        user_hidden = max(3, user_dim // 3)

        self.lstm_ordinal = nn.LSTM(
            n_features - 3,
            hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )

        # One extra row because index 0 is the padding id.
        self.user_embedding = nn.Embedding(user_count + 1, user_dim, padding_idx=0)

        self.lstm_embedding = nn.LSTM(
            user_dim,
            user_hidden,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )

        self.out_trunk = nn.Sequential(
            nn.Linear(hidden_size + user_hidden, 20),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(20, 1),
        )

        # Plain attributes read by ModelBase.
        self.model_name = 'embedded_user'
        self.learning_rate = lr
        self.batch_size = batch_size
        self.n_sequences = n_sequences
        self.zero_heuristic = zero_heuristic

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per sample from the last step of both LSTM branches."""
        ordinal_part = x[:, :, :ORDINAL_FEATURE_INDEX]
        id_part = x[:, :, ORDINAL_FEATURE_INDEX:].int()

        embedded_user = self.user_embedding(id_part[:, :, 0])

        ordinal_seq = self.lstm_ordinal(ordinal_part)[0]
        user_seq = self.lstm_embedding(embedded_user)[0]

        joined = torch.cat((ordinal_seq[:, -1], user_seq[:, -1]), dim=1)
        return self.out_trunk(joined)


class LSTMEmbedOneLSTM(ModelBase):
    """Single LSTM over ordinal features concatenated with the user embedding.

    Args:
        n_features: Total feature count; ``n_features - 3`` ordinal columns enter the LSTM.
        n_sequences: Number of time steps per sample.
        embedding_matrix: Mapping whose ``'user_id'`` entry is ``(max_id, embedding_dim)``.
        hidden_size: LSTM hidden width.
        dropout: Dropout between the two LSTM layers.
        lr: Learning rate read by ``configure_optimizers``.
        batch_size: Stored on the instance.
        zero_heuristic: Enables the zero-logit heuristic in ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_sequences, embedding_matrix, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        user_count, user_dim = embedding_matrix['user_id'][0], embedding_matrix['user_id'][1]

        # The LSTM consumes the ordinal columns plus the user embedding per step.
        self.lstm_combined = nn.LSTM(
            (n_features - 3) + user_dim,
            hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )

        # One extra row because index 0 is the padding id.
        self.user_embedding = nn.Embedding(user_count + 1, user_dim, padding_idx=0)

        self.out = nn.Linear(hidden_size, 1)

        # Plain attributes read by ModelBase.
        self.model_name = 'embedded_one_lstm'
        self.learning_rate = lr
        self.batch_size = batch_size
        self.n_sequences = n_sequences
        self.zero_heuristic = zero_heuristic

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per sample from the last step of the combined LSTM."""
        ordinal_part = x[:, :, :ORDINAL_FEATURE_INDEX]
        id_part = x[:, :, ORDINAL_FEATURE_INDEX:].int()

        embedded_user = self.user_embedding(id_part[:, :, 0])
        combined = torch.cat((ordinal_part, embedded_user), dim=2)

        sequence_out = self.lstm_combined(combined)[0]
        return self.out(sequence_out[:, -1])


In [9]:
import torch
from torchmetrics import Accuracy, Precision, Recall, F1Score
import torch.nn as nn
import numpy as np
from tqdm import tqdm

def _extract_features(tensor, n_sequences):
    """Rebuild the per-step ordinal feature sequence from a flat batch.

    Columns 5..21 hold the 17 features of the current step. Everything from column
    22 onwards is read as ``n_sequences`` groups of 18 columns, and the first column
    of each group is dropped.

    Returns:
        Tensor of shape ``(batch, n_sequences + 1, 17)`` with the step axis flipped,
        so the current step is last.
    """
    n_rows = tensor.shape[0]
    current_step = tensor[:, 5:5 + 17].unsqueeze(1)

    history = tensor[:, 5 + 17:].reshape(n_rows, n_sequences, 18)
    history = history[:, :, 1:]

    stacked = torch.cat((current_step, history), dim=1)
    return torch.flip(stacked, dims=[1])
  
def auc_by_user_bin(model, dataset, n_sequences, dataset_type):
    """Score ``model`` separately for samples grouped by their total-event count.

    Each batch is split by the value in column 1 into the half-open intervals
    ``(lower, upper]`` between consecutive bin edges, plus one overflow group above
    the largest edge. Every non-empty group is scored with accuracy, precision,
    recall and F1 at a 0.5 threshold on the sigmoid of the model output.

    NOTE(review): despite the name, no AUC is computed. The model is called with
    ordinal features only, so this looks usable only with the ordinal model — confirm.

    Args:
        model: Module mapping a ``(rows, steps, 17)`` feature tensor to logits.
        dataset: Iterable of 2-D batch tensors (e.g. a DataLoader).
        n_sequences: Number of history steps encoded in each row.
        dataset_type: Label used in the progress message only.

    Returns:
        NumPy array with one row ``[bin_edge, acc, prec, rec, f1]`` per scored group
        per batch; the overflow group is labelled with ten times the largest edge.
    """
    bin_list = [0, 25, 50, 75, 100, 150, 200, 250, 300, 500, 750, 1000, 2000, 5000, 10000, 20000, 50000, 100000]
    overflow_edge = max(bin_list)
    bin_matrix = []

    metrics = (
        Accuracy(task='binary', threshold=0.5),
        Precision(task='binary', threshold=0.5),
        Recall(task='binary', threshold=0.5),
        F1Score(task='binary', threshold=0.5),
    )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        for metric in metrics:
            metric.to('cuda')
        model.to('cuda')
    model.eval()
    print(f'Running AUC by user bin for dataset_val: {dataset_type}')

    def score_rows(rows, bin_label):
        """Append one result row for ``rows``; empty selections are skipped."""
        if rows.shape[0] == 0:
            return
        labels = rows[:, 0].int().unsqueeze(1)
        features = _extract_features(rows, n_sequences)
        probabilities = torch.sigmoid(model(features))
        # Plain floats keep the result list free of device tensors.
        bin_matrix.append([bin_label] + [float(metric(probabilities, labels)) for metric in metrics])

    with torch.no_grad():
        # tqdm wraps the loader to show a progress bar.
        for batch in tqdm(dataset):
            # Move the batch to the GPU once per batch rather than once per bin.
            if use_cuda:
                batch = batch.to('cuda')
            total_events = batch[:, 1]

            for lower, upper in zip(bin_list, bin_list[1:]):
                in_bin = (total_events > lower) & (total_events <= upper)
                score_rows(batch[in_bin], upper)

            score_rows(batch[total_events > overflow_edge], overflow_edge * 10)

    return torch.tensor(bin_matrix).detach().cpu().numpy()

In [10]:
# %load run_lstm_model.py
# %load run_lstm_model.py
import argparse
import logging
import os
import sys
from datetime import datetime
from io import BytesIO

import boto3
import numpy as np
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
import json
import pdb
# S3 URI of the project bucket (checkpoints and TensorBoard logs are written below it).
S3_BUCKET = 's3://dissertation-data-dmiller'
SNS_TOPIC = 'arn:aws:sns:eu-west-1:774141665752:gradient-task'

# NOTE(review): these column indices are not referenced by the code visible in this notebook.
USER_INDEX = 9
PROJECT_INDEX = 10
COUNTRY_INDEX = 11

# Embedding table sizes, keyed by the number of source files (as a string).
# Each entry is (largest id, embedding dimension); the models add one row for the
# padding id 0. The computed dimensions use the fourth root (x ** 0.25) of the id
# count, while the '5' and '61' entries are set by hand.
# Counts must be ints: nn.Embedding rejects a float num_embeddings.
EMBEDDING_DIMS = {
    '5': {
        'user_id': (17891, 2),
        'project_id': (328, 1),
    },
    '30': {
        'user_id': (60459, int(60459**0.25)),
        'project_id': (617, int(617**0.25)),
    },
    '45': {
        'user_id': (85663, int(85663**0.25)),
        'project_id': (757, int(757**0.25)),
    },
    '61': {
        'user_id': (104744, 18),
        'project_id': (846, 6),
    }
}


def setup_logging():
    """Return this module's logger, configured for INFO output on a stream handler.

    The handler is attached only if the logger has none yet, so re-running the
    cell or calling this function again does not duplicate every log line.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(handler)

    logger.info('Setup logging')
    return logger


def _device_count():
    """Number of devices to hand to the Trainer.

    Always 1 inside an IPython kernel or when CUDA is unavailable; otherwise the
    number of visible CUDA devices.
    """
    running_in_kernel = 'ipykernel' in sys.modules
    if running_in_kernel or not torch.cuda.is_available():
        return 1
    return torch.cuda.device_count()

def _str2bool(value):
    """Parse a command-line boolean; ``type=bool`` would turn the string 'False' into True."""
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def parse_args(argv=None):
    """Parse the experiment's command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) reads
            ``sys.argv[1:]``. Passing a list makes the parser usable from a notebook.

    Returns:
        ``argparse.Namespace`` holding every option.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_type', type=str, default='ordinal')

    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--learning_rate', type=float, default=0.01)

    parser.add_argument('--n_workers', type=int, default=8)
    parser.add_argument('--n_epochs', type=int, default=100)

    parser.add_argument('--hidden_size', type=int, default=32)
    parser.add_argument('--dropout', type=float, default=0.2)
    parser.add_argument('--n_sequences', type=int, default=10)
    parser.add_argument('--n_features', type=int, default=20)

    parser.add_argument('--data_input_path', type=str, default='datasets/torch_ready_data_4')
    parser.add_argument('--data_partition', type=int, default=0)

    parser.add_argument('--n_files', type=str, default='5')

    parser.add_argument('--progress_bar', type=_str2bool, default=True)
    parser.add_argument('--checkpoint', type=str, default=None)

    parser.add_argument('--find_hparams', type=_str2bool, default=False)

    parser.add_argument('--zero_heuristic', type=_str2bool, default=False)
    parser.add_argument('--validate_only', type=_str2bool, default=False)
    return parser.parse_args(argv)


def get_model(
    model_type,
    n_features,
    n_sequences,
    hidden_size,
    dropout,
    learning_rate,
    batch_size,
    zero_heuristic,
    n_files
):
    """Instantiate the model selected by ``model_type``.

    ``'ordinal'`` builds ``LSTMOrdinal``, ``'embed_user_project'`` builds
    ``LSTMEmbedUserProject`` and ``'embed_user'`` builds ``LSTMEmbedUser``. Any
    other value falls back to ``LSTMEmbedOneLSTM``. The embedded models take their
    table sizes from ``EMBEDDING_DIMS[str(n_files)]``.

    ``zero_heuristic`` is forwarded to every model; it used to be dropped for the
    embedded variants, which always ran with the heuristic disabled.
    """
    if model_type == 'ordinal':
        print('Creating LSTMOrdinal model')
        return LSTMOrdinal(
            n_features,
            n_sequences,
            hidden_size,
            dropout,
            learning_rate,
            batch_size,
            zero_heuristic
        )

    if model_type == 'embed_user_project':
        model_cls = LSTMEmbedUserProject
    elif model_type == 'embed_user':
        model_cls = LSTMEmbedUser
    else:
        # Fallback kept for compatibility: the notebook's Arguments class uses a
        # model_type string that matches none of the named branches.
        model_cls = LSTMEmbedOneLSTM

    return model_cls(
        n_features,
        n_sequences,
        EMBEDDING_DIMS[str(n_files)],
        hidden_size,
        dropout,
        learning_rate,
        batch_size,
        zero_heuristic
    )



def main(args):
    """Build the data module, model, callbacks and logger, then train.

    Args:
        args: Object exposing the attributes defined by ``parse_args`` (an
            ``argparse.Namespace`` or a plain class such as ``Arguments``).

    Returns:
        None. With ``validate_only`` set, only the checkpoint URI is resolved; the
        evaluation code is currently commented out.

    Raises:
        ValueError: If ``validate_only`` is set without a ``checkpoint``.
        KeyError: If the AWS credential environment variables are missing.
    """
    np.set_printoptions(precision=8, suppress=True)
    np.set_printoptions(linewidth=400)

    torch.set_printoptions(precision=4, sci_mode=False)

    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    print(f'Running experiment at {date_time}')

    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    )
    npz_extractor = NPZExtractor(
        args.data_input_path,
        args.n_files,
        args.n_sequences,
        s3_client,
        args.data_partition)

    # The model sees the current step plus `n_sequences` history steps, hence the +1.
    clickstream_data_loader = ClickstreamDataModule(npz_extractor.get_dataset_pointer(), args.batch_size, args.n_sequences + 1, args.n_features)

    model = get_model(
        args.model_type,
        args.n_features,
        args.n_sequences + 1,
        args.hidden_size,
        args.dropout,
        args.learning_rate,
        args.batch_size,
        args.zero_heuristic,
        args.n_files
    )

    checkpoint_path = os.path.join(
        S3_BUCKET,
        'lstm_experiments',
        'checkpoints',
        str(args.n_files),
        args.model_type,
        f'sequence_length_{args.n_sequences}',
        str(args.data_partition),
        date_time)

    checkpoint = ModelCheckpoint(
        monitor='loss_valid',
        dirpath=checkpoint_path,
        filename='clickstream-{epoch:02d}-{loss_valid:.2f}',
        every_n_epochs=2,
        save_top_k=3
    )

    callbacks = [checkpoint]
    if args.progress_bar:
        progress_bar = TQDMProgressBar(refresh_rate=10)
        callbacks += [progress_bar]

    metric_logger = TensorBoardLogger(
        save_dir=f's3://dissertation-data-dmiller/lstm_experiments/results/{args.n_files}/{args.model_type}',
        name=f'sequence_length_{args.n_sequences}/{args.data_partition}/{date_time}',
        flush_secs=60,
        log_graph=True,
    )

    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    device_count = _device_count()
    # DDP is only requested outside of an IPython kernel.
    strategy = 'ddp' if 'ipykernel' not in sys.modules else None

    config = "\n".join([
        f'data input path: {args.data_input_path}',
        f'data partition: {args.data_partition}',
        f'batch_size: {args.batch_size}',
        f'n_epoch: {args.n_epochs}',
        'n_workers: 8',
        f'device: {model.runtime_device}',
        f'train_samples: {clickstream_data_loader.training_data[0].shape[0]}',
        f'val_samples: {clickstream_data_loader.validation_data[0].shape[0]}',
        f'hidden size: {args.hidden_size}',
        f'dropout: {args.dropout}',
        f'n_sequences: {args.n_sequences}',
        f'n_features: {args.n_features}',
        f'learning_rate: {args.learning_rate}',
        f'accelerator: {accelerator}',
        f'device_count: {device_count}',
        f'strategy: {strategy}',
        f'model_type: {args.model_type}',
        f'zero_heuristic: {args.zero_heuristic}',
    ])

    if not args.validate_only:
        print(f'Beginning training:\n {config}\nlog_path= tensorboard --logdir {metric_logger.save_dir}/{metric_logger.name}/version_0')

        trainer = Trainer(
            precision='bf16' if torch.cuda.is_available() else 16,
            check_val_every_n_epoch=1,
            accelerator=accelerator,
            strategy=strategy,
            devices=device_count,
            max_epochs=args.n_epochs,
            callbacks=callbacks,
            logger=metric_logger,
            enable_progress_bar=args.progress_bar,
            log_every_n_steps=10
        )

        if args.checkpoint:
            checkpoint_s3_path = os.path.join(S3_BUCKET, args.checkpoint)
            print(f'Downloading checkpoint from {checkpoint_s3_path}')
            trainer.fit(model, ckpt_path=checkpoint_s3_path, datamodule=clickstream_data_loader)
        else:
            trainer.fit(model, datamodule=clickstream_data_loader)
    else:
        if not args.checkpoint:
            # os.path.join would otherwise fail with an opaque TypeError on None.
            raise ValueError('validate_only requires a checkpoint path')
        checkpoint_s3_path = os.path.join(S3_BUCKET, args.checkpoint)
        # NOTE(review): evaluation is disabled; the code below also needs pandas
        # imported as `pd` before it can be re-enabled.
        # print(f'Downloading checkpoint from {checkpoint_s3_path}')
        # model.load_from_checkpoint(checkpoint_s3_path)
        # auc_user_val, auc_user_test = (
        #     auc_by_user_bin(model, clickstream_data_loader.val_dataloader(), args.n_sequences, "val"),
        #     auc_by_user_bin(model, clickstream_data_loader.test_dataloader(), args.n_sequences, "test")
        # )

        # df_val = pd.DataFrame(auc_user_val, columns=['user_bin', 'acc', 'prec', 'rec', 'f1'])
        # df_test = pd.DataFrame(auc_user_test, columns=['user_bin', 'acc', 'prec', 'rec', 'f1'])

        # df_val.to_csv(f'auc_user_val_seq_{args.n_sequences}_heuristic_{args.zero_heuristic}.csv')
        # df_test.to_csv(f'auc_user_test_seq_{args.n_sequences}_heuristic_{args.zero_heuristic}.csv')


In [11]:
class Arguments:
    """Notebook stand-in for the namespace returned by ``parse_args``; passed to ``main``."""

    # --- model selection -------------------------------------------------
    # No branch of get_model matches this string by name, so it resolves through
    # the final `else` to LSTMEmbedOneLSTM.
    model_type = 'embed_user_singular_lstm'
    hidden_size = 32
    dropout = 0.2
    zero_heuristic = False

    # --- data --------------------------------------------------------------
    data_input_path = 'torch_ready_data_4'
    data_partition = None
    n_files = 61
    n_sequences = 40
    n_features = 20

    # --- optimisation ----------------------------------------------------
    batch_size = 4096
    learning_rate = 0.001
    n_epochs = 100
    n_workers = 8

    # --- run control -----------------------------------------------------
    progress_bar = True
    checkpoint = None
    find_hparams = False
    validate_only = False


In [12]:
# main() has no return statement, so auc_user_val is bound to None; the call is made for its side effects.
auc_user_val = main(Arguments)

Running experiment at 2023_02_15_09_50
Loading pointer to dataset: torch_ready_data_4/files_used_61/sequence_index_0: derived from torch_ready_data_4/files_used_61/sequence_index_0.npz
Loading pointer to dataset: torch_ready_data_4/files_used_61/sequence_index_10: derived from torch_ready_data_4/files_used_61/sequence_index_10.npz
Loading pointer to dataset: torch_ready_data_4/files_used_61/sequence_index_20: derived from torch_ready_data_4/files_used_61/sequence_index_20.npz
Loading pointer to dataset: torch_ready_data_4/files_used_61/sequence_index_30: derived from torch_ready_data_4/files_used_61/sequence_index_30.npz
Loading pointer to dataset: torch_ready_data_4/files_used_61/sequence_index_40: derived from torch_ready_data_4/files_used_61/sequence_index_40.npz
Loading: torch_ready_data_4/files_used_61/sequence_index_0/arr_0.npy
Loading: torch_ready_data_4/files_used_61/sequence_index_10/arr_0.npy
Loading: torch_ready_data_4/files_used_61/sequence_index_20/arr_0.npy
Loading: torch

Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Beginning training:
 data input path: torch_ready_data_4
data partition: None
batch_size: 4096
n_epoch: 100
n_workers: 8
device: cuda
train_samples: 27565035
val_samples: 5906793
hidden size: 32
dropout: 0.2
n_sequences: 40
n_features: 20
learning_rate: 0.001
accelerator: gpu
device_count: 1
strategy: None
model_type: embed_user_singular_lstm
zero_heuristic: False
log_path= tensorboard --logdir s3://dissertation-data-dmiller/lstm_experiments/results/61/embed_user_singular_lstm/sequence_length_40/None/2023_02_15_09_50/version_0


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params
------------------------------------------------------
0 | loss            | BCEWithLogitsLoss | 0     
1 | train_accuracy  | BinaryAccuracy    | 0     
2 | valid_accuracy  | BinaryAccuracy    | 0     
3 | train_precision | BinaryPrecision   | 0     
4 | valid_precision | BinaryPrecision   | 0     
5 | train_recall    | BinaryRecall      | 0     
6 | valid_recall    | BinaryRecall      | 0     
7 | lstm_combined   | LSTM              | 17.3 K
8 | user_embedding  | Embedding         | 1.9 M 
9 | out             | Linear            | 33    
------------------------------------------------------
1.9 M     Trainable params
0         Non-trainable params
1.9 M     Total params
7.611     Total estimated model params size (MB)
  rank_zero_warn(


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]