In [1]:
!python --version


Python 3.9.13


In [2]:
# Install the CUDA 11.3 builds of torch/torchvision/torchaudio from the PyTorch wheel index.
!python -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 --quiet 
# Training and S3 tooling. NOTE(review): python-dotenv, fsspec and boto3 are not pinned here,
# so a fresh environment may resolve different versions than the run recorded below.
!python -m pip install pytorch-lightning==1.8.6 python-dotenv fsspec["s3"] boto3 s3fs==2022.11.0 --quiet

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 22.10.1+2.gca9a422da9 requires cupy-cuda115<12.0.0a0,>=9.5.0, which is not installed.
cudf 22.10.1+2.gca9a422da9 requires cupy-cuda115<12.0.0a0,>=9.5.0, which is not installed.
cudf 22.10.1+2.gca9a422da9 requires cuda-python<11.7.1,>=11.5, but you have cuda-python 11.7.1 which is incompatible.[0m[31m
[0m

In [3]:
# Load the dotenv IPython extension, then apply it to `env`.
# NOTE(review): presumably `env` is a file holding the AWS_* variables that main() reads
# from os.environ — confirm.
%load_ext dotenv
%dotenv env


In [4]:
# Root S3 URI for experiment artefacts; the same constant is assigned again in the run_lstm_model cell.
S3_BUCKET = 's3://dissertation-data-dmiller'

In [5]:
import logging
import os
import zipfile
import numpy as np

In [6]:
# Name of the binary target column.
# NOTE(review): none of the three lists in this cell is referenced by the code visible in
# this notebook; they appear to document the column layout of the stored arrays — confirm.
LABEL = [
    "continue_work_session_30_minutes"
]

# Per-row metadata column names.
# Laid out as LABEL followed by METADATA, "user_id" sits at index 1 and
# "cum_platform_time_raw" at index 4, which lines up with USER_ID_INDEX = 1 and
# PLATFORM_TIME_INDEX = 4 in the model cell.
# NOTE(review): LABEL + METADATA give 12 names, while METADATA_INDEX in the model cell is 13
# — confirm whether a leading column is missing from this list.
METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    
    "year",
    "month",
    "day",
    "hour",
    "minute",
    "second"
]

# Model input feature names: 22 entries, the same number as Arguments.n_features.
# NOTE(review): presumably this is the per-timestep feature order in the stored arrays — confirm.
OUT_FEATURE_COLUMNS = [
    "user_count",
    "project_count",
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time",
    
    "expanding_click_average",
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]


In [7]:
# %load npz_extractor.py

class NPZExtractor:
    """Make the sequence arrays available locally and expose them as memory maps.

    For every sequence index ``0, 10, ..., n_sequences`` the extractor expects an
    archive ``<input_path>/files_used_<n_files>/sequence_index_<k>.npz``. Missing
    archives are downloaded from S3, the ``arr_0.npy`` member is unpacked into a
    sibling directory ``sequence_index_<k>/``, and the array is opened with
    ``np.load(..., mmap_mode='r')``.

    Args:
        input_path: Local root directory. The same relative path is also used as
            the S3 object key when an archive has to be downloaded.
        n_files: Value interpolated into the ``files_used_<n_files>`` directory name.
        n_sequences: Highest sequence index to load (inclusive).
        s3_client: Object exposing ``download_file(bucket, key, filename)``; only
            used when an archive is not already on disk.
        data_partition: If truthy, every returned array is truncated to its first
            ``data_partition`` rows; otherwise the full arrays are returned.
    """
    logger = logging.getLogger(__name__)

    # Bucket the archives are downloaded from when they are not on disk.
    S3_BUCKET_NAME = 'dissertation-data-dmiller'
    # Spacing between consecutive ``sequence_index_<k>`` archives.
    SEQUENCE_STEP = 10
    # Archive member that holds the array.
    ARRAY_MEMBER = 'arr_0.npy'

    def __init__(self, input_path, n_files, n_sequences, s3_client, data_partition) -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.data_partition = data_partition

    def _read_path(self):
        """Return the directory that holds the archives for this ``n_files`` setting."""
        return os.path.join(self.input_path, f'files_used_{self.n_files}')

    def _sequence_indices(self):
        """Return the sequence indices to load: 0, 10, ..., up to ``n_sequences`` inclusive."""
        return range(0, self.n_sequences + 1, self.SEQUENCE_STEP)

    def get_dataset_pointer(self):
        """Download/extract whatever is missing and return one memory map per sequence index.

        Returns:
            A list of arrays ordered by sequence index, each truncated to
            ``data_partition`` rows when that attribute is truthy.
        """
        read_path = self._read_path()
        if not os.path.exists(read_path):
            self.logger.info(f'Creating directory: {read_path}')
        # exist_ok avoids failing if the directory appears between the check and the call.
        os.makedirs(read_path, exist_ok=True)

        for sequence_index in self._sequence_indices():
            key_npy = os.path.join(read_path, f'sequence_index_{sequence_index}')
            key_zip = f'{key_npy}.npz'

            self.logger.info(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')

            if not os.path.exists(key_zip):
                self.logger.info(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
                # The local relative path doubles as the S3 object key.
                self.s3_client.download_file(
                    self.S3_BUCKET_NAME,
                    key_zip,
                    key_zip
                )

            # Check for the extracted member itself rather than only its directory, so a
            # directory left behind by an interrupted extraction is filled in again.
            if not os.path.exists(os.path.join(key_npy, self.ARRAY_MEMBER)):
                self.logger.info(f'Zip file downloaded: {key_zip}: npy file to load: {key_npy}')
                self._zip_extract(key_zip, key_npy)

        lz_concatenated_results = self._lazy_concatenate()

        if self.data_partition:
            return [pointer[:self.data_partition] for pointer in lz_concatenated_results]
        return lz_concatenated_results

    def _zip_extract(self, key_zip, key_npy):
        """Unpack only the array member of ``key_zip`` into the directory ``key_npy``."""
        self.logger.info(f'Extracting file: {key_zip} -> {key_npy}')

        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=key_npy, members=[self.ARRAY_MEMBER])

        self.logger.info(f'Zip file exracted: {key_zip} -> {key_npy}/arr_0.npy')

    def _lazy_concatenate(self):
        """Open every extracted array read-only as a memory map and return them as a list."""
        lz_concat = []
        for sequence_index in self._sequence_indices():
            path_to_load = os.path.join(
                self._read_path(),
                f'sequence_index_{sequence_index}',
                self.ARRAY_MEMBER,
            )
            self.logger.info(f'Loading: {path_to_load}')
            lz_concat.append(np.load(path_to_load, mmap_mode='r'))
        return lz_concat

In [8]:
# %load data_module.py
import pdb

import numpy as np
import torch
from  pytorch_lightning  import LightningDataModule
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset

# NOTE(review): these three constants are not referenced by any code visible in this
# notebook (BATCHES is only mentioned in a string); confirm before relying on them.
LABEL_INDEX = 1
TOTAL_EVENTS_INDEX = 2
BATCHES = 1000000

class ClickstreamDataModule(LightningDataModule):
    """Split the array pointers into train/validation/test slices and serve loaders for them.

    The split is positional: the first 70% of rows are used for training, the
    next 15% for validation and the remaining rows for testing. The same row
    boundaries (taken from the first pointer) are applied to every pointer.

    Args:
        pointer_list: Sequence of row-aligned arrays; ``pointer_list[0].shape[0]``
            defines the total number of rows.
        batch_size: Batch size passed to every ``DataLoader``.
        n_sequences: Forwarded unchanged to ``ClickstreamDataset``.
        n_workers: Number of ``DataLoader`` worker processes (default 8, the value
            that was previously hard-coded).
    """

    def __init__(self, pointer_list, batch_size, n_sequences, n_workers=8) -> None:
        super().__init__()
        self.batch_size = batch_size
        self.n_sequences = n_sequences
        self.n_workers = n_workers

        total_rows = pointer_list[0].shape[0]
        train_index = int(total_rows * 0.7)
        val_index = int(total_rows * 0.85)

        self.training_data = [p[:train_index] for p in pointer_list]
        self.validation_data = [p[train_index:val_index] for p in pointer_list]
        self.test_data = [p[val_index:] for p in pointer_list]

    def _build_loader(self, pointers):
        """Wrap one split in a dataset and return its loader; rows are served in stored order."""
        dataset = ClickstreamDataset(pointers, self.n_sequences)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.n_workers,
            pin_memory=False,
        )

    def train_dataloader(self):
        """Return the loader over the training slice."""
        return self._build_loader(self.training_data)

    def val_dataloader(self):
        """Return the loader over the validation slice."""
        return self._build_loader(self.validation_data)

    def test_dataloader(self):
        """Return the loader over the test slice."""
        return self._build_loader(self.test_data)
    
class ClickstreamDataset(Dataset):
    """Map-style dataset over a list of row-aligned arrays.

    Item ``idx`` is row ``idx`` of every array in ``dataset_pointer_list``, copied
    into memory and joined end to end with ``np.concatenate``. The dataset length
    is the row count of the first array.
    """

    def __init__(self, dataset_pointer_list, n_sequences) -> None:
        super().__init__()
        self.n_sequences = n_sequences
        self.pointer_list = dataset_pointer_list
        # Row count of the first pointer defines the dataset length.
        self.total_events = dataset_pointer_list[0].shape[0]

    def __len__(self):
        return self.total_events

    def __getitem__(self, idx):
        # np.array(...) copies the selected row out of each (possibly memory-mapped) array.
        rows = tuple(np.array(pointer[idx]) for pointer in self.pointer_list)
        return np.concatenate(rows)


In [9]:
# %load model_base.py
import pdb

import torch
import torch.nn as nn
from pytorch_lightning import LightningModule
from torchmetrics import Accuracy, Precision, Recall
from torchmetrics import Precision
from torchmetrics import Recall
# Column at which ModelBase._extract_features splits a batch: [:13] is metadata, [13:] is features.
METADATA_INDEX = 13
# Metadata column compared against the 25.0 limit when zero_heuristic is enabled.
PLATFORM_TIME_INDEX = 4
# Metadata column read as the user id by the 'embedded' model branch.
USER_ID_INDEX = 1

class ModelBase(LightningModule):
    """Shared training/validation logic for the binary clickstream models.

    Subclasses must implement ``forward`` and set these attributes, which the
    methods below read: ``model_name`` (must start with ``'ordinal'`` or
    ``'embedded'``), ``n_sequences``, ``n_features``, ``zero_heuristic`` and
    ``learning_rate``.

    Each batch is a 2-D tensor whose first ``METADATA_INDEX`` columns are
    metadata (column 0 is used as the target) and whose remaining columns are
    reshaped to ``(batch, n_sequences, n_features)``.
    """

    def __init__(self) -> None:
        super().__init__()

        self.loss = nn.BCEWithLogitsLoss()

        self.train_accuracy = Accuracy(task='binary', threshold=0.5)
        self.valid_accuracy = Accuracy(task='binary', threshold=0.5)

        self.train_precision = Precision(task='binary', threshold=0.5)
        self.valid_precision = Precision(task='binary', threshold=0.5)

        self.train_recall = Recall(task='binary', threshold=0.5)
        self.valid_recall = Recall(task='binary', threshold=0.5)

    def _log_step_loss(self, loss, stage):
        """Log the step loss under ``loss/<stage>`` and ``loss_<stage>``.

        ``loss/<stage>`` goes to the experiment logger and progress bar per step.
        ``loss_<stage>`` is kept out of the logger but is also reduced per epoch.
        """
        self.log(
            f'loss/{stage}',
            loss,
            logger=True,
            prog_bar=True,
            on_step=True,
            on_epoch=False,
        )

        self.log(
            f'loss_{stage}',
            loss,
            logger=False,
            prog_bar=False,
            on_step=True,
            on_epoch=True,
            sync_dist=True
        )

    def training_step(self, batch, batch_idx):
        """Run one training batch; return the loss and batch metrics for the epoch-end hook."""
        loss, acc, prec, rec = self._run_step(batch, 'train')
        self._log_step_loss(loss, 'train')

        return {
            "loss": loss,
            "acc": acc,
            "prec": prec,
            "rec": rec,
        }

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; return the loss and batch metrics for the epoch-end hook."""
        loss, acc, prec, rec = self._run_step(batch, 'valid')
        self._log_step_loss(loss, 'valid')

        return {
            "loss": loss,
            "acc": acc,
            "prec": prec,
            "rec": rec,
        }

    def _run_step(self, batch, type):
        """Compute loss, accuracy, precision and recall for one batch.

        Args:
            batch: 2-D tensor of metadata columns followed by flattened features.
            type: Stage name; any value containing ``'train'`` selects the training
                metrics, anything else the validation metrics.

        Returns:
            Tuple ``(loss, accuracy, precision, recall)``.

        Raises:
            ValueError: If ``model_name`` starts with neither ``'ordinal'`` nor
                ``'embedded'``. Previously this surfaced as an unbound ``y_hat``.
        """
        metadata, features = self._extract_features(batch)
        y = metadata[:, 0].unsqueeze(1)

        if self.model_name.startswith('ordinal'):
            y_hat = self(features)
        elif self.model_name.startswith('embedded'):
            # Repeat the user id along the sequence axis and prepend it as feature column 0.
            user_id = metadata[:, USER_ID_INDEX]
            user_id = user_id.unsqueeze(1).repeat(1, self.n_sequences).unsqueeze(2)
            concatenated = torch.cat((user_id, features), dim=2)
            y_hat = self(concatenated)
        else:
            raise ValueError(
                f"Unsupported model_name {self.model_name!r}: expected a name starting "
                "with 'ordinal' or 'embedded'"
            )

        if self.zero_heuristic:
            # Rows whose platform-time column is not above the limit get a fixed output of 0.0.
            cum_platform_time = metadata[:, PLATFORM_TIME_INDEX].unsqueeze(1)
            limit, prior = torch.scalar_tensor(25.0).float(), torch.scalar_tensor(0.0).float()
            limit, prior = limit.to(self.device), prior.to(self.device)
            y_hat = torch.where(cum_platform_time > limit, y_hat, prior)

        loss = self.loss(y_hat, y)

        if 'train' in type:
            accuracy, precision, recall = self.train_accuracy, self.train_precision, self.train_recall
        else:
            accuracy, precision, recall = self.valid_accuracy, self.valid_precision, self.valid_recall

        acc = accuracy(y_hat, y)
        prec = precision(y_hat, y)
        rec = recall(y_hat, y)

        return loss, acc, prec, rec

    def _extract_features(self, tensor):
        """Split a batch into metadata and features shaped ``(batch, n_sequences, n_features)``.

        The sequence axis is reversed with ``torch.flip`` after reshaping.
        """
        metadata = tensor[:, :METADATA_INDEX]
        features = tensor[:, METADATA_INDEX:]

        features = torch.flip(
            torch.reshape(features, (features.shape[0], self.n_sequences, self.n_features)),
            dims=[1]
        )

        return metadata, features

    def _log_epoch_means(self, outputs, stage):
        """Average the per-step outputs and write them as ``<metric>/<stage>`` scalars."""
        if not outputs:
            # Nothing ran this epoch; torch.stack would fail on an empty list.
            return

        # (scalar tag prefix, key in the step output dict); order matches the original logging.
        tag_to_key = (('acc', 'acc'), ('prec', 'prec'), ('rec', 'rec'), ('loss_e', 'loss'))
        means = {
            tag: torch.mean(torch.stack([out[key] for out in outputs]))
            for tag, key in tag_to_key
        }

        for tag, value in means.items():
            self.logger.experiment.add_scalar(f'{tag}/{stage}', value, self.current_epoch)

    def training_epoch_end(self, outputs):
        """Write epoch-mean training accuracy, precision, recall and loss."""
        self._log_epoch_means(outputs, 'train')

    def validation_epoch_end(self, outputs):
        """Write epoch-mean validation accuracy, precision, recall and loss."""
        self._log_epoch_means(outputs, 'valid')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.learning_rate``.

        Adam update equations, for reference:

        m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t
        v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2
        m_cap = m_t / (1 - beta_1^t)
        v_cap = v_t / (1 - beta_2^t)
        w_t = w_{t-1} - lr * m_cap / (sqrt(v_cap) + eps)
        """
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [10]:
# %load model_protos.py
# %load model_protos.py
import torch
import torch.nn as nn
# NOTE(review): not referenced by any code visible in this notebook — confirm whether it is still needed.
ORDINAL_FEATURE_INDEX = 17



class LSTMOrdinal(ModelBase):
    """Two-layer LSTM over the feature sequence followed by a single-output linear head.

    Args:
        n_features: Width of each timestep (``input_size`` of the LSTM).
        n_seqeuences: Sequence length stored as ``self.n_sequences``. The spelling of
            the parameter name is kept so existing keyword callers keep working.
        hidden_size: LSTM hidden size.
        dropout: Dropout between the two LSTM layers.
        lr: Learning rate stored as ``self.learning_rate``.
        batch_size: Stored on the instance.
        zero_heuristic: Stored on the instance and read by ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_seqeuences, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        # Initialise the parent module before assigning any attributes, as nn.Module expects.
        super().__init__()

        self.n_features = n_features
        self.n_sequences = n_seqeuences
        self.zero_heuristic = zero_heuristic
        self.learning_rate = lr
        self.batch_size = batch_size
        self.model_name = 'ordinal'

        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )
        self.output = nn.Linear(
            hidden_size,
            1
        )

        self.save_hyperparameters()

    def forward(self, x):
        """Return the linear head applied to the LSTM output at the last timestep."""
        x, _ = self.lstm(x)
        x = x[:, -1]
        return self.output(x)


class LSTMEmbedUser(ModelBase):
    """LSTM over the features combined with a learned user embedding.

    The input to ``forward`` carries the user id in feature column 0 and the
    remaining features in columns 1 onwards. The last-timestep LSTM output and the
    user embedding are concatenated and passed through a small MLP with one output.

    Args:
        n_features: Width of each timestep fed to the LSTM (excluding the user id column).
        n_sequences: Sequence length.
        embedding_matrix: Mapping with ``'user_embed'`` (number of embeddings) and
            ``'embed_out'`` (embedding dimension).
        hidden_size: LSTM hidden size.
        dropout: Dropout between the two LSTM layers.
        lr: Learning rate stored as ``self.learning_rate``.
        batch_size: Stored on the instance.
        zero_heuristic: Stored on the instance and read by ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_sequences, embedding_matrix, hidden_size=32, dropout=0.2, lr=0.001, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        # Each attribute is assigned once (the original repeated four of them).
        self.n_features = n_features
        self.n_sequences = n_sequences
        self.embedding = embedding_matrix
        self.learning_rate = lr
        self.batch_size = batch_size
        self.zero_heuristic = zero_heuristic
        self.model_name = 'embedded_user'

        self.lstm_ordinal = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )

        self.user_embedding = nn.Embedding(
            num_embeddings=embedding_matrix['user_embed'],
            embedding_dim=embedding_matrix['embed_out'],
            padding_idx=0
        )

        trunk_linear = hidden_size + embedding_matrix['embed_out']

        self.out_trunk = nn.Sequential(
            nn.Linear(trunk_linear, 20),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(20, 1)
        )

        self.save_hyperparameters()

    def forward(self, x):
        """Return one output per row from the last LSTM state and the user embedding.

        Only the user id at the last timestep is embedded; the original embedded
        every timestep and then discarded all but the last, which gives the same value.
        """
        features = x[:, :, 1:]
        last_user_id = x[:, -1, 0].long()

        user_embed = self.user_embedding(last_user_id)
        ordinal_out, _ = self.lstm_ordinal(features)

        concatenated_out = torch.cat((ordinal_out[:, -1], user_embed), dim=1)
        return self.out_trunk(concatenated_out)



In [11]:
# %load run_lstm_model.py
#  %load run_lstm_model.py
import argparse
import logging
import os
import sys
from datetime import datetime
from io import BytesIO

import boto3
import numpy as np
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
import json
import pdb
# Root S3 URI; main() joins the checkpoint directory path onto it.
S3_BUCKET = 's3://dissertation-data-dmiller'
# NOTE(review): not referenced by any code visible in this notebook, and the ARN embeds an
# AWS account id — confirm it is needed here and safe to publish.
SNS_TOPIC = 'arn:aws:sns:eu-west-1:774141665752:gradient-task'







# Wide, fixed-point printing for numpy arrays and torch tensors.
np.set_printoptions(precision=8, suppress=True, linewidth=400)
torch.set_printoptions(precision=8, linewidth=400, sci_mode=False)
# Root logging configuration at INFO level with a timestamped format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# NOTE(review): the string below is a bare expression with no runtime effect. It says
# "cube root", while main() computes the embedding size as users ** 0.25 capped at 5 — confirm
# which is intended.
"""
Embedding dim based on cube root of number of unique values
"""


def setup_logging():
    """Return this module's logger at INFO level with a single formatted stream handler.

    The handler is attached only if the logger has no stream handler yet, so
    calling this function repeatedly (for example by re-running a notebook cell)
    no longer adds a new handler, and a duplicated log line, on every call.

    NOTE: this cell defines ``setup_logging`` a second time further down; at
    runtime that later definition replaces this one.

    Returns:
        The ``logging.Logger`` named after this module.
    """
    logger = logging.getLogger(__name__)

    already_attached = any(
        isinstance(existing, logging.StreamHandler) for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(handler)

    logger.setLevel(logging.INFO)

    logger.info('Setup logging')
    return logger


def _device_count():
    if 'ipykernel' in sys.modules: return 1

    if torch.cuda.is_available():
        return torch.cuda.device_count()

    return 1


def setup_logging():
    """Return the logger named after this module, without changing handlers or level."""
    return logging.getLogger(__name__)


def _device_count():
    if 'ipykernel' in sys.modules: return 1

    if torch.cuda.is_available():
        return torch.cuda.device_count()

    return 1

def _str_to_bool(value):
    """Convert a command-line string such as 'true'/'False'/'1'/'0' to a bool.

    ``argparse`` with ``type=bool`` calls ``bool(text)``, which is True for every
    non-empty string, including ``'False'``. This converter parses the text instead.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value

    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string stays False, as it was under bool('').
    if text in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def parse_args():
    """Parse the command-line options of the training script.

    Option names and defaults are unchanged. Boolean options now interpret their
    value, so ``--progress_bar False`` yields ``False`` rather than ``True``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_type', type=str, default='ordinal')

    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--learning_rate', type=float, default=0.001)

    parser.add_argument('--n_workers', type=int, default=8)
    parser.add_argument('--n_epochs', type=int, default=1)

    parser.add_argument('--hidden_size', type=int, default=32)
    parser.add_argument('--dropout', type=float, default=0.2)
    parser.add_argument('--n_sequences', type=int, default=10)
    parser.add_argument('--n_features', type=int, default=20)

    parser.add_argument('--data_input_path', type=str, default='datasets/torch_ready_data')
    parser.add_argument('--data_partition', type=int, default=1000)

    parser.add_argument('--n_files', type=str, default='2')

    parser.add_argument('--progress_bar', type=_str_to_bool, default=True)
    parser.add_argument('--checkpoint', type=str, default=None)

    parser.add_argument('--find_hparams', type=_str_to_bool, default=False)

    parser.add_argument('--zero_heuristic', type=_str_to_bool, default=False)
    parser.add_argument('--validate_only', type=_str_to_bool, default=False)
    args = parser.parse_args()
    return args


def get_model(
    logger,
    model_type,
    n_features,
    n_sequences,
    hidden_size,
    dropout,
    learning_rate,
    batch_size,
    zero_heuristic,
    embedding_params=None
):
    """Build the model selected by ``model_type``.

    A ``model_type`` starting with ``'ordinal'`` or ``'heuristic'`` yields an
    ``LSTMOrdinal`` (and logs that choice); every other value yields an
    ``LSTMEmbedUser`` built with ``embedding_params``.
    """
    # Trailing positional arguments shared by both constructors.
    shared_tail = (hidden_size, dropout, learning_rate, batch_size, zero_heuristic)

    if not model_type.startswith(('ordinal', 'heuristic')):
        return LSTMEmbedUser(n_features, n_sequences, embedding_params, *shared_tail)

    logger.info('Creating LSTMOrdinal model')
    return LSTMOrdinal(n_features, n_sequences, *shared_tail)


def main(args):
    """Load the data, build the model and trainer, and fit.

    Args:
        args: Object exposing the run settings as attributes (an
            ``argparse.Namespace`` or an equivalent class). Attributes read here:
            ``data_input_path``, ``n_files``, ``n_sequences``, ``data_partition``,
            ``model_type``, ``batch_size``, ``n_features``, ``hidden_size``,
            ``dropout``, ``learning_rate``, ``zero_heuristic``, ``progress_bar``,
            ``n_epochs`` and ``checkpoint``.

    Raises:
        KeyError: If ``AWS_ACCESS_KEY_ID`` or ``AWS_SECRET_ACCESS_KEY`` is not set
            in the environment.
    """
    logger = setup_logging()
    # One timestamp identifies this run in both the checkpoint and TensorBoard paths.
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    logger.info(f'Running experiment at {date_time}')

    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    )

    npz_extractor = NPZExtractor(
        args.data_input_path,
        args.n_files,
        args.n_sequences,
        s3_client,
        args.data_partition)

    dataset = npz_extractor.get_dataset_pointer()

    if args.model_type.startswith('embedded'):
        # Column 1 of the first array is read as the user id; ids are assumed to be 0-based.
        users = dataset[0][:, 1].max() + 1
        # NOTE(review): a module-level note mentions a cube root, but the exponent here
        # is 0.25 (capped at 5) — confirm which is intended.
        embed_params = { 'user_embed': int(users), 'embed_out': int(min(users ** 0.25, 5)) }
    else:
        embed_params = None
    logger.info(f'Running model with embedding params {embed_params}')

    # Indices 0..n_sequences inclusive give n_sequences + 1 timesteps.
    clickstream_data_loader = ClickstreamDataModule(dataset, args.batch_size, args.n_sequences + 1)

    model = get_model(
        logger,
        args.model_type,
        args.n_features,
        args.n_sequences + 1,
        args.hidden_size,
        args.dropout,
        args.learning_rate,
        args.batch_size,
        args.zero_heuristic,
        embed_params
    )

    data_version = "1"
    checkpoint_path = os.path.join(
        S3_BUCKET,
        'lstm_experiments',
        'checkpoints',
        f'data_v{data_version}',
        f'n_files_{str(args.n_files)}',
        args.model_type,
        f'sequence_length_{args.n_sequences}',
        f'data_partition_{str(args.data_partition)}',
        date_time)

    checkpoint = ModelCheckpoint(
        monitor='loss_valid',
        dirpath=checkpoint_path,
        filename='clickstream-{epoch:02d}-{loss_valid:.2f}',
        every_n_epochs=2,
        save_top_k=3
    )

    callbacks = [checkpoint]
    if args.progress_bar:
        progress_bar = TQDMProgressBar(refresh_rate=10)
        callbacks += [progress_bar]

    metric_logger = TensorBoardLogger(
        save_dir=f'{S3_BUCKET}/lstm_experiments/results/data_v{data_version}/n_files_{args.n_files}/{args.model_type}',
        name=f'sequence_length_{args.n_sequences}/data_partition_{args.data_partition}/{date_time}',
        flush_secs=60,
        log_graph=True,
    )

    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    device_count = _device_count()
    # DDP only outside a notebook kernel; None leaves the trainer's default in place.
    strategy = 'ddp' if 'ipykernel' not in sys.modules else None

    config = "\n".join([
        f'data input path: {args.data_input_path}',
        f'data partition: {args.data_partition}',
        f'batch_size: {args.batch_size}',
        f'n_epoch: {args.n_epochs}',
        f'n_workers: 8',
        f'train_samples: {clickstream_data_loader.training_data[0].shape[0]}',
        f'val_samples: {clickstream_data_loader.validation_data[0].shape[0]}',
        f'hidden size: {args.hidden_size}',
        f'dropout: {args.dropout}',
        f'n_sequences: {args.n_sequences}',
        f'n_features: {args.n_features}',
        f'learning_rate: {args.learning_rate}',
        f'accelerator: {accelerator}',
        f'device_count: {device_count}',
        f'strategy: {strategy}',
        f'model_type: {args.model_type}',
        f'zero_heuristic: {args.zero_heuristic}',
    ])

    logger.info(f'Beginning validation:\n {config}')
    logger.info(f'log_path=\n tensorboard --logdir {metric_logger.save_dir}/{metric_logger.name}/version_0')
    logger.info(f'checkpoint_path=\n {checkpoint_path}')
    trainer = Trainer(
        precision="bf16",
        check_val_every_n_epoch=1,
        accelerator=accelerator,
        devices=device_count,
        # Pass the strategy that is logged above; it was previously computed but never used.
        strategy=strategy,
        max_epochs=args.n_epochs,
        callbacks=callbacks,
        logger=metric_logger,
        enable_progress_bar=args.progress_bar,
        log_every_n_steps=500
        )

    if args.checkpoint:
        checkpoint_s3_path = os.path.join(S3_BUCKET, args.checkpoint)
        logger.info(f'Running model from checkpoint: {checkpoint_s3_path}')
        trainer.fit(model, ckpt_path=checkpoint_s3_path, datamodule=clickstream_data_loader)
    else:
        logger.info('Running model from scratch')
        trainer.fit(model, datamodule=clickstream_data_loader)



In [12]:
class Arguments:
    """Notebook stand-in for the namespace returned by ``parse_args``.

    Attributes are read directly from the class, so it is passed to ``main``
    without being instantiated.
    """

    model_type = 'ordinal'

    batch_size = 2048
    learning_rate = 0.001
    n_workers = 8
    n_epochs = 100
    hidden_size = 32
    dropout = .2
    n_sequences = 10
    n_features = 22

    data_input_path = 'torch_ready_data'
    # None means the arrays are not truncated by NPZExtractor.
    data_partition = None
    n_files = 30
    progress_bar = True
    checkpoint = None
    find_hparams = False
    zero_heuristic = False
    # Added so this class exposes every option that parse_args defines.
    validate_only = False



In [13]:
# Start the run; the Arguments class itself is passed in place of an argparse namespace.
main(Arguments)

2023-06-18 16:33:22,629 - __main__ - INFO - Running experiment at 2023_06_18_16_33
2023-06-18 16:33:22,681 - __main__ - INFO - Loading pointer to dataset: torch_ready_data/files_used_30/sequence_index_0: derived from torch_ready_data/files_used_30/sequence_index_0.npz
2023-06-18 16:33:22,682 - __main__ - INFO - Loading pointer to dataset: torch_ready_data/files_used_30/sequence_index_10: derived from torch_ready_data/files_used_30/sequence_index_10.npz
2023-06-18 16:33:22,684 - __main__ - INFO - Loading: torch_ready_data/files_used_30/sequence_index_0/arr_0.npy
2023-06-18 16:33:22,716 - __main__ - INFO - Loading: torch_ready_data/files_used_30/sequence_index_10/arr_0.npy
2023-06-18 16:33:22,748 - __main__ - INFO - Running model with embedding params None
2023-06-18 16:33:22,749 - __main__ - INFO - Creating LSTMOrdinal model
2023-06-18 16:33:22,754 - torch.distributed.nn.jit.instantiator - INFO - Created a temporary directory at /tmp/tmpn0ar5zu6
2023-06-18 16:33:22,755 - torch.distribut

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

2023-06-18 16:34:28,676 - botocore.credentials - INFO - Found credentials in environment variables.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]