In [1]:
!python -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 --quiet
!python -m pip install pytorch-lightning==1.8.6 fsspec["s3"] python-dotenv boto3 s3fs==2022.11.0 tensorboard tensorflow --quiet

[0m

In [2]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

In [3]:
%load_ext dotenv
%dotenv env


In [4]:
# %load npz_extractor.py
import logging
import os
import zipfile

import boto3
import numpy as np
import torch

class NPZExtractor:
    """Locate (and if needed download/extract) the ``arr_0.npy`` shards of a dataset.

    Shards live under ``<input_path>/files_used_<n_files>/sequence_index_<i>/arr_0.npy``
    for ``i`` in ``0, 10, 20, ... <= n_sequences``. Each shard is opened with
    ``np.load(..., mmap_mode='r')`` so nothing is read into memory up front.

    Args:
        input_path: Root directory of the prepared data (also used as the S3 key prefix).
        n_files: Value interpolated into the ``files_used_<n_files>`` directory name.
        n_sequences: Highest sequence index to load (inclusive).
        s3_client: Object exposing ``download_file(bucket, key, filename)``; may be
            ``None`` when every shard is already available locally.
        data_partition: If truthy, each shard is sliced to its first ``data_partition`` rows.
    """

    # Bucket the ``.npz`` archives are fetched from when a shard is missing locally.
    S3_BUCKET_NAME = 'dissertation-data-dmiller'
    # Distance between consecutive shard indices.
    SEQUENCE_STEP = 10

    def __init__(self, input_path, n_files, n_sequences, s3_client, data_partition) -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.data_partition = data_partition

    def _read_path(self):
        """Directory that holds every shard for this ``n_files`` setting."""
        return os.path.join(self.input_path, f'files_used_{self.n_files}')

    def _sequence_indices(self):
        """Shard indices to load; shared by the download and the load loops."""
        return range(0, self.n_sequences + 1, self.SEQUENCE_STEP)

    def get_dataset_pointer(self):
        """Return one memory-mapped array per shard, downloading/extracting as needed.

        Returns:
            list of arrays in shard-index order, each truncated to ``data_partition``
            rows when that attribute is truthy.

        Raises:
            FileNotFoundError: a shard is missing locally, no local ``.npz`` archive
                exists for it, and no ``s3_client`` was supplied to download one.
        """
        read_path = self._read_path()
        if not os.path.exists(read_path):
            print(f'Creating directory: {read_path}')
            os.makedirs(read_path, exist_ok=True)

        for sequence_index in self._sequence_indices():
            key_zip = os.path.join(read_path, f'sequence_index_{sequence_index}.npz')
            key_npy = os.path.join(read_path, f'sequence_index_{sequence_index}')

            print(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')

            if os.path.exists(key_npy):
                continue

            print(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
            if self.s3_client is not None:
                self.s3_client.download_file(self.S3_BUCKET_NAME, key_zip, key_zip)
                print(f'Zip file downloaded: {key_zip}')
            elif not os.path.exists(key_zip):
                # Previously this surfaced as AttributeError on ``None.download_file``.
                raise FileNotFoundError(
                    f'{key_npy} is missing, {key_zip} is not available locally '
                    'and no s3_client was provided to download it'
                )
            self._zip_extract(key_zip, key_npy)

        lz_concatenated_results = self._lazy_concatenate()

        if self.data_partition:
            return [shard[:self.data_partition] for shard in lz_concatenated_results]
        return lz_concatenated_results

    def _zip_extract(self, key_zip, key_npy):
        """Extract only ``arr_0.npy`` from ``key_zip`` into the directory ``key_npy``."""
        print(f'Extracting file: {key_zip} -> {key_npy}')

        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=key_npy, members=['arr_0.npy'])

        print(f'Zip file exracted: {key_zip} -> {key_npy}/arr_0.npy')

    def _lazy_concatenate(self):
        """Open every shard read-only as a memory map and return them as a list."""
        lz_concat = []
        for sequence_index in self._sequence_indices():
            path_to_load = os.path.join(self._read_path(), f'sequence_index_{sequence_index}', 'arr_0.npy')
            print(f'Loading: {path_to_load}')
            lz_concat.append(np.load(path_to_load, mmap_mode='r'))
        return lz_concat

In [5]:
# %load data_module.py

import numpy as np
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Module-level constants for the data module.
# NOTE(review): none of these three names is referenced by the code visible in this
# notebook (BATCHES only appears inside a string), and ModelBase._extract_features
# reads the label from column 0 and total events from column 1 — confirm whether
# these indices are stale before relying on them.
LABEL_INDEX = 1
TOTAL_EVENTS_INDEX = 2
BATCHES = 1000000

class ClickstreamDataModule(LightningDataModule):
    """Split aligned array shards into train/validation/test and expose DataLoaders.

    The split is positional: the first 70% of rows are training data, the next 15%
    validation, and the remainder test. Boundaries are computed from the row count
    of the first shard and applied to every shard.

    Args:
        pointer_list: Sequence of arrays supporting ``.shape`` and row slicing.
        batch_size: Batch size used by all three loaders.
        n_sequences: Forwarded unchanged to ``ClickstreamDataset``.
        n_features: Forwarded unchanged to ``ClickstreamDataset``.
        num_workers: Worker processes per DataLoader (defaults to the previously
            hard-coded value of 8).
    """

    def __init__(self, pointer_list, batch_size, n_sequences, n_features, num_workers=8) -> None:
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers

        total_rows = pointer_list[0].shape[0]
        train_index = int(total_rows * 0.7)
        val_index = int(total_rows * 0.85)

        self.training_data = [p[:train_index] for p in pointer_list]
        self.validation_data = [p[train_index:val_index] for p in pointer_list]
        self.test_data = [p[val_index:] for p in pointer_list]
        self.n_sequences = n_sequences
        self.n_features = n_features

    def _make_loader(self, split):
        """Build the (unshuffled) DataLoader shared by all three splits."""
        dataset = ClickstreamDataset(split, self.n_sequences, self.n_features)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=False,
        )

    def train_dataloader(self):
        """DataLoader over the first 70% of rows."""
        return self._make_loader(self.training_data)

    def val_dataloader(self):
        """DataLoader over the rows between the 70% and 85% marks."""
        return self._make_loader(self.validation_data)

    def test_dataloader(self):
        """DataLoader over the final 15% of rows."""
        return self._make_loader(self.test_data)
    
class ClickstreamDataset(Dataset):
    """Row-wise view over several aligned array shards.

    Item ``idx`` is row ``idx`` of every shard, copied into memory and joined
    end-to-end into a single array. The dataset length is the row count of the
    first shard.
    """

    def __init__(self, dataset_pointer_list, n_sequences, n_features) -> None:
        super().__init__()
        self.pointer_list = dataset_pointer_list
        self.n_sequences = n_sequences
        self.n_features = n_features
        self.total_events = self.pointer_list[0].shape[0]

    def __getitem__(self, idx):
        row_parts = []
        for shard in self.pointer_list:
            # np.array(...) materialises the row (shards may be memory maps).
            row_parts.append(np.array(shard[idx]))
        return np.concatenate(row_parts)

    def __len__(self):
        return self.total_events


In [6]:
# %load model_base.py
import pdb


import torch
import torch.nn as nn
from pytorch_lightning import LightningModule
from torchmetrics import Accuracy
from torchmetrics import Precision
from torchmetrics import Recall
ZERO_HEURISTIC_RATE = 10
ORDINAL_FEATURE_INDEX = 17

import pdb

import torch
import torch.nn as nn
from pytorch_lightning import LightningModule
from torchmetrics import Accuracy
from torchmetrics import Precision
from torchmetrics import Recall
ZERO_HEURISTIC_RATE = 10
ORDINAL_FEATURE_INDEX = 17



class ModelBase(LightningModule):
    """Shared train/validation logic for the clickstream LSTM classifiers.

    This base class reads ``self.model_name``, ``self.n_sequences``,
    ``self.zero_heuristic`` and ``self.learning_rate`` but does not define them;
    subclasses must set them and implement ``forward``.
    """

    def __init__(self) -> None:
        super().__init__()

        self.loss = nn.BCEWithLogitsLoss()

        self.runtime_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.train_accuracy = Accuracy(task='binary', threshold=0.5)
        self.valid_accuracy = Accuracy(task='binary', threshold=0.5)

        self.train_precision = Precision(task='binary', threshold=0.5)
        self.valid_precision = Precision(task='binary', threshold=0.5)

        self.train_recall = Recall(task='binary', threshold=0.5)
        self.valid_recall = Recall(task='binary', threshold=0.5)

        # ``Module.to`` moves the module in place; the former ``self = self.to(...)``
        # only rebound a local name.
        self.to(self.runtime_device)

    def training_step(self, batch, batch_idx):
        """Run one training batch; returns loss plus batch accuracy/precision/recall."""
        return self._step_and_log(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; returns loss plus batch accuracy/precision/recall."""
        return self._step_and_log(batch, 'valid')

    def _step_and_log(self, batch, stage):
        """Compute metrics for ``batch`` and log the loss under both naming schemes.

        ``loss/<stage>`` goes to the logger and progress bar per step;
        ``loss_<stage>`` is tracked per step and per epoch but not sent to the logger.
        """
        loss, acc, prec, rec = self._run_step(batch, stage)

        self.log(
            f'loss/{stage}',
            loss,
            logger=True,
            prog_bar=True,
            on_step=True,
            on_epoch=False,
        )

        self.log(
            f'loss_{stage}',
            loss,
            logger=False,
            prog_bar=False,
            on_step=True,
            on_epoch=True,
            sync_dist=True
        )

        return {
            "loss": loss,
            "acc": acc,
            "prec": prec,
            "rec": rec
        }

    def _run_step(self, batch, type):
        """Forward ``batch`` through the model and compute loss and metrics.

        Args:
            batch: 2-D tensor laid out as described in ``_extract_features``.
            type: Stage name; any value containing ``'train'`` uses the training
                metrics, everything else the validation metrics.

        Returns:
            Tuple ``(loss, accuracy, precision, recall)``.

        Raises:
            ValueError: ``self.model_name`` contains neither ``'embedded'`` nor
                ``'ordinal'`` (previously an UnboundLocalError on ``y_hat``).
        """
        y, total_events, categorical_features, ordinal_features = self._extract_features(batch)

        # 'embedded' is checked first so that it wins if both substrings are present,
        # matching the result of the former pair of independent ``if`` statements.
        if 'embedded' in self.model_name:
            concatenated = torch.cat((ordinal_features, categorical_features), dim=2)
            # 19 = 17 feature columns + user id + project id.
            assert concatenated.shape == (batch.shape[0], self.n_sequences, 19), 'concatenated shape is wrong'
            y_hat = self(concatenated)
        elif 'ordinal' in self.model_name:
            y_hat = self(ordinal_features)
        else:
            raise ValueError(
                f"model_name {self.model_name!r} must contain 'ordinal' or 'embedded'"
            )

        if self.zero_heuristic:
            # Rows with few total events get their logit replaced by zero.
            y_hat = torch.where(total_events <= ZERO_HEURISTIC_RATE, torch.zeros_like(y_hat), y_hat)

        loss = self.loss(y_hat, y)
        y = y.int()

        if 'train' in type:
            acc = self.train_accuracy(y_hat, y)
            prec = self.train_precision(y_hat, y)
            rec = self.train_recall(y_hat, y)
        else:
            acc = self.valid_accuracy(y_hat, y)
            prec = self.valid_precision(y_hat, y)
            rec = self.valid_recall(y_hat, y)

        return loss, acc, prec, rec

    def _extract_features(self, tensor):
        """Split a flat batch into label, event count, id sequences and feature sequences.

        Column layout read from ``tensor`` (column 4 is not used):
            0: label, 1: total events, 2: user id, 3: project id,
            5..21: 17 feature values for the current step,
            22..: ``n_sequences - 1`` groups of 18 values, each a project id
            followed by 17 feature values.

        The current step is placed in front of the remaining steps and the sequence
        axis is then reversed. User ids are zeroed wherever the project id is 0.

        Returns:
            ``(label[B, 1], total_events[B, 1], user_project[B, n_sequences, 2],
            features[B, n_sequences, 17])``.
        """
        label, total_events, user_id, project_id, features, shifters = (
            tensor[:, 0], tensor[:, 1], tensor[:, 2],
            tensor[:, 3], tensor[:, 5:5+17], tensor[:, 5+17:]
        )

        shifters = torch.reshape(shifters, (shifters.shape[0], self.n_sequences-1, 18))
        shifter_project_id, shifter_features = shifters[:, :, 0], shifters[:, :, 1:]

        project_id = torch.flip(torch.cat((project_id.unsqueeze(1), shifter_project_id), dim=1), dims=[1]).long()
        features = torch.flip(torch.cat((features.unsqueeze(1), shifter_features), dim=1), dims=[1])

        user_id = user_id.unsqueeze(1).repeat(1, self.n_sequences).long()
        # A tensor of zeros (rather than a Python scalar) keeps this call valid
        # across torch versions while producing the same result.
        user_id = torch.where(project_id == 0, torch.zeros_like(user_id), user_id)

        user_id, project_id = user_id.unsqueeze(2), project_id.unsqueeze(2)

        user_project_concat = torch.cat((user_id, project_id), dim=2)

        assert user_project_concat.shape == (user_id.shape[0], self.n_sequences, 2), 'user_project_concat shape is wrong'
        return (
            label.unsqueeze(1),
            total_events.unsqueeze(1),
            user_project_concat,
            features
        )

    def _log_epoch_means(self, outputs, stage):
        """Average the per-step outputs and write them to the experiment logger."""
        tags = (('acc', 'acc'), ('prec', 'prec'), ('rec', 'rec'), ('loss_e', 'loss'))
        means = {
            tag: torch.mean(torch.stack([out[key] for out in outputs]))
            for tag, key in tags
        }
        for tag, value in means.items():
            self.logger.experiment.add_scalar(f'{tag}/{stage}', value, self.current_epoch)

    def training_epoch_end(self, outputs):
        """Log epoch-mean accuracy, precision, recall and loss for training."""
        self._log_epoch_means(outputs, 'train')

    def validation_epoch_end(self, outputs):
        """Log epoch-mean accuracy, precision, recall and loss for validation."""
        self._log_epoch_means(outputs, 'valid')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with ``self.learning_rate``.

        Adam update rule, for reference:
            m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t
            v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2
            m_cap = m_t / (1 - beta_1^t)
            v_cap = v_t / (1 - beta_2^t)
            w_t = w_{t-1} - lr * m_cap / (sqrt(v_cap) + eps)
        """
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [7]:
# %load model_protos.py
# %load model_protos.py
import pdb

import torch
import torch.nn as nn
# Split point on the last axis of a model input: the forward() methods in this cell
# treat columns [:ORDINAL_FEATURE_INDEX] as ordinal features and the remaining
# columns as categorical ids.
ORDINAL_FEATURE_INDEX = 17


class LSTMOrdinal(ModelBase):
    """Two-layer LSTM over the ordinal features followed by a single linear output.

    Args:
        n_features: Total feature count; the LSTM input width is ``n_features - 3``.
        n_seqeuences: Sequence length stored as ``self.n_sequences`` (the parameter
            name keeps its original spelling for compatibility).
        hidden_size: LSTM hidden width.
        dropout: Dropout applied between LSTM layers.
        lr: Learning rate stored as ``self.learning_rate``.
        batch_size: Stored on the instance.
        zero_heuristic: Stored on the instance; read by ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_seqeuences, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        lstm_settings = dict(
            input_size=n_features - 3,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )
        self.lstm = nn.LSTM(**lstm_settings)

        self.model_name = 'ordinal'
        self.learning_rate = lr
        self.batch_size = batch_size

        self.output = nn.Linear(hidden_size, 1)

        self.n_sequences = n_seqeuences
        self.zero_heuristic = zero_heuristic

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per row, computed from the LSTM output at the last step."""
        sequence_out, _ = self.lstm(x)
        last_step = sequence_out[:, -1]
        return self.output(last_step)

class LSTMEmbedUserProject(ModelBase):
    """Two-branch model: one LSTM over ordinal features, one over user+project embeddings.

    The last-step outputs of both LSTMs are concatenated and passed through a small
    feed-forward trunk that produces a single logit.

    Args:
        n_features: Total feature count; the ordinal LSTM input width is ``n_features - 3``.
        n_sequences: Sequence length stored on the instance.
        embedding_matrix: Mapping with ``'user_id'`` and ``'project_id'`` entries, each
            ``(number_of_ids, embedding_dim)``.
        hidden_size: Hidden width of the ordinal LSTM.
        dropout: Dropout applied between LSTM layers.
        lr: Learning rate stored as ``self.learning_rate``.
        batch_size: Stored on the instance.
        zero_heuristic: Stored on the instance; read by ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_sequences, embedding_matrix, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        user_dim = embedding_matrix['user_id'][1]
        project_dim = embedding_matrix['project_id'][1]
        # Hidden width of the embedding LSTM. The trunk below must use this same
        # value: it was previously sized with ``// 3`` while the LSTM used ``// 2``,
        # which made forward() fail whenever the two differed.
        embedding_hidden = (user_dim + project_dim) // 2

        self.lstm_ordinal = nn.LSTM(
            input_size=n_features - 3,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )

        # +1 on num_embeddings: index 0 is reserved as the padding index.
        self.user_embedding = nn.Embedding(
            num_embeddings=embedding_matrix['user_id'][0] + 1,
            embedding_dim=user_dim,
            padding_idx=0
        )

        self.project_embedding = nn.Embedding(
            num_embeddings=embedding_matrix['project_id'][0] + 1,
            embedding_dim=project_dim,
            padding_idx=0
        )

        self.lstm_embedding = nn.LSTM(
            input_size=user_dim + project_dim,
            hidden_size=embedding_hidden,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )

        self.learning_rate = lr
        self.batch_size = batch_size
        self.model_name = 'embedded'

        trunk_linear = hidden_size + embedding_hidden

        self.out_trunk = nn.Sequential(
            nn.Linear(trunk_linear, 20),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(20, 1)
        )

        self.n_sequences = n_sequences
        self.zero_heuristic = zero_heuristic

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per row.

        ``x`` is split on its last axis at ``ORDINAL_FEATURE_INDEX``: the leading
        columns feed the ordinal LSTM; of the remaining columns, the first is used
        as the user id and the second as the project id.
        """
        ordinal_features = x[:, :, :ORDINAL_FEATURE_INDEX]
        categorical_features = x[:, :, ORDINAL_FEATURE_INDEX:].int()

        user, project = categorical_features[:, :, 0], categorical_features[:, :, 1]

        user, project = self.user_embedding(user), self.project_embedding(project)
        categorical_features = torch.cat((user, project), dim=2)

        ordinal_out = self.lstm_ordinal(ordinal_features)
        categorical_out = self.lstm_embedding(categorical_features)

        # [0] selects the per-step outputs; [:, -1] keeps the last step only.
        return self.out_trunk(torch.cat((ordinal_out[0][:, -1], categorical_out[0][:, -1]), dim=1))
    

class LSTMEmbedUser(ModelBase):
    """Two-branch model: one LSTM over ordinal features, one over user embeddings.

    The last-step outputs of both LSTMs are concatenated and fed to a small
    feed-forward trunk that emits a single logit.

    Args:
        n_features: Total feature count; the ordinal LSTM input width is ``n_features - 3``.
        n_sequences: Sequence length stored on the instance.
        embedding_matrix: Mapping whose ``'user_id'`` entry is
            ``(number_of_ids, embedding_dim)``.
        hidden_size: Hidden width of the ordinal LSTM.
        dropout: Dropout applied between LSTM layers.
        lr: Learning rate stored as ``self.learning_rate``.
        batch_size: Stored on the instance.
        zero_heuristic: Stored on the instance; read by ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_sequences, embedding_matrix, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        user_dim = embedding_matrix['user_id'][1]
        # The embedding LSTM is a third as wide as the embedding, but never below 3.
        user_hidden = max(3, user_dim // 3)

        self.lstm_ordinal = nn.LSTM(
            input_size=n_features - 3,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )

        self.user_embedding = nn.Embedding(
            num_embeddings=embedding_matrix['user_id'][0] + 1,
            embedding_dim=user_dim,
            padding_idx=0,
        )

        self.lstm_embedding = nn.LSTM(
            input_size=user_dim,
            hidden_size=user_hidden,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )

        self.model_name = 'embedded_user'
        self.learning_rate = lr
        self.batch_size = batch_size

        self.out_trunk = nn.Sequential(
            nn.Linear(hidden_size + user_hidden, 20),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(20, 1),
        )

        self.n_sequences = n_sequences
        self.zero_heuristic = zero_heuristic

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per row from the concatenated last-step LSTM outputs."""
        ordinal_part = x[:, :, :ORDINAL_FEATURE_INDEX]
        id_part = x[:, :, ORDINAL_FEATURE_INDEX:].int()

        embedded_user = self.user_embedding(id_part[:, :, 0])

        ordinal_seq, _ = self.lstm_ordinal(ordinal_part)
        user_seq, _ = self.lstm_embedding(embedded_user)

        joined = torch.cat((ordinal_seq[:, -1], user_seq[:, -1]), dim=1)
        return self.out_trunk(joined)


class LSTMEmbedOneLSTM(ModelBase):
    """Single LSTM over ordinal features concatenated with user embeddings.

    Args:
        n_features: Total feature count; ``n_features - 3`` ordinal columns are
            combined with the user embedding to form the LSTM input.
        n_sequences: Sequence length stored on the instance.
        embedding_matrix: Mapping whose ``'user_id'`` entry is
            ``(number_of_ids, embedding_dim)``.
        hidden_size: LSTM hidden width.
        dropout: Dropout applied between LSTM layers.
        lr: Learning rate stored as ``self.learning_rate``.
        batch_size: Stored on the instance.
        zero_heuristic: Stored on the instance; read by ``ModelBase._run_step``.
    """

    def __init__(self, n_features, n_sequences, embedding_matrix, hidden_size=32, dropout=0.2, lr=0.01, batch_size=256, zero_heuristic=False) -> None:
        super().__init__()

        user_dim = embedding_matrix['user_id'][1]

        self.lstm_combined = nn.LSTM(
            input_size=(n_features - 3) + user_dim,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )

        self.user_embedding = nn.Embedding(
            num_embeddings=embedding_matrix['user_id'][0] + 1,
            embedding_dim=user_dim,
            padding_idx=0,
        )

        self.model_name = 'embedded_one_lstm'
        self.learning_rate = lr
        self.batch_size = batch_size

        self.out = nn.Linear(hidden_size, 1)

        self.n_sequences = n_sequences
        self.zero_heuristic = zero_heuristic

        self.save_hyperparameters()

    def forward(self, x):
        """Return one logit per row from the LSTM output at the last step."""
        ordinal_part = x[:, :, :ORDINAL_FEATURE_INDEX]
        id_part = x[:, :, ORDINAL_FEATURE_INDEX:].int()

        embedded_user = self.user_embedding(id_part[:, :, 0])
        combined = torch.cat((ordinal_part, embedded_user), dim=2)

        sequence_out, _ = self.lstm_combined(combined)
        return self.out(sequence_out[:, -1])


In [8]:
# Ascending upper edges of the buckets used by adjacency_matrix() and
# cosine_similarity(). The entry between 1000 and 3000 was a duplicate 200, which
# broke the ordering the bucket filter relies on; it is now 2000.
# NOTE(review): the run_lstm_model cell further down rebinds USER_CATEGORIES to a
# shorter list, and that later value is what the functions see at call time.
USER_CATEGORIES = [0, 50, 100, 150, 200, 300, 500, 1000, 2000, 3000, 5000, 7500, 10000, 20000]

In [9]:
import torch
def adjacency_matrix():
    """Build an empty result table keyed by unordered pairs of USER_CATEGORIES.

    A pair ``(a, b)`` is only added when its mirror ``(b, a)`` is not already
    present, so each unordered pair (including ``(a, a)``) appears once. Every
    value is a dict with empty ``'mean'`` and ``'std'`` lists.
    """
    pair_stats = {}
    for outer in USER_CATEGORIES:
        for inner in USER_CATEGORIES:
            if (inner, outer) in pair_stats:
                continue
            pair_stats[(outer, inner)] = {'mean': [], 'std': []}
    return pair_stats


def shuffle_indexes(tensor):
    """Return ``tensor`` with its first dimension in a random order."""
    row_count = tensor.shape[0]
    return tensor[torch.randperm(row_count)]

In [10]:
import tqdm
def cosine_similarity(model, val_loader, test_loader):
    """Collect per-batch cosine-similarity statistics between bucketed embeddings.

    For every pair of buckets present in ``adjacency_matrix()``, rows of the
    validation batch falling in the first bucket and rows of the test batch falling
    in the second bucket are embedded with ``model.user_embedding`` and compared.
    The mean and standard deviation of the similarities are appended per batch.

    Bucket ``i`` holds rows whose first selected column is in
    ``(USER_CATEGORIES[i - 1], USER_CATEGORIES[i]]``. The first bucket has no lower
    bound; previously index ``-1`` wrapped to the last edge, which left the first
    bucket permanently empty.

    Args:
        model: Module exposing a ``user_embedding`` layer.
        val_loader: Iterable of 2-D batches; also supplies the progress-bar length.
        test_loader: Iterable of 2-D batches, consumed in lock-step with ``val_loader``.

    Returns:
        dict mapping ``(bucket_1, bucket_2)`` to ``{'mean': [...], 'std': [...]}``.
    """
    # NOTE(review): columns 1 and 3 are both passed to ``user_embedding``, while
    # ModelBase._extract_features reads the user id from column 2 — confirm that
    # these are the intended columns.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    matrix = adjacency_matrix()
    cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6).to(device)
    model = model.to(device)

    def rows_in_bucket(rows, group_index):
        """Select the rows whose first column falls inside bucket ``group_index``."""
        in_bucket = rows[:, 0] <= USER_CATEGORIES[group_index]
        if group_index > 0:
            in_bucket = in_bucket & (rows[:, 0] > USER_CATEGORIES[group_index - 1])
        return rows[in_bucket]

    pbar_dataset = tqdm.tqdm(zip(val_loader, test_loader), total=len(val_loader))
    # Only statistics are extracted, so no autograd graph is needed.
    with torch.no_grad():
        for indx, (batch_val, batch_test) in enumerate(pbar_dataset):
            pbar_dataset.set_description(f"cosine similarity {indx}")

            # Slice and move each batch once instead of once per bucket pair.
            val_rows = batch_val[:, [1, 3]].to(device).long()
            test_rows = batch_test[:, [1, 3]].to(device).long()

            for g_index_1, u_group_1 in enumerate(USER_CATEGORIES):
                for g_index_2, u_group_2 in enumerate(USER_CATEGORIES):
                    if (u_group_1, u_group_2) not in matrix:
                        continue

                    tensor_group_1 = rows_in_bucket(val_rows, g_index_1)
                    tensor_group_2 = rows_in_bucket(test_rows, g_index_2)

                    # Compare row-for-row, so truncate both sides to the shorter one.
                    pair_count = min(tensor_group_1.shape[0], tensor_group_2.shape[0])
                    if pair_count == 0:
                        continue
                    tensor_group_1 = tensor_group_1[:pair_count]
                    tensor_group_2 = tensor_group_2[:pair_count]

                    embedding_g_1 = model.user_embedding(tensor_group_1)
                    embedding_g_2 = model.user_embedding(tensor_group_2)
                    similarity = cos_sim(embedding_g_1, embedding_g_2)

                    matrix[(u_group_1, u_group_2)]['mean'].append(similarity.mean().item())
                    matrix[(u_group_1, u_group_2)]['std'].append(similarity.std().item())

    return matrix

In [11]:
# %load run_lstm_model.py
# %load run_lstm_model.py
import os
from datetime import datetime

import boto3
import numpy as np
import torch

S3_BUCKET = 's3://dissertation-data-dmiller'
SNS_TOPIC = 'arn:aws:sns:eu-west-1:774141665752:gradient-task'

USER_INDEX = 9
PROJECT_INDEX = 10
COUNTRY_INDEX = 11
# NOTE: this rebinds the USER_CATEGORIES list defined in an earlier cell;
# adjacency_matrix() and cosine_similarity() read the global at call time, so this
# shorter list is the one they use once this cell has run.
USER_CATEGORIES = [100, 500, 1000, 5000, 10000]

# Embedding table sizes keyed by the number of files used (as a string).
# Each entry is (number of unique ids, embedding width). The '30' and '45' widths
# are the integer fourth root of the id count; the '5' and '61' widths are literals.
# NOTE(review): the original note here said "cube root" — confirm the intended rule.
# Id counts must be ints: they are used as ``num_embeddings`` (the '30' project
# count was previously the float ``617.``).
EMBEDDING_DIMS = {
    '5': {
        'user_id': (17891, 2),
        'project_id': (328, 1),
    },
    '30': {
        'user_id': (60459, int(60459**0.25)),
        'project_id': (617, int(617**0.25)),
    },
    '45': {
        'user_id': (85663, int(85663**0.25)),
        'project_id': (757, int(757**0.25)),
    },
    '61': {
        'user_id': (104744, 18),
        'project_id': (846, 6),
    }
}

def get_model(
    model_type,
    n_features,
    n_sequences,
    hidden_size,
    dropout,
    learning_rate,
    batch_size,
    zero_heuristic,
    n_files
):
    """Instantiate the model class selected by ``model_type``.

    ``'ordinal'`` -> LSTMOrdinal, ``'embed_user_project'`` -> LSTMEmbedUserProject,
    ``'embed_user'`` -> LSTMEmbedUser; any other value falls through to
    LSTMEmbedOneLSTM. The embedding models look up their table sizes in
    ``EMBEDDING_DIMS[str(n_files)]``.

    ``zero_heuristic`` is now forwarded to every model; previously only the
    ordinal model received it and the embedding models silently used ``False``.

    Raises:
        KeyError: an embedding model is requested for an ``n_files`` value that has
            no entry in ``EMBEDDING_DIMS``.
    """
    if model_type == 'ordinal':
        print('Creating LSTMOrdinal model')
        return LSTMOrdinal(
            n_features,
            n_sequences,
            hidden_size,
            dropout,
            learning_rate,
            batch_size,
            zero_heuristic
        )
    elif model_type == 'embed_user_project':
        return LSTMEmbedUserProject(
            n_features,
            n_sequences,
            EMBEDDING_DIMS[str(n_files)],
            hidden_size,
            dropout,
            learning_rate,
            batch_size,
            zero_heuristic
        )
    elif model_type == 'embed_user':
        return LSTMEmbedUser(
            n_features,
            n_sequences,
            EMBEDDING_DIMS[str(n_files)],
            hidden_size,
            dropout,
            learning_rate,
            batch_size,
            zero_heuristic
        )
    else:
        return LSTMEmbedOneLSTM(
            n_features,
            n_sequences,
            EMBEDDING_DIMS[str(n_files)],
            hidden_size,
            dropout,
            learning_rate,
            batch_size,
            zero_heuristic
        )



def main(args):
    """Load data and a trained checkpoint, then compute the cosine-similarity matrix.

    Args:
        args: Object with the attributes read below (``data_input_path``, ``n_files``,
            ``n_sequences``, ``data_partition``, ``batch_size``, ``n_features``,
            ``model_type``, ``hidden_size``, ``dropout``, ``learning_rate``,
            ``zero_heuristic``, ``checkpoint``).

    Returns:
        The dict produced by ``cosine_similarity`` for the validation and test loaders.
    """
    np.set_printoptions(linewidth=400)
    torch.set_printoptions(precision=4, sci_mode=False)

    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    print(f'Running experiment at {date_time}')

    # No S3 client is supplied: the shards are expected to be available locally.
    npz_extractor = NPZExtractor(
        args.data_input_path,
        args.n_files,
        args.n_sequences,
        None,
        args.data_partition)

    clickstream_data_loader = ClickstreamDataModule(
        npz_extractor.get_dataset_pointer(),
        args.batch_size,
        args.n_sequences + 1,
        args.n_features)

    model = get_model(
        args.model_type,
        args.n_features,
        args.n_sequences + 1,
        args.hidden_size,
        args.dropout,
        args.learning_rate,
        args.batch_size,
        args.zero_heuristic,
        args.n_files
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    checkpoint_path = os.path.join(S3_BUCKET, args.checkpoint)
    print(f'Loading checkpoint from {checkpoint_path}')
    # load_from_checkpoint is a classmethod that RETURNS the restored model. The
    # result used to be discarded, so the similarity was computed on the freshly
    # initialised (untrained) weights built above.
    model = type(model).load_from_checkpoint(checkpoint_path, map_location=device)
    model.eval()
    model.to(device)

    val_loader = clickstream_data_loader.val_dataloader()
    test_loader = clickstream_data_loader.test_dataloader()
    print('Identifying cosine similarity for validation loader')
    cos_sim_matrix = cosine_similarity(model, val_loader, test_loader)
    return cos_sim_matrix

In [12]:
import json

class Arguments:
    """Static configuration passed to ``main`` in place of parsed CLI arguments."""

    # Model selection and hyperparameters
    model_type = 'embed_user'
    hidden_size = 32
    dropout = 0.2
    learning_rate = 0.001
    zero_heuristic = False

    # Data shape and location
    n_sequences = 10
    n_features = 20
    n_files = 61
    data_input_path = 'torch_ready_data_4'
    data_partition = None

    # Loader / training loop settings
    batch_size = 16684
    n_workers = 8
    n_epochs = 100
    progress_bar = True
    find_hparams = False

    # Checkpoint key, joined onto the S3 bucket inside main()
    checkpoint = 'lstm_experiments/checkpoints/61/embed_user/sequence_length_40/None/2023_02_15_09_43/clickstream-epoch=05-loss_valid=0.60.ckpt'

In [13]:
# Run the analysis, then stringify the tuple keys so the result is JSON-serialisable.
cos_sim_matrix = {
    str(group_pair): stats
    for group_pair, stats in main(Arguments).items()
}

with open('cos_sim_matrix_3.json', 'w') as f:
    f.write(json.dumps(cos_sim_matrix))

Running experiment at 2023_02_16_14_28
Loading pointer to dataset: torch_ready_data_4/files_used_61/sequence_index_0: derived from torch_ready_data_4/files_used_61/sequence_index_0.npz
Loading pointer to dataset: torch_ready_data_4/files_used_61/sequence_index_10: derived from torch_ready_data_4/files_used_61/sequence_index_10.npz
Loading: torch_ready_data_4/files_used_61/sequence_index_0/arr_0.npy
Loading: torch_ready_data_4/files_used_61/sequence_index_10/arr_0.npy
Loading checkpoint from s3://dissertation-data-dmiller/lstm_experiments/checkpoints/61/embed_user/sequence_length_40/None/2023_02_15_09_43/clickstream-epoch=05-loss_valid=0.60.ckpt
Identifying cosine similarity for validation loader


cosine similarity 354: 100%|██████████| 355/355 [01:33<00:00,  3.80it/s]
