In [1]:
!pip install torch python-dotenv boto3 --quiet 

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.4.0 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.29.118 which is incompatible.[0m[31m
[0m

In [2]:
# Load the python-dotenv IPython extension, then read environment variables
# from the file named `env`.
# NOTE(review): presumably this supplies the AWS credentials used by boto3 later on — confirm.
%load_ext dotenv
%dotenv env

In [3]:
# %load constant
# Column-name groups; TORCH_LOAD_COLS at the bottom is their concatenation.

# Single label column.
LABEL = ["session_terminates_30_minutes"]

# Per-row identifier / bookkeeping columns.
METADATA = [
    "user_id",
    "session_30_raw",
    "cum_session_event_raw",
    "cum_session_time_raw",
    "cum_platform_event_raw",
    "global_events_user",
    "global_session_time_minutes",
]

# Timestamp column.
DATE_TIME = ["date_time"]

# The 18 feature columns.
OUT_FEATURE_COLUMNS = [
    "country_count",
    "timestamp_raw",
    "date_hour_sin",
    "date_hour_cos",
    "session_5_count",
    "session_30_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time_minutes",
    "expanding_click_average",
    "cum_platform_time_minutes",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "session_event_count",
]

# Order matters: label, metadata, timestamp, then features.
TORCH_LOAD_COLS = [*LABEL, *METADATA, *DATE_TIME, *OUT_FEATURE_COLUMNS]

In [4]:
# %load data_module
import pdb

import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset

# Module-level constants of the data module. None of them is referenced by the
# code in the cells shown here.
LABEL_INDEX, TOTAL_EVENTS_INDEX = 1, 2
BATCHES = 1_000_000

    
class ClickstreamDataset(Dataset):
    """Map-style dataset that stitches rows from several row-aligned arrays.

    Item ``idx`` is row ``idx`` of every array in ``dataset_pointer_list``,
    joined side by side along axis 1. For 2-D input arrays the result has
    shape ``(1, total_columns)``. The dataset length is the number of rows of
    the first array in the list.
    """

    def __init__(self, dataset_pointer_list) -> None:
        self.events = dataset_pointer_list
        first_array = self.events[0]
        self.size = first_array.shape[0]

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        # Wrap each row in a leading axis of length 1 so rows can be joined
        # column-wise with a single concatenate call.
        rows = []
        for array in self.events:
            rows.append(np.array([array[idx]]))
        return np.concatenate(rows, axis=1)


In [5]:
# %load torch_model_bases
import torch 
from torch import nn
N_FEATURES = 18  # input width of every timestep fed to the LSTM


class LSTMOrdinal(nn.Module):
    """Two-layer, batch-first LSTM followed by a single-unit linear head.

    Args:
        hidden_size: width of the LSTM hidden state (and of the head's input).
        dropout: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, hidden_size=32, dropout=0.2) -> None:
        super().__init__()

        lstm_config = dict(
            input_size=N_FEATURES,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )
        self.lstm = nn.LSTM(**lstm_config)
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the last timestep to one value per row."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1]
        return self.output(last_step)

In [6]:
# %load npz_extractor
import logging
import os
import zipfile

import boto3
import numpy as np
import torch
import logging

class NPZExtractor:
    """Materialise ``sequence_index_<i>.npz`` archives as memory-mapped arrays.

    For every sequence index ``0, 10, 20, ... <= n_sequences`` the extractor makes
    sure ``<input_path>/files_used_<n_files>/sequence_index_<i>/arr_0.npy`` exists
    (downloading the ``.npz`` from S3 and unzipping it when needed) and then opens
    each array with ``np.load(..., mmap_mode='r')``.

    Args:
        input_path: local root directory of the prepared data.
        n_files: number used to build the ``files_used_<n_files>`` sub-directory.
        n_sequences: highest sequence index to load (stepped in tens).
        s3_client: object exposing ``download_file(bucket, key, filename)``; may be
            ``None`` when every required file is already present locally.
        data_partition: if truthy, each array is truncated to its first
            ``data_partition`` rows.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, input_path, n_files, n_sequences, s3_client, data_partition) -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.data_partition = data_partition

    def _sequence_indices(self):
        """Return the sequence indices handled by this extractor (0, 10, ..., <= n_sequences)."""
        return range(0, self.n_sequences + 1, 10)

    def get_dataset_pointer(self):
        """Ensure all arrays are extracted locally and return them as a list of memmaps.

        Raises:
            FileNotFoundError: an archive is needed but is missing locally and no
                ``s3_client`` was supplied to download it.
        """
        read_path = os.path.join(self.input_path, f'files_used_{self.n_files}')
        if not os.path.exists(read_path):
            self.logger.info(f'Creating directory: {read_path}')
            os.makedirs(read_path, exist_ok=True)

        for sequence_index in self._sequence_indices():
            key_zip = os.path.join(read_path, f'sequence_index_{sequence_index}.npz')
            key_npy = os.path.join(read_path, f'sequence_index_{sequence_index}')

            self.logger.info(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')

            # The extracted array is all that is needed; when it is already on
            # disk the archive must not be (re-)downloaded.
            if os.path.exists(os.path.join(key_npy, 'arr_0.npy')):
                continue

            if not os.path.exists(key_zip):
                if self.s3_client is None:
                    raise FileNotFoundError(
                        f'{key_zip} is not available locally and no s3_client was provided to download it'
                    )
                self.logger.info(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
                # The local relative path doubles as the object key in the bucket.
                self.s3_client.download_file(
                    'dissertation-data-dmiller',
                    key_zip,
                    key_zip
                )

            self.logger.info(f'Zip file downloaded: {key_zip}: npy file to load: {key_npy}')
            self._zip_extract(key_zip, key_npy)

        lz_concatenated_results = self._lazy_concatenate()

        if self.data_partition:
            return [p[:self.data_partition] for p in lz_concatenated_results]
        return lz_concatenated_results

    def _zip_extract(self, key_zip, key_npy):
        """Extract ``arr_0.npy`` from the archive ``key_zip`` into the directory ``key_npy``."""
        self.logger.info(f'Extracting file: {key_zip} -> {key_npy}')

        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=key_npy, members=['arr_0.npy'])

        self.logger.info(f'Zip file exracted: {key_zip} -> {key_npy}/arr_0.npy')

    def _lazy_concatenate(self):
        """Open every extracted array read-only as a memory map and return them in index order."""
        lz_concat = []
        for sequence_index in self._sequence_indices():
            path_to_load = os.path.join(
                self.input_path,
                f'files_used_{self.n_files}',
                f'sequence_index_{sequence_index}',
                'arr_0.npy',
            )
            self.logger.info(f'Loading: {path_to_load}')
            lz_concat.append(np.load(path_to_load, mmap_mode='r'))
        return lz_concat

In [7]:
# %load likelihood_engagement_cpu
import torch
import os
import argparse
import logging
import boto3
from pprint import pformat
from torch import nn
import io
import tqdm
import numpy as np


# Columns shown in the post-join preview: label, metadata, timestamp and the model output.
CHECK_COLS = [*LABEL, *METADATA, *DATE_TIME, 'prediction']

In [21]:


# Display / logging configuration for the notebook.
torch.set_printoptions(sci_mode=False, linewidth=400, precision=2)
np.set_printoptions(suppress=True, precision=4, linewidth=200)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

# Pick the dataframe backend: on a CUDA machine `pd` is bound to cuDF, otherwise
# to pandas. Every later `pd.` call in this notebook therefore goes through
# whichever library was selected here.
if torch.cuda.is_available():
    import cudf as pd
    # Plain pandas is still imported under a different name to set display options.
    import pandas as cpu_pd
    cpu_pd.set_option('display.max_columns', 500)
    cpu_pd.set_option('display.width', 1000)
    
    import numpy as np
else:
    import pandas as pd
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    # NOTE(review): display.precision is only set on the CPU branch.
    pd.set_option('display.precision', 4)
    import numpy as np
    
    

# S3 URI of the checkpoint directory; used as the default for the `checkpoint_dir` argument.
CHECKPOINT_DIR='s3://dissertation-data-dmiller/lstm_experiments/checkpoints/data_v1/n_files_30/ordinal/sequence_length_10/data_partition_None/2023_03_30_07_54'
# Column at which `_extract_features` splits each row into metadata (before) and features (from here on).
METADATA_INDEX = 12
logger = logging.getLogger('likelihood_engagement')

def parse_args(argv=None):
    """Parse the command-line options of the static-prediction step.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            ``sys.argv[1:]`` is parsed, exactly as before. Pass ``[]`` to get the
            defaults inside a Jupyter kernel, where ``sys.argv`` holds the
            kernel's own arguments and would make argparse exit.

    Returns:
        argparse.Namespace with ``n_files``, ``n_sequences``, ``file_path``,
        ``checkpoint_dir``, ``write_path`` and ``model_type``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--n_files', type=int, default=2)
    parser.add_argument('--n_sequences', type=int, default=10)
    parser.add_argument('--file_path', type=str, default='datasets/torch_ready_data')
    parser.add_argument('--checkpoint_dir', type=str, default=CHECKPOINT_DIR)
    parser.add_argument('--write_path', type=str, default='datasets/lstm_predictions')
    parser.add_argument('--model_type', type=str, default='ordinal')
    return parser.parse_args(argv)

def _extract_features(tensor, n_sequences, n_features):
    """Split a batch into metadata columns and a sequence-shaped feature tensor.

    Dimension 1 of ``tensor`` is squeezed away, the first ``METADATA_INDEX``
    columns are returned as metadata, and the remaining columns are reshaped to
    ``(batch, n_sequences, n_features)`` with the sequence axis reversed.

    Returns:
        Tuple ``(metadata, features)``.
    """
    flat = tensor.squeeze(1)
    metadata = flat[:, :METADATA_INDEX]
    flat_features = flat[:, METADATA_INDEX:]

    batch_size = flat_features.shape[0]
    sequences = flat_features.reshape(batch_size, n_sequences, n_features)

    # Reverse the order of the timesteps.
    return metadata, sequences.flip(dims=[1])

@torch.no_grad()
def generate_static_predictions(args):
    """Score every event with the trained LSTM and write the predictions to parquet.

    Steps: open the memory-mapped arrays via ``NPZExtractor``, download the model
    checkpoint from S3, run the model batch by batch, apply a sigmoid to the
    output and write ``<write_path>/files_used_<n_files>/<model_type>_seq_<n_sequences>/predictions.parquet``
    with the columns ``user_label, user_id, session_id, event_id, prediction``.

    Args:
        args: object exposing ``file_path``, ``n_files``, ``n_sequences``,
            ``write_path`` and ``model_type``. ``args.checkpoint_dir`` is not
            read: the checkpoint object key is hard-coded below.
    """
    # One device for model and batches, so the function also runs on CPU-only hosts.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    user_metadata_container = []

    logger.info('Generating static prediction likelihoods for experiment')
    npz_extractor = NPZExtractor(
        args.file_path,
        args.n_files,
        args.n_sequences,
        None,  # no S3 client: the arrays are expected to be available locally
        None   # no data partition: use every row
    )

    logger.info('generating dataset pointer')
    dataset = npz_extractor.get_dataset_pointer()

    logger.info('Downloading model checkpoint')

    write_path = os.path.join(args.write_path, f'files_used_{args.n_files}/{args.model_type}_seq_{args.n_sequences}')
    if not os.path.exists(write_path):
        logger.info(f'Creating directory: {write_path}')
        os.makedirs(write_path)

    client = boto3.client('s3')

    checkpoint = client.get_object(
        Bucket='dissertation-data-dmiller',
        Key='lstm_experiments/checkpoints/data_v1/n_files_30/ordinal/sequence_length_10/data_partition_None/2023_03_30_07_54/clickstream-epoch=83-loss_valid=0.29.ckpt'
    )

    logger.info('Loading model checkpoint')

    buffer = io.BytesIO(checkpoint['Body'].read())

    logger.info('checkpoint loaded from buffer. Loading model')
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted bucket.
    model_state = torch.load(buffer, map_location=device)
    model = LSTMOrdinal()
    model.load_state_dict(model_state['state_dict'])
    model = model.to(device)
    # Inference mode: without this the LSTM's inter-layer dropout stays active
    # and the predictions are randomly perturbed.
    model.eval()
    logger.info(f'Model loaded. Creating dataset: n_events {dataset[0].shape[0]}')

    dataset = ClickstreamDataset(dataset)
    logger.info(f'Dataset created. Creating loader')
    loader = DataLoader(dataset, batch_size=65536, shuffle=False)

    p_bar = tqdm.tqdm(loader)

    for indx, data in enumerate(p_bar):
        p_bar.set_description(f'Processing batch: {indx}')
        metadata, features = _extract_features(data, args.n_sequences + 1, N_FEATURES)
        metadata, features = metadata.to(device), features.to(device)
        # The first four metadata columns are the ones named in the DataFrame below.
        user_metadata = metadata[:, :4]
        preds = torch.sigmoid(model(features))
        user_metadata = torch.cat([user_metadata, preds], dim=1)
        user_metadata_container.append(user_metadata.cpu().numpy())

    user_metadata = np.concatenate(user_metadata_container, axis=0)
    user_metadata = pd.DataFrame(user_metadata, columns=['user_label', 'user_id', 'session_id', 'event_id', 'prediction'])

    logger.info(f'Writing predictions to {write_path}/predictions.parquet')

    user_metadata.to_parquet(f'{write_path}/predictions.parquet')


def join_pred_args(argv=None):
    """Parse the command-line options of the prediction-join step.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            ``sys.argv[1:]`` is parsed, exactly as before. Pass ``[]`` to get the
            defaults inside a Jupyter kernel, where ``sys.argv`` holds the
            kernel's own arguments and would make argparse exit.

    Returns:
        argparse.Namespace with ``n_files``, ``n_sequences``, ``model_type`` and
        ``rl_data``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_files', type=int, default=2)
    parser.add_argument('--n_sequences', type=int, default=10)
    parser.add_argument('--model_type', type=str, default='ordinal')
    parser.add_argument('--rl_data', type=str, default='datasets/rl_ready_data')

    return parser.parse_args(argv)

def join_predictions_on_original(args):
    """Join the model predictions onto the original feature table and write the result.

    The predictions parquet is joined (left join, predictions on the left) with
    the feature parquet on ``user_id``, ``session_30_raw`` and
    ``cum_session_event_raw``. The result is written to
    ``<rl_data>/files_used_<n_files>/<model_type>_seq_<n_sequences>/rl_ready_data.parquet``.

    Args:
        args: object exposing ``n_files``, ``n_sequences``, ``model_type`` and
            ``rl_data``.
    """
    join_keys = ['user_id', 'session_30_raw', 'cum_session_event_raw']

    predictions_path = f'lstm_predictions/files_used_{args.n_files}/{args.model_type}_seq_{args.n_sequences}/predictions.parquet'
    dataset_path = f'calculated_features/files_used_{args.n_files}.parquet'
    # Without CUDA the inputs are read from under the local `datasets/` directory.
    if not torch.cuda.is_available():
        predictions_path, dataset_path = (
            os.path.join('datasets', predictions_path),
            os.path.join('datasets', dataset_path)
        )
    logger.info(f'Loading predictions from {predictions_path}')
    logger.info(f'Loading dataset from {dataset_path}')

    predictions, original = (
        pd.read_parquet(predictions_path),
        pd.read_parquet(dataset_path, columns=TORCH_LOAD_COLS)
    )

    # Align the prediction column names with the key columns of the original table.
    predictions = predictions.rename(columns={
        'session_id': 'session_30_raw',
        'event_id': 'cum_session_event_raw'
    })

    logger.info(f'Shape of predictions: {predictions.shape}')
    logger.info(f'Shape of original: {original.shape}')

    logger.info(f'Joining predictions on original dataset')

    n_predictions = predictions.shape[0]
    predictions = predictions.set_index(join_keys) \
        .join(original.set_index(join_keys)) \
        .reset_index() \
        .drop(columns=['user_label'])

    # A left join only changes the row count when keys are duplicated on the
    # right-hand side; surface that instead of silently writing extra rows.
    if predictions.shape[0] != n_predictions:
        logger.warning(
            f'Row count changed during join: {n_predictions} -> {predictions.shape[0]}'
        )

    logger.info(f'Predictions joined: {predictions.shape}: columns')
    logger.info(pformat(predictions.columns.tolist()))

    logger.info(predictions[CHECK_COLS].head(10))

    write_path = os.path.join(
        args.rl_data,
        f'files_used_{args.n_files}',
        f'{args.model_type}_seq_{args.n_sequences}'
    )

    if not os.path.exists(write_path):
        logger.info(f'Creating directory: {write_path}')
        os.makedirs(write_path, exist_ok=True)

    # Build the target once so the log line always names the file actually written.
    output_file = f'{write_path}/rl_ready_data.parquet'
    logger.info(f'Writing joined predictions to {output_file}')
    predictions.to_parquet(output_file)
   
    


In [22]:
class StaticPredArgs:
    """Attribute bag passed to ``generate_static_predictions`` in place of parsed CLI arguments."""

    # Data selection.
    n_files, n_sequences = 30, 10
    model_type = 'ordinal'

    # Locations.
    file_path = 'torch_ready_data'
    write_path = 'lstm_predictions'
    checkpoint_dir = CHECKPOINT_DIR
    
class JoinedPredArgs:
    """Attribute bag passed to ``join_predictions_on_original`` in place of parsed CLI arguments."""

    # Data selection.
    n_files, n_sequences = 30, 10
    model_type = 'ordinal'

    # Output root directory.
    rl_data = 'rl_ready_data'
    

In [23]:
# Score all events with the LSTM; the class itself (not an instance) is passed as the args object.
generate_static_predictions(StaticPredArgs)

2023-04-24 15:47:02,166 Generating static prediction likelihoods for experiment
2023-04-24 15:47:02,167 generating dataset pointer
2023-04-24 15:47:02,168 Loading pointer to dataset: torch_ready_data/files_used_30/sequence_index_0: derived from torch_ready_data/files_used_30/sequence_index_0.npz
2023-04-24 15:47:02,170 Loading pointer to dataset: torch_ready_data/files_used_30/sequence_index_10: derived from torch_ready_data/files_used_30/sequence_index_10.npz
2023-04-24 15:47:02,171 Loading: torch_ready_data/files_used_30/sequence_index_0/arr_0.npy
2023-04-24 15:47:02,211 Loading: torch_ready_data/files_used_30/sequence_index_10/arr_0.npy
2023-04-24 15:47:02,247 Downloading model checkpoint
2023-04-24 15:47:02,249 Creating directory: lstm_predictions/files_used_30/ordinal_seq_10
2023-04-24 15:47:02,263 Found credentials in environment variables.
2023-04-24 15:47:03,166 Loading model checkpoint
2023-04-24 15:47:03,397 checkpoint loaded from buffer. Loading model
2023-04-24 15:47:04,627

In [24]:
# Join the predictions written by generate_static_predictions back onto the feature table.
join_predictions_on_original(JoinedPredArgs)

2023-04-24 15:55:25,668 Loading predictions from lstm_predictions/files_used_30/ordinal_seq_10/predictions.parquet
2023-04-24 15:55:25,669 Loading dataset from calculated_features/files_used_30.parquet
2023-04-24 15:55:32,964 Shape of predictions: (38500990, 5)
2023-04-24 15:55:32,965 Shape of original: (38500990, 27)
2023-04-24 15:55:32,966 Joining predictions on original dataset
2023-04-24 15:55:33,389 Predictions joined: (38500990, 28): columns
2023-04-24 15:55:33,391 ['user_id',
 'session_30_raw',
 'cum_session_event_raw',
 'prediction',
 'session_terminates_30_minutes',
 'cum_session_time_raw',
 'cum_platform_event_raw',
 'global_events_user',
 'global_session_time_minutes',
 'date_time',
 'country_count',
 'timestamp_raw',
 'date_hour_sin',
 'date_hour_cos',
 'session_5_count',
 'session_30_count',
 'cum_session_event_count',
 'delta_last_event',
 'cum_session_time_minutes',
 'expanding_click_average',
 'cum_platform_time_minutes',
 'cum_platform_events',
 'cum_projects',
 'avera

In [28]:
# Upload everything under rl_ready_data/ to the project bucket.
# NOTE(review): this cell has execution count 28 while the `pip install awscli`
# cell below it has 27, so on a fresh top-to-bottom run the AWS CLI may not be installed yet.
!aws s3 cp --recursive rl_ready_data/ s3://dissertation-data-dmiller/rl_ready_data/

upload: rl_ready_data/files_used_10/ordinal_seq_10/rl_ready_data.parquet to s3://dissertation-data-dmiller/rl_ready_data/files_used_10/ordinal_seq_10/rl_ready_data.parquet
upload: rl_ready_data/files_used_30/ordinal_seq_10/rl_ready_data.parquet to s3://dissertation-data-dmiller/rl_ready_data/files_used_30/ordinal_seq_10/rl_ready_data.parquet


In [27]:
!pip install awscli -q

[0m