In [1]:
!pip install torch python-dotenv boto3 --quiet 

[0m

In [2]:
# Register the IPython magics shipped with python-dotenv.
%load_ext dotenv
# Load variables from the file named `env` into the process environment.
# NOTE(review): presumably this supplies the AWS credentials boto3 needs — confirm.
%dotenv env

In [3]:
import torch
import os
import argparse
import logging
import boto3
from pprint import pformat
from torch import nn
import io
import tqdm
import numpy as np
import pandas as pd

In [4]:
# Column-name groups used to label the arrays / frames handled in this notebook.

LABEL = ["session_terminates_30_minutes"]

METADATA = [
    "user_id",
    "session_30_raw",
    "cum_session_event_raw",
    "cum_session_time_raw",
    "cum_platform_event_raw",
    "global_events_user",
    "global_session_time_minutes",
]

DATE_TIME = ["date_time"]

# Calendar components; these columns are dropped from the frame before it is written.
DATE_COLS = ['year', 'month', 'day', 'hour', 'minute']

# Per-event feature columns (18 names).
OUT_FEATURE_COLUMNS = [
    "country_count",
    "timestamp_raw",
    "date_hour_sin",
    "date_hour_cos",
    "session_5_count",
    "session_30_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time_minutes",
    "expanding_click_average",
    "cum_platform_time_minutes",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "session_event_count",
]

PREDICTION_COLS = ['prediction']

METADATA_STAT_COLUMNS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'ended',
    'incentive_index',
    'reward',
    'n_episodes',
    'time_in_session',
]

# label + metadata + date_time + features + prediction, in that order.
TORCH_LOAD_COLS = [
    *LABEL,
    *METADATA,
    *DATE_TIME,
    *OUT_FEATURE_COLUMNS,
    *PREDICTION_COLS,
]

# Column order of the prediction frame: 12 metadata columns (the last five are
# DATE_COLS), then the prediction, then the feature columns.
_OUT_METADATA_HEAD = [
    'label',
    'user_id',
    'session_30',
    'cum_session_event_raw',
    'cum_session_time_raw',
    'glob_platform_event',
    'glob_platform_time',
]

OUT_COLUMNS = _OUT_METADATA_HEAD + DATE_COLS + PREDICTION_COLS + OUT_FEATURE_COLUMNS



In [5]:
# %load data_module
import pdb

import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset

# Module-level constants carried over from `data_module`; none of them is
# referenced by the code visible in this notebook.
LABEL_INDEX, TOTAL_EVENTS_INDEX = 1, 2
BATCHES = 1_000_000

    
class ClickstreamDataset(Dataset):
    """Map-style dataset over a list of row-aligned arrays.

    Item ``idx`` is built by taking row ``idx`` from every array, giving each
    a leading axis of length one, and joining them along axis 1.
    """

    def __init__(self, dataset_pointer_list) -> None:
        """
        Keep a reference to the arrays; the dataset length is the leading
        dimension of the first one.
        """
        self.events = dataset_pointer_list
        first_array = self.events[0]
        self.size = first_array.shape[0]

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        rows = []
        for source in self.events:
            # np.array([...]) copies the row and prepends an axis of size 1.
            rows.append(np.array([source[idx]]))
        return np.concatenate(rows, axis=1)


In [6]:
# %load torch_model_bases
import torch 
from torch import nn
N_FEATURES = 18  # input width of every time step fed to the LSTM


class LSTMOrdinal(nn.Module):
    """Two-layer LSTM followed by a linear layer producing one value per sequence.

    Args:
        hidden_size: width of the LSTM hidden state (and of the linear input).
        dropout: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, hidden_size=32, dropout=0.2) -> None:
        super().__init__()

        # Attribute names and creation order are kept: they determine the
        # state-dict keys and the order in which parameters are initialised.
        lstm_config = dict(
            input_size=N_FEATURES,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )
        self.lstm = nn.LSTM(**lstm_config)
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the final time step to one value."""
        sequence_out, _state = self.lstm(x)
        final_step = sequence_out[:, -1]
        return self.output(final_step)

In [7]:
# %load npz_extractor
import logging
import os
import zipfile

import boto3
import numpy as np
import torch
import logging

class NPZExtractor:
    """Turn ``sequence_index_{i}.npz`` archives into memory-mapped ``.npy`` arrays.

    For every sequence index ``0, 10, ..., n_sequences`` the extractor makes
    sure the archive exists locally (downloading it through ``s3_client`` when
    it does not), extracts ``arr_0.npy`` from it, and opens that file with
    ``np.load(..., mmap_mode='r')``.

    Args:
        input_path: local root directory; files live under
            ``{input_path}/files_used_{n_files}``.
        n_files: selects the ``files_used_{n_files}`` sub-directory.
        n_sequences: highest sequence index to load (inclusive).
        s3_client: object exposing ``download_file(bucket, key, filename)``;
            may be ``None`` when every archive is already on disk.
        data_partition: when truthy, every returned array is sliced to its
            first ``data_partition`` rows.
    """

    logger = logging.getLogger(__name__)

    # Member extracted from each archive (presumably written by np.savez with
    # a single positional array — confirm against the producer).
    _ARRAY_MEMBER = 'arr_0.npy'
    # Spacing between consecutive sequence indices on disk.
    _INDEX_STEP = 10

    def __init__(self, input_path, n_files, n_sequences, s3_client, data_partition) -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.data_partition = data_partition

    def _read_path(self):
        """Directory that holds the archives and their extracted arrays."""
        return os.path.join(self.input_path, f'files_used_{self.n_files}')

    def _sequence_indices(self):
        """Sequence indices to load: 0, 10, ... up to and including n_sequences."""
        return range(0, self.n_sequences + 1, self._INDEX_STEP)

    def get_dataset_pointer(self):
        """Return one read-only memory-mapped array per sequence index.

        Raises:
            FileNotFoundError: an archive is missing locally, has not been
                extracted yet, and no ``s3_client`` was supplied to fetch it.
        """
        read_path = self._read_path()
        if not os.path.exists(read_path):
            self.logger.info(f'Creating directory: {read_path}')
            os.makedirs(read_path, exist_ok=True)

        for sequence_index in self._sequence_indices():
            key_zip, key_npy = (
                os.path.join(read_path, f'sequence_index_{sequence_index}.npz'),
                os.path.join(read_path, f'sequence_index_{sequence_index}')
            )

            self.logger.info(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')

            # Check for the extracted file itself: a directory left behind by an
            # interrupted extraction must not count as "already extracted".
            extracted = os.path.exists(os.path.join(key_npy, self._ARRAY_MEMBER))

            if not os.path.exists(key_zip):
                if self.s3_client is not None:
                    self.logger.info(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
                    # The local relative path doubles as the S3 object key.
                    self.s3_client.download_file(
                        'dissertation-data-dmiller',
                        key_zip,
                        key_zip
                    )
                elif not extracted:
                    raise FileNotFoundError(
                        f'{key_zip} is not available locally and no s3_client was provided to download it'
                    )

            if not extracted:
                self.logger.info(f'Zip file downloaded: {key_zip}: npy file to load: {key_npy}')
                self._zip_extract(key_zip, key_npy)

        lz_concatenated_results = self._lazy_concatenate()

        if self.data_partition:
            return [p[:self.data_partition] for p in lz_concatenated_results]
        else:
            return lz_concatenated_results

    def _zip_extract(self, key_zip, key_npy):
        """Extract ``arr_0.npy`` from ``key_zip`` into the directory ``key_npy``."""
        self.logger.info(f'Extracting file: {key_zip} -> {key_npy}')

        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=key_npy, members=[self._ARRAY_MEMBER])

        self.logger.info(f'Zip file exracted: {key_zip} -> {key_npy}/arr_0.npy')

    def _lazy_concatenate(self):
        """Open every extracted array read-only as a memory map, in index order."""
        read_path = self._read_path()
        lz_concat = []
        for sequence_index in self._sequence_indices():
            path_to_load = os.path.join(read_path, f'sequence_index_{sequence_index}', self._ARRAY_MEMBER)
            self.logger.info(f'Loading: {path_to_load}')
            lz_concat.append(np.load(path_to_load, mmap_mode='r'))
        return lz_concat

In [8]:

# S3 prefix of the training run; the checkpoint key hard-coded inside
# generate_static_predictions sits under this prefix.
CHECKPOINT_DIR='s3://dissertation-data-dmiller/lstm_experiments/checkpoints/data_v1/n_files_30/ordinal/sequence_length_10/data_partition_None/2023_03_30_07_54'
# Number of leading columns of each record that _extract_features splits off
# as metadata; everything after it is treated as features.
METADATA_INDEX = 12
# Timestamped INFO-level logging for the whole notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger('likelihood_engagement')
# Print arrays / tensors without scientific notation, three decimals.
np.set_printoptions(suppress=True, precision=3, linewidth=200)
torch.set_printoptions(sci_mode=False, precision=3)
def _extract_features(tensor, n_sequences, n_features):
        
    metadata, features = tensor[:, :, :METADATA_INDEX], tensor[:, :, METADATA_INDEX:] 
    metadata, features = metadata.squeeze(), features.squeeze()
    metadata, features = metadata.squeeze(), features.squeeze()
                
    features = torch.flip(
        torch.reshape(features, (features.shape[0], n_sequences, n_features)),
        dims=[1]
    )
        
    return metadata, features

def _extract_last_sequence(tensor):
    """_summary_
    Extracts the last sequence from a tensor of sequences.
    """
    return tensor[:, -1, :]

@torch.no_grad()
def generate_static_predictions(args):
    """Score every event with the trained LSTM and write the result as parquet.

    Reads ``args.file_path``, ``args.n_files``, ``args.n_sequences``,
    ``args.write_path`` and ``args.model_type``. The output file is
    ``{write_path}/files_used_{n_files}/{model_type}_seq_{n_sequences}.parquet``
    and contains the metadata columns (minus DATE_COLS), the sigmoid of the
    model output, and the features of the last time step.

    Returns:
        None. The frame is written to disk.
    """
    user_metadata_container = []

    logger.info('Generating static prediction likelihoods for experiment')
    # No S3 client / partition: archives are expected to be on disk already.
    npz_extractor = NPZExtractor(
        args.file_path,
        args.n_files,
        args.n_sequences,
        None,
        None
    )

    logger.info('generating dataset pointer')
    dataset = npz_extractor.get_dataset_pointer()

    logger.info('Downloading model checkpoint')

    write_path = os.path.join(args.write_path, f'files_used_{args.n_files}')
    if not os.path.exists(write_path):
        logger.info(f'Creating directory: {write_path}')
        os.makedirs(write_path)

    client = boto3.client('s3')

    # NOTE(review): the key is hard-coded; args.checkpoint_dir is not consulted.
    checkpoint = client.get_object(
        Bucket='dissertation-data-dmiller',
        Key='lstm_experiments/checkpoints/data_v1/n_files_30/ordinal/sequence_length_10/data_partition_None/2023_03_30_07_54/clickstream-epoch=83-loss_valid=0.29.ckpt'
    )

    logger.info('Loading model checkpoint')

    buffer = io.BytesIO(checkpoint['Body'].read())

    logger.info('checkpoint loaded from buffer. Loading model')
    # Use the GPU when present, otherwise fall back to CPU instead of failing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    model_state = torch.load(buffer, map_location=device)
    model = LSTMOrdinal()
    model.load_state_dict(model_state['state_dict'])
    model = model.to(device)
    # Inference mode: without this the LSTM's inter-layer dropout stays active
    # and the predictions are noisy and non-reproducible.
    model.eval()
    logger.info(f'Model loaded. Creating dataset: n_events {dataset[0].shape[0]}')

    dataset = ClickstreamDataset(dataset)
    loader = DataLoader(dataset, batch_size=8192*32, shuffle=False)

    p_bar = tqdm.tqdm(loader, total=len(loader))

    for indx, data in enumerate(p_bar):
        p_bar.set_description(f'Processing batch: {indx}')
        data = data.to(device)
        metadata, features = _extract_features(data, args.n_sequences + 1, N_FEATURES)
        last_sequence = _extract_last_sequence(features)
        preds = torch.sigmoid(model(features))
        user_metadata = torch.cat([metadata, preds, last_sequence], dim=1)
        # Move each finished batch off the accelerator so results do not pile
        # up in device memory.
        user_metadata_container.append(user_metadata.cpu())

    # Stack every batch; previously only the final batch reached the output.
    predicted_data = torch.cat(user_metadata_container, dim=0).numpy()
    predicted_data = pd.DataFrame(predicted_data, columns=OUT_COLUMNS)
    predicted_data = predicted_data.rename(columns={
        'prediction': f'pred_{args.model_type}_{args.n_sequences}'
    })

    predicted_data = predicted_data.drop(columns=DATE_COLS)
    output_path = os.path.join(write_path, f'{args.model_type}_seq_{args.n_sequences}.parquet')
    logger.info(f'Writing rl ready data: {output_path}')

    # Fraction of rows that are non-null in the sparsest column.
    logger.info(f'Percentage data correct: {predicted_data.count().min() / predicted_data.shape[0]}')
    predicted_data.to_parquet(output_path, index=False)

    

In [12]:
class Arguments:
    """Static stand-in for parsed command-line options, passed to generate_static_predictions."""
    n_files = 2  # selects the `files_used_{n_files}` sub-directory for input and output
    n_sequences = 10  # highest sequence index; archives are read at indices 0, 10, ... up to this value
    file_path = 'torch_ready_data'  # local root holding the sequence archives
    checkpoint_dir = CHECKPOINT_DIR  # NOTE(review): not read by generate_static_predictions, which hard-codes its checkpoint key
    write_path = 'rl_ready_data'  # local root for the parquet output
    model_type = 'ordinal'  # appears in the output file name and the prediction column name

In [13]:
# Run the full scoring pass with the settings above; the function writes a
# parquet file and returns nothing, so this cell only shows log output.
generate_static_predictions(Arguments())


2023-04-26 10:49:22,473 INFO Generating static prediction likelihoods for experiment
2023-04-26 10:49:22,474 INFO generating dataset pointer
2023-04-26 10:49:22,474 INFO Loading pointer to dataset: torch_ready_data/files_used_2/sequence_index_0: derived from torch_ready_data/files_used_2/sequence_index_0.npz
2023-04-26 10:49:22,475 INFO Loading pointer to dataset: torch_ready_data/files_used_2/sequence_index_10: derived from torch_ready_data/files_used_2/sequence_index_10.npz
2023-04-26 10:49:22,475 INFO Loading: torch_ready_data/files_used_2/sequence_index_0/arr_0.npy
2023-04-26 10:49:22,478 INFO Loading: torch_ready_data/files_used_2/sequence_index_10/arr_0.npy
2023-04-26 10:49:22,481 INFO Downloading model checkpoint
2023-04-26 10:49:23,268 INFO Loading model checkpoint
2023-04-26 10:49:23,504 INFO checkpoint loaded from buffer. Loading model
2023-04-26 10:49:23,523 INFO Model loaded. Creating dataset: n_events 2566734
Processing batch: 9: 100%|██████████| 10/10 [00:29<00:00,  2.93s