In [9]:
!pip install torch python-dotenv boto3 --quiet awscli

[0m

In [10]:
%load_ext dotenv
%dotenv env

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [11]:
import torch
import os
import argparse
import logging
import boto3
from pprint import pformat
from torch import nn
import io
import tqdm
import numpy as np
import pandas as pd

In [20]:
import os


LABEL = [
    "label"
]

METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "glob_session_time_raw", 
    "year",
    "month",
    "day",
    "hour",
    "minute",
    "second"
]

OUT_FEATURE_COLUMNS = [
    "user_count",
    "project_count",
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time",
    
    "expanding_click_average",
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]

GROUPBY_COLS = ['user_id']

LOAD_COLS = LABEL + METADATA + OUT_FEATURE_COLUMNS

S3_BUCKET = 'dissertation-data-dmiller'
BASE_CHECK_PATH = 'lstm_experiments/checkpoints'


LSTM_CHECKPOINTS_10 = {
    'LSTM SEQ 1': 'lstm-experiments/ordinal-sequence-length-10-window-1/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-10-window-1epoch=39-loss_valid=0.61.ckpt',
    'LSTM SEQ 10': 'lstm-experiments/ordinal-sequence-length-10-window-10/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-10epoch=03-loss_valid=0.60.ckpt',
    'LSTM SEQ 20': 'lstm-experiments/ordinal-sequence-length-20-window-10/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-20-window-10epoch=23-loss_valid=0.61.ckpt',
    'LSTM SEQ 30': 'lstm-experiments/ordinal-sequence-length-30-window-10/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-30-window-10epoch=25-loss_valid=0.61.ckpt',
    'LSTM SEQ 40': 'lstm-experiments/ordinal-sequence-length-40-window-20/lightning_logs/version_1/checkpoints/clickstream-ordinal-sequence_length-40-window-20epoch=75-loss_valid=0.61.ckpt'
}

LSTM_CHECKPOINTS_20 = {
    'LSTM SEQ 1': 'lstm-experiments/ordinal-sequence-length-1-window-20/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-1-window-20epoch=61-loss_valid=0.61.ckpt',
    'LSTM SEQ 10': 'lstm-experiments/ordinal-sequence-length-10-window-20/lightning_logs/version_1/checkpoints/clickstream-ordinal-sequence_length-10-window-20epoch=13-loss_valid=0.62.ckpt',
    'LSTM SEQ 20': 'lstm-experiments/ordinal-sequence-length-20-window-10/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-20-window-10epoch=23-loss_valid=0.61.ckpt',
    'LSTM SEQ 30': 'lstm-experiments/ordinal-sequence-length-30-window-20/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-30-window-20epoch=21-loss_valid=0.61.ckpt',
    'LSTM SEQ 40': 'lstm-experiments/ordinal-sequence-length-30-window-20/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-30-window-20epoch=21-loss_valid=0.61.ckpt'
}  
 
LSTM_CHECKPOINTS_30 = {
    'LSTM SEQ 1': 'lstm-experiments/ordinal-sequence-length-10/lightning_logs/version_3/checkpoints/clickstream-ordinal-sequence_length-1epoch=33-loss_valid=0.59.ckpt',
    'LSTM SEQ 10': 'lstm-experiments/ordinal-sequence-length-10/lightning_logs/version_1/checkpoints/clickstream-ordinal-sequence_length-10epoch=27-loss_valid=0.59.ckpt',
    'LSTM SEQ 20': 'lstm-experiments/ordinal-sequence-length-20/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-20epoch=23-loss_valid=0.59.ckpt',
    'LSTM SEQ 30': 'lstm-experiments/ordinal-sequence-length-30/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-30epoch=61-loss_valid=0.59.ckpt',
    'LSTM SEQ 30 H': 'lstm-experiments/heuristic-ordinal-sequence-length-30/lightning_logs/version_8/checkpoints/clickstream-heuristic-ordinal-sequence_length-30epoch=67-loss_valid=0.59.ckpt',
    'LSTM SEQ 40': 'lstm-experiments/ordinal-sequence-length-40/lightning_logs/version_0/checkpoints/clickstream-ordinal-sequence_length-40epoch=75-loss_valid=0.59.ckpt'
}

In [21]:
# %load data_module
import pdb

import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset

LABEL_INDEX = 1
TOTAL_EVENTS_INDEX = 2
BATCHES = 1000000

    
class ClickstreamDataset(Dataset):
    def __init__(self, dataset_pointer_list) -> None:
        """
        Yield data in batches of BATCHES
        """
        self.events = dataset_pointer_list
        self.size = self.events[0].shape[0]


    def __getitem__(self, idx):
        events = [np.array([event[idx]]) for event in self.events]
        return np.concatenate(events, axis=1)
       

    def __len__(self):
        return self.size


In [14]:
# %load torch_model_bases
import torch 
from torch import nn
N_FEATURES = 22
class LSTMOrdinal(nn.Module):
    def __init__(self,  hidden_size=32, dropout=0.2) -> None:
        super(LSTMOrdinal, self).__init__()
        
        self.lstm = nn.LSTM(
            input_size=N_FEATURES,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )
        self.output = nn.Linear(
            hidden_size,
            1
        )

    def forward(self, x):


        x, _ = self.lstm(x)
        x = x[:, -1]
        return self.output(x)

In [15]:
import logging
import os
import zipfile

import boto3
import numpy as np
import torch
import logging

class NPZExtractor:
    logger = logging.getLogger(__name__)
    def __init__(self, input_path, n_files, n_sequences, s3_client, pred_window) -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.pred_window = pred_window
        self.read_path = os.path.join(
            self.input_path,
            f'files_used_{self.n_files}_window_{self.pred_window}'
        )


    def get_dataset_pointer(self):

        if not os.path.exists(self.read_path):
            self.logger.info(f'Creating directory: {self.read_path}')
            os.makedirs(self.read_path)


        for _ in range(0, self.n_sequences +1, 10):
            key_zip, key_npy = (
                os.path.join(self.read_path, f'sequence_index_{_}.npz'),
                os.path.join(self.read_path, f'sequence_index_{_}')
            )
            
            self.logger.info(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')


            if not os.path.exists(key_zip):
                self.logger.info(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
                self.s3_client.download_file(
                    'dissertation-data-dmiller',
                    key_zip,
                    key_zip
                )
            if not os.path.exists(key_npy):
                self.logger.info(f'Zip file downloaded: {key_zip}: npy file to load: {key_npy}')

                self._zip_extract(key_zip, key_npy)

        lz_concatenated_results = self._lazy_concatenate()
        return lz_concatenated_results

    def _zip_extract(self, key_zip, key_npy):
        self.logger.info(f'Extracting file: {key_zip} -> {key_npy}')

        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=key_npy, members=['arr_0.npy'])

        self.logger.info(f'Zip file exracted: {key_zip} -> {key_npy}/arr_0.npy')

    def _lazy_concatenate(self):
        lz_concat = []
        for _ in range(0, self.n_sequences +1, 10):
            path_to_load = os.path.join(self.read_path, f'sequence_index_{_}', f'arr_0.npy')
            self.logger.info(f'Loading: {path_to_load}')
            lz_concat.append(np.load(path_to_load, mmap_mode='r'))
        return lz_concat

In [16]:
# %load likelihood_engagement_cpu
import torch
import os
import argparse
from torch.utils.data import DataLoader, Dataset
import logging
import boto3
from pprint import pformat
from torch import nn
import io
import tqdm
import numpy as np
import pandas as pd



pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)    
pd.set_option('mode.use_inf_as_na', True)
torch.set_printoptions(sci_mode=False, linewidth=400, precision=2)
np.set_printoptions(suppress=True, precision=4, linewidth=200)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

METADATA_INDEX = 13

logger = logging.getLogger('likelihood_engagement')

def parse_args():
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--n_files', type=int, default=2)
    parser.add_argument('--n_sequences', type=int, default=20)
    parser.add_argument('--file_path', type=str, default='datasets/torch_ready_data')
    parser.add_argument('--write_path', type=str, default='datasets/rl_ready_data')
    parser.add_argument('--model_type', type=str, default='ordinal')
    args = parser.parse_args()
    return args

def _extract_features(tensor, n_sequences, n_features):
    
    features_dict = {}
    tensor = tensor.squeeze()
    metadata = tensor[:, :METADATA_INDEX]
    features = tensor[:, METADATA_INDEX:]
                
    features = torch.flip(
        torch.reshape(features, (features.shape[0], 41, 22)),
        dims=[1]
    )
    
    features_dict['LSTM SEQ 1'] =  features[:, -1, :].unsqueeze(1).clone()
    features_dict['LSTM SEQ 10'] = features[:, 30:, :].clone()
    features_dict['LSTM SEQ 20'] = features[:, 20:, :].clone()
    features_dict['LSTM SEQ 30'] = features[:, 10:, :].clone()
    features_dict['LSTM SEQ 30 H']=features[:, 10:, :].clone()
    features_dict['LSTM SEQ 40'] = features.clone()
    return metadata, features_dict



def get_models(checkpoints: dict, s3_client, device):
    """_summary_
    Downloads models from s3 and loads them into memory.
    """
    models = {}
    for name, checkpoint in checkpoints.items():
        logger.info(f'Downloading model: {name}')
        response = s3_client.get_object(
            Bucket=S3_BUCKET,
            Key=checkpoint
        )
        buffer = io.BytesIO(response['Body'].read())
        state = torch.load(buffer, map_location=torch.device(device))
        model = LSTMOrdinal()
        model.load_state_dict(state['state_dict'])
        model.to(device)
        models[name] = model
    return models

@torch.no_grad()
def generate_static_predictions(args):
    
    user_metadata_container = []
    client = boto3.client('s3')
    
    logger.info('Generating static prediction likelihoods for experiment')
        
    write_path = os.path.join(args.write_path, f'files_used_{args.n_files}_window_{args.data_window}')
    file_path = os.path.join(args.file_path, f'files_used_{args.n_files}_window_{args.data_window}')
    logger.info(
        f'Writing to: {write_path}: reading from {file_path}'
    )
    npz_extractor = NPZExtractor(
        args.file_path,
        args.n_files,
        args.n_sequences,
        client,
        10
           
    )
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'   
    logger.info(f'Setting device to {device}')
    
    logger.info('generating dataset pointer')
    dataset = npz_extractor.get_dataset_pointer()
    
    logger.info('Downloading model checkpoint')
    
    write_path = os.path.join(args.write_path, f'files_used_{args.n_files}_window_{args.data_window}')
    if not os.path.exists(write_path):
        logger.info(f'Creating directory: {write_path}')
        os.makedirs(write_path)
    
    client = boto3.client('s3')
    
    if args.data_window == 10:
        lstm_checkpoints = LSTM_CHECKPOINTS_10.copy()
    elif args.data_window == 20:
        lstm_checkpoints = LSTM_CHECKPOINTS_20.copy()
    else:
        lstm_checkpoints = LSTM_CHECKPOINTS_30.copy()
    
    logger.info(f'Downloading models from checkpoints {lstm_checkpoints.keys()}: window: {args.data_window}')
    
    models = get_models(lstm_checkpoints, client, device)
    
    dataset = ClickstreamDataset(dataset)
    loader = DataLoader(dataset, batch_size=2048*4, shuffle=False, num_workers=8, pin_memory=True)
    activation = nn.Sigmoid()
    

    p_bar = tqdm.tqdm(loader, total=len(loader))
    
    for indx, data in enumerate(p_bar):
        p_bar.set_description(f'Processing batch: {indx}')
        data = data.to(device)
        
        metadata, features_dict = _extract_features(data, args.n_sequences + 1, 22)
        preds_1 = activation(models['LSTM SEQ 1'](features_dict['LSTM SEQ 1']))
        preds_10 = activation(models['LSTM SEQ 10'](features_dict['LSTM SEQ 10']))
        preds_20 = activation(models['LSTM SEQ 20'](features_dict['LSTM SEQ 20']))
        preds_30 = activation(models['LSTM SEQ 30'](features_dict['LSTM SEQ 30']))
        # preds_30_h = activation(models['LSTM SEQ 30 H'](features_dict['LSTM SEQ 30 H']))
        preds_40 = activation(models['LSTM SEQ 40'](features_dict['LSTM SEQ 40']))
       
        user_metadata = torch.cat((
                metadata,
                features_dict['LSTM SEQ 1'].squeeze(),
                preds_1,
                preds_10,
                preds_20,
                preds_30,
                # preds_30_h,
                preds_40),
            1
        )
        
        user_metadata_container.append(user_metadata)

   
    predicted_data = torch.cat(user_metadata_container, dim=0)
    logger.info(f'Predicted data shape: {predicted_data.shape}: generating df')
    predicted_data = pd.DataFrame(predicted_data.cpu().numpy(), columns=LABEL + METADATA + OUT_FEATURE_COLUMNS + list(models.keys()))
    logger.info(f'Decoding date time data: merging to date time: {predicted_data.shape}')
   
    predicted_data = predicted_data.sort_values(by=['year', 'month', 'day', 'hour', 'minute', 'second'])
    write_path = os.path.join(write_path, f'data_for_auc_window_{args.data_window}')
    logger.info(f'Writing to parquet: {write_path}')
    cols = ['label', 'user_id', 'session_30_raw', 'cum_session_time_raw', 'glob_session_time_raw'] + [col for col in predicted_data.columns if 'LSTM' in col]
    predicted_data = predicted_data[cols]
    logger.info(f'Writing to parquet: {write_path}')
    predicted_data.to_parquet(write_path, index=False)
    logger.info(f'Generating evaluation dataset')
    predicted_data_eval = predicted_data[int(len(predicted_data) * 0.7):]
    predicted_data_path = f'{write_path}_eval'
    logger.info(f'Writing predicted data to: {predicted_data_path}: shape: {predicted_data_eval.shape}')
    predicted_data_eval.to_parquet(predicted_data_path, index=False)
    # return predicted_data
    

In [22]:
class Arguments:
    n_files = 30
    n_sequences = 40
    file_path = 'torch_ready_data'
    write_path = 'rl_ready_data'
    data_window = 20

In [23]:
generate_static_predictions(Arguments)

2023-07-12 05:23:56,913 Generating static prediction likelihoods for experiment
2023-07-12 05:23:56,914 Writing to: rl_ready_data/files_used_30_window_20: reading from torch_ready_data/files_used_30_window_20
2023-07-12 05:23:56,914 Setting device to cuda
2023-07-12 05:23:56,915 generating dataset pointer
2023-07-12 05:23:56,915 Loading pointer to dataset: torch_ready_data/files_used_30_window_10/sequence_index_0: derived from torch_ready_data/files_used_30_window_10/sequence_index_0.npz
2023-07-12 05:23:56,916 Loading pointer to dataset: torch_ready_data/files_used_30_window_10/sequence_index_10: derived from torch_ready_data/files_used_30_window_10/sequence_index_10.npz
2023-07-12 05:23:56,917 Loading pointer to dataset: torch_ready_data/files_used_30_window_10/sequence_index_20: derived from torch_ready_data/files_used_30_window_10/sequence_index_20.npz
2023-07-12 05:23:56,917 Loading pointer to dataset: torch_ready_data/files_used_30_window_10/sequence_index_30: derived from torch_

In [24]:
!aws s3 sync rl_ready_data/ s3://dissertation-data-dmiller/rl_ready_data/

upload: rl_ready_data/files_used_30_window_20/data_for_auc_window_20_eval to s3://dissertation-data-dmiller/rl_ready_data/files_used_30_window_20/data_for_auc_window_20_eval
upload: rl_ready_data/files_used_30_window_20/data_for_auc_window_20 to s3://dissertation-data-dmiller/rl_ready_data/files_used_30_window_20/data_for_auc_window_20
