In [1]:
!python -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 --quiet
!python -m pip install boto3 python-dotenv gputil --quiet



[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.4.0 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.29.111 which is incompatible.[0m[31m
[0m

In [2]:
%load_ext dotenv
%dotenv env

In [3]:

INITIAL_LOAD_COLUMNS = [
    'user_id',
    'project_id',
    'date_time',
    'session_5_count',
    'session_30_count',
    'session_terminates_30_minutes',
    'country',
]

LABEL = [
    "session_terminates_30_minutes"
]


METADATA = [
    "user_id",
    "session_30_raw",
    "cum_session_event_raw",
    "cum_session_time_raw",
    "cum_platform_event_raw",
    "global_events_user",
    "global_session_time_minutes",
]

DATE_TIME = [
    "date_time",
    "year",
    "month",
    "day",
    "hour",
    "minute"
]

OUT_FEATURE_COLUMNS = [
    "country_count",
    "timestamp_raw",
    "date_hour_sin",
    
    "date_hour_cos",
    "session_5_count",
    "session_30_count",
    
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time_minutes",
    
    "expanding_click_average",
    "cum_platform_time_minutes",
    "cum_platform_events",
    
    "cum_projects",
    "average_event_time",
    "rolling_session_time",
    
    "rolling_session_events",
    "rolling_session_gap",
    "session_event_count",
]



GROUPBY_COLS = ['user_id']

TORCH_LOAD_COLS = LABEL + METADATA + DATE_TIME + OUT_FEATURE_COLUMNS

In [4]:
import cudf as gpu_pd
from cuml.preprocessing import MinMaxScaler
import torch
import numpy as cpu_np
import cupy as gpu_np
import pandas as cpu_pd
from pprint import pformat
import logging
import os
import glob
import boto3
import os
import numpy 
import logging
import GPUtil

In [5]:
class SessionizeData:
    logger = logging.getLogger(__name__)
    def __init__(self, df, max_sequence_index, write_path, load_cols, feature_cols, grouper, save_s3=True):
        self.df = df
        self.max_sequence_index = max_sequence_index + 1
        self.min_sequence_index = self.max_sequence_index - 10
        self.device = self._device()
        self.sequences = cpu_np.arange(self.min_sequence_index, self.max_sequence_index).tolist()
        self.seq_container = []
        self.torch_sequences = None
        self.output_path = write_path
        self.save_s3 = save_s3
        self.load_columns = load_cols
        self.feature_cols = feature_cols
        self.grouper = grouper
        

    def _device(self):
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _sequence_lazy(self):
         return next(self._lazy_load_shifted_index())

    def _shifters(self):
        for _ in range(self.min_sequence_index, self.max_sequence_index):
            self.logger.info(f'Loading sequence: {_} -> {self.max_sequence_index}')
            self.seq_container.append(self._sequence_lazy())
        if torch.cuda.is_available():
            GPUtil.showUtilization()

        sequences = torch.cat(self.seq_container, dim=1).half()
        return sequences

    def generate_sequence(self):

        self.logger.info(f'Generating shifted clickstreams from {self.min_sequence_index} -> {self.max_sequence_index}')
        sequence = self._shifters()

        self.logger.info(f'Shifter shape: {sequence.shape}')
        

        self.logger.info(f'Loading intial clickstream to {self.device}')

        if self.max_sequence_index == 11:
            self.logger.info('Initial clickstream writing to disk')
            initial_clickstream = self.df[self.load_columns].values.astype(gpu_np.float32)
            self._sequence_to_disk(initial_clickstream, 0)

        self.logger.info(f'Writing sequence to disk: {self.max_sequence_index - 1}')
        self._sequence_to_disk(sequence.cpu().numpy(), self.max_sequence_index - 1)


    def _sequence_to_disk(self, partition, sequence_index):
        if self.save_s3:
            s3_client = boto3.client(
                's3',
                aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            )

        if not os.path.exists(self.output_path):
            os.makedirs(self.output_path)

        partition_path = os.path.join(self.output_path, f'sequence_index_{sequence_index}.npz')
        self.logger.info(f'Saving to disk: {partition_path}')
        numpy.savez_compressed(partition_path, partition)

        if self.save_s3:
            self.logger.info(f'Uploading to s3: dissertation-data-dmiller/{partition_path}')
            s3_client.upload_file(partition_path, 'dissertation-data-dmiller', partition_path)

    def _lazy_load_shifted_index(self):

        torch.cuda.empty_cache()
        indx = self.sequences.pop(0)
        torch_container = []
        for col in self.feature_cols:
            sequence = self.df.groupby(self.grouper)[col].shift(indx).fillna(0).values.astype(gpu_np.float16)
            sequence_tensor = torch.tensor(sequence).to(self.device).half()
            torch_container.append(sequence_tensor.unsqueeze(1))
            torch.cuda.empty_cache()

        yield torch.cat(torch_container, dim=1).half()

In [6]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

torch.set_printoptions(sci_mode=False, precision=4, linewidth=400)

cpu_np.set_printoptions(suppress=True)
cpu_np.set_printoptions(precision=4)

def get_logger():
    logger = logging.getLogger(__name__)
    return logger


def scale_feature_cols(df, scaler, scaler_columns):
    df[scaler_columns] = scaler.fit_transform(df[scaler_columns])
    return df

def main(args):
    
    logger = get_logger()
    logger.info('Starting sessionize_users_cpu.py with arguments')
    logger.info(pformat(args.__dict__))
    
    data_read = os.path.join(args.input_path, f'files_used_{args.data_subset}')

    logger.info(f'Reading data from {data_read}')
    df = gpu_pd.read_parquet(data_read, columns=TORCH_LOAD_COLS)
    logger.info(f'Data read: {df.shape}')
    logger.info('Casting date time and sorting by date time')
    df['date_time'] = gpu_pd.to_datetime(df['date_time'])
    df = df.sort_values(by=['date_time'])
    logger.info('Data read: scaling scaler columns')
    df = scale_feature_cols(df, MinMaxScaler(), OUT_FEATURE_COLUMNS)
    logger.info('Scaling complete: implement sessionize')
    
    for seq_index in args.seq_list:

        sessionize = SessionizeData(
            df,
            seq_index,
            os.path.join(args.output_path, f'files_used_{args.data_subset}'),
            [col for col in TORCH_LOAD_COLS if col != 'date_time'],
            OUT_FEATURE_COLUMNS,
            GROUPBY_COLS,
            args.save_s3
        )
    
        logger.info(f'Generating sequence for {seq_index}')
        sessionize.generate_sequence()
    
    logger.info(f'Sessionize complete for sequences {args.seq_list}')
   




In [7]:
class Arguments:
    seq_list = [10]
    input_path = 'calculated_features'
    output_path = 'torch_ready_data'
    data_subset = 10
    save_s3 = True

for data_subset in [2, 10, 30]:
    args = Arguments()
    args.data_subset = data_subset
    main(args)

2023-04-12 15:09:12,247 - __main__ - INFO - Starting sessionize_users_cpu.py with arguments
2023-04-12 15:09:12,249 - __main__ - INFO - {'data_subset': 2}
2023-04-12 15:09:12,249 - __main__ - INFO - Reading data from calculated_features/files_used_2
2023-04-12 15:09:18,374 - __main__ - INFO - Data read: (2566734, 32)
2023-04-12 15:09:18,375 - __main__ - INFO - Casting date time and sorting by date time
2023-04-12 15:09:18,389 - __main__ - INFO - Data read: scaling scaler columns
2023-04-12 15:09:22,002 - __main__ - INFO - Scaling complete: implement sessionize
2023-04-12 15:09:22,004 - __main__ - INFO - Generating sequence for 10
2023-04-12 15:09:22,005 - __main__ - INFO - Generating shifted clickstreams from 1 -> 11
2023-04-12 15:09:22,005 - __main__ - INFO - Loading sequence: 1 -> 11
2023-04-12 15:09:26,587 - __main__ - INFO - Loading sequence: 2 -> 11
2023-04-12 15:09:26,870 - __main__ - INFO - Loading sequence: 3 -> 11
2023-04-12 15:09:27,145 - __main__ - INFO - Loading sequence: 4

| ID | GPU | MEM |
------------------
|  0 | 77% |  8% |


2023-04-12 15:09:29,834 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_2/sequence_index_0.npz
2023-04-12 15:09:39,337 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_2/sequence_index_0.npz
2023-04-12 15:09:45,156 - __main__ - INFO - Writing sequence to disk: 10
2023-04-12 15:09:45,761 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_2/sequence_index_10.npz
2023-04-12 15:09:57,763 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_2/sequence_index_10.npz
2023-04-12 15:10:05,770 - __main__ - INFO - Sessionize complete for sequences [10]
2023-04-12 15:10:05,773 - __main__ - INFO - Starting sessionize_users_cpu.py with arguments
2023-04-12 15:10:05,773 - __main__ - INFO - {'data_subset': 10}
2023-04-12 15:10:05,774 - __main__ - INFO - Reading data from calculated_features/files_used_10
2023-04-12 15:10:08,161 - __main__ - INFO - Data read: (12833662, 32)
2023-04-12 15:10:08,16

| ID | GPU | MEM |
------------------
|  0 | 73% | 14% |


2023-04-12 15:10:20,510 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_10/sequence_index_0.npz
2023-04-12 15:11:05,438 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_10/sequence_index_0.npz
2023-04-12 15:11:15,424 - __main__ - INFO - Writing sequence to disk: 10
2023-04-12 15:11:18,509 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_10/sequence_index_10.npz
2023-04-12 15:12:14,726 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_10/sequence_index_10.npz
2023-04-12 15:12:24,017 - __main__ - INFO - Sessionize complete for sequences [10]
2023-04-12 15:12:24,021 - __main__ - INFO - Starting sessionize_users_cpu.py with arguments
2023-04-12 15:12:24,022 - __main__ - INFO - {'data_subset': 30}
2023-04-12 15:12:24,023 - __main__ - INFO - Reading data from calculated_features/files_used_30
2023-04-12 15:12:30,643 - __main__ - INFO - Data read: (38500990, 32)
2023-04-12 15:12:3

| ID | GPU  | MEM |
-------------------
|  0 | 100% | 32% |


2023-04-12 15:13:06,852 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30/sequence_index_0.npz
2023-04-12 15:15:16,080 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30/sequence_index_0.npz
2023-04-12 15:15:45,899 - __main__ - INFO - Writing sequence to disk: 10
2023-04-12 15:15:54,906 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30/sequence_index_10.npz
2023-04-12 15:18:40,342 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30/sequence_index_10.npz
2023-04-12 15:19:10,294 - __main__ - INFO - Sessionize complete for sequences [10]


In [8]:
!rm -r torch_ready_data_main