In [1]:

!python -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 --quiet
!python -m pip install boto3 python-dotenv GPUtil --quiet



[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.4.0 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.31.1 which is incompatible.[0m[31m
[0m

In [2]:
# Register python-dotenv's IPython extension so the `%dotenv` magic is available.
%load_ext dotenv
# Load variables from the file named `env` (relative to the working directory) into
# the process environment. Presumably this is where AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY, read via os.environ in SessionizeData._sequence_to_disk,
# come from — confirm the file contents.
%dotenv env

In [3]:
# Target column.
LABEL = ["label"]

# Columns that are loaded alongside the features but are not part of
# OUT_FEATURE_COLUMNS (so they are neither scaled nor shifted).
METADATA = [
    "user_id", "session_count_raw", "cum_platform_event_raw",
    "cum_platform_time_raw", "cum_session_time_raw", "global_session_time",
    "year", "month", "day",
    "hour", "minute", "second",
]

# Feature columns that get shifted per group. Every entry whose name does not
# contain "sin" or "cos" is also min-max scaled in main().
OUT_FEATURE_COLUMNS = [
    "user_count", "project_count", "country_count",
    "date_hour_sin", "date_hour_cos", "date_minute_sin", "date_minute_cos",
    "session_count", "session_5_count", "cum_session_event_count",
    "delta_last_event", "cum_session_time",
    "expanding_click_average", "cum_platform_time", "cum_platform_events",
    "cum_projects", "average_event_time",
    "rolling_session_time", "rolling_session_events", "rolling_session_gap",
    "previous_session_time", "previous_session_events",
]

# Key(s) the per-group shift is computed over.
GROUPBY_COLS = ['user_id']

# Full column selection, in the order: label, metadata, features.
LOAD_COLS = [*LABEL, *METADATA, *OUT_FEATURE_COLUMNS]



In [4]:
import cudf as gpu_pd
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as cpu_np
import cupy as gpu_np
import pandas as cpu_pd
from pprint import pformat
import logging
import os
import glob
import boto3
import os
import numpy 
import logging
import GPUtil

In [5]:
class SessionizeData:
    """Build per-group lagged copies of the feature columns and save them as ``.npz`` files.

    One instance handles one chunk of consecutive lags,
    ``min_sequence_index .. max_sequence_index - 1``. For each lag, every column in
    ``feature_cols`` is shifted within its ``grouper`` group, missing values are
    replaced by 0, and the result is cast to float16. All lags of the chunk are
    concatenated column-wise (lag-major: all features of the first lag, then all
    features of the next lag, ...) and written to
    ``<write_path>/sequence_index_<last lag>.npz``.

    When the chunk starts at lag 1, the unshifted ``load_cols`` selection is also
    written, as float32, to ``sequence_index_0.npz``.

    Dtypes are taken from NumPy (``cpu_np``); CuPy accepts NumPy dtypes in
    ``astype``, so the class works with a cuDF frame and also with a plain pandas
    frame when no GPU stack is installed.

    Args:
        df: DataFrame holding ``load_cols`` and the ``grouper`` column(s). Rows are
            shifted in their current order, so the caller is responsible for sorting.
        max_sequence_index: Largest lag of this chunk (inclusive).
        write_path: Output directory; created if it does not exist.
        load_cols: Columns written for the unshifted frame (index 0).
        feature_cols: Columns that are shifted.
        grouper: Column name(s) passed to ``df.groupby``.
        save_s3: If true, every written file is also uploaded to ``S3_BUCKET``
            using the credentials in ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY``.
        sequence_span: Number of consecutive lags in the chunk (default 10).
            NOTE(review): if ``max_sequence_index < sequence_span`` the chunk
            includes zero or negative shifts, i.e. current or future rows.

    Raises:
        ValueError: If ``sequence_span`` is smaller than 1.
    """

    logger = logging.getLogger(__name__)

    # Bucket that receives the uploads when ``save_s3`` is enabled.
    S3_BUCKET = 'dissertation-data-dmiller'

    def __init__(self, df, max_sequence_index, write_path, load_cols, feature_cols, grouper, save_s3=True,
                 sequence_span=10):
        if sequence_span < 1:
            raise ValueError(f'sequence_span must be >= 1, got {sequence_span}')

        self.df = df
        self.sequence_span = sequence_span
        # Stored as an exclusive upper bound so range(min, max) covers the chunk.
        self.max_sequence_index = max_sequence_index + 1
        self.min_sequence_index = self.max_sequence_index - sequence_span
        self.device = self._device()
        # Lags still to be produced; consumed front-to-back by _lazy_load_shifted_index.
        self.sequences = list(range(self.min_sequence_index, self.max_sequence_index))
        self.seq_container = []
        self.torch_sequences = None
        self.output_path = write_path
        self.save_s3 = save_s3
        self.load_columns = load_cols
        self.feature_cols = feature_cols
        self.grouper = grouper

    def _device(self):
        """Return the CUDA device when one is available, otherwise the CPU device."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _sequence_lazy(self):
        """Produce the tensor for the next pending lag."""
        return next(self._lazy_load_shifted_index())

    def _shifters(self):
        """Produce every lag of the chunk and return them concatenated along dim 1 as float16."""
        for shift in range(self.min_sequence_index, self.max_sequence_index):
            self.logger.info(f'Loading sequence: {shift} -> {self.max_sequence_index}')
            self.seq_container.append(self._sequence_lazy())
        if torch.cuda.is_available():
            GPUtil.showUtilization()

        return torch.cat(self.seq_container, dim=1).half()

    def generate_sequence(self):
        """Compute the chunk of shifted features and write it (and, for the first chunk, the unshifted frame)."""
        self.logger.info(f'Generating shifted clickstreams from {self.min_sequence_index} -> {self.max_sequence_index}')
        sequence = self._shifters()

        self.logger.info(f'Shifter shape: {sequence.shape}')
        self.logger.info(f'Loading intial clickstream to {self.device}')

        # The chunk that starts at lag 1 is the first one, so it also persists the
        # unshifted frame (with the default span of 10 this is max_sequence_index == 11).
        if self.min_sequence_index == 1:
            self.logger.info('Initial clickstream writing to disk')
            initial_clickstream = self.df[self.load_columns].values.astype(cpu_np.float32)

            self.logger.info(f'Initial clickstream shape: {initial_clickstream.shape}')
            self._sequence_to_disk(initial_clickstream, 0)

        self.logger.info(f'Writing sequence to disk: {self.max_sequence_index - 1}')
        self._sequence_to_disk(sequence.cpu().numpy(), self.max_sequence_index - 1)

    def _sequence_to_disk(self, partition, sequence_index):
        """Write ``partition`` to ``sequence_index_<sequence_index>.npz`` and optionally upload it.

        The S3 client is created before the file is written so that missing
        credentials (``KeyError``) surface before the compression work starts.
        """
        if self.save_s3:
            s3_client = boto3.client(
                's3',
                aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            )

        # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
        os.makedirs(self.output_path, exist_ok=True)

        partition_path = os.path.join(self.output_path, f'sequence_index_{sequence_index}.npz')
        self.logger.info(f'Saving to disk: {partition_path}')
        numpy.savez_compressed(partition_path, partition)

        if self.save_s3:
            self.logger.info(f'Uploading to s3: {self.S3_BUCKET}/{partition_path}')
            # The local relative path doubles as the object key.
            s3_client.upload_file(partition_path, self.S3_BUCKET, partition_path)

    def _lazy_load_shifted_index(self):
        """Generator yielding one tensor: all feature columns shifted by the next pending lag.

        Pops the lag from the front of ``self.sequences``; raises ``IndexError`` once
        every lag of the chunk has been consumed.
        """
        torch.cuda.empty_cache()
        indx = self.sequences.pop(0)
        torch_container = []
        for col in self.feature_cols:
            shifted = self.df.groupby(self.grouper)[col].shift(indx).fillna(0).values.astype(cpu_np.float16)
            column_tensor = torch.tensor(shifted).to(self.device).half()
            torch_container.append(column_tensor.unsqueeze(1))
            torch.cuda.empty_cache()

        yield torch.cat(torch_container, dim=1).half()

In [9]:
# Timestamped INFO-level logging for every logger in the notebook.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Fixed-point, 4-decimal printing for both torch tensors and NumPy arrays.
torch.set_printoptions(precision=4, sci_mode=False, linewidth=400)
cpu_np.set_printoptions(precision=4, suppress=True)

def get_logger():
    """Return the logger registered under this module's ``__name__``."""
    return logging.getLogger(__name__)


def scale_feature_cols(df, scaler, scaler_columns):
    """Fit ``scaler`` on ``df[scaler_columns]`` and overwrite those columns with the scaled values.

    ``df`` is modified in place and the same object is returned. Because this
    calls ``fit_transform``, the scaler is re-fitted on every call.
    """
    scaled_values = scaler.fit_transform(df[scaler_columns])
    df[scaler_columns] = scaled_values
    return df

def main(args):
    """Read the calculated features, scale them, and write every shifted chunk in ``args.seq_list``.

    Steps:
      1. Read ``LOAD_COLS`` plus ``date_time`` from
         ``<input_path>/files_used_<data_subset>/calculated_features_window_<window>.parquet``.
      2. Sort by ``date_time`` and split the rows 70/30 into train and validation.
      3. Fit a ``MinMaxScaler`` with range (-1, 1) on the train rows only and apply
         it to both partitions. Feature columns whose name contains "sin" or "cos"
         are left unscaled.
      4. Re-assemble the frame and run ``SessionizeData.generate_sequence`` once per
         entry of ``args.seq_list``.

    Args:
        args: Object exposing ``input_path``, ``output_path``, ``data_subset``,
            ``window``, ``seq_list`` and ``save_s3``.

    Returns:
        None. All results are written to disk (and to S3 when ``args.save_s3`` is set).
    """
    logger = get_logger()
    logger.info('Starting sessionize_users_cpu.py with arguments')
    logger.info(pformat(args.__dict__))

    data_read = os.path.join(
        args.input_path,
        f'files_used_{args.data_subset}',
        f'calculated_features_window_{args.window}.parquet',
    )

    logger.info(f'Reading data from {data_read}')
    scaler_cols = [
        col for col in OUT_FEATURE_COLUMNS if 'sin' not in col and 'cos' not in col
    ]

    df = gpu_pd.read_parquet(data_read, columns=LOAD_COLS + ['date_time'])
    logger.info(f'Data read: {df.shape}')
    logger.info('Casting date time and sorting by date time')
    df['date_time'] = gpu_pd.to_datetime(df['date_time'])
    df = df.sort_values(by=['date_time'])
    logger.info('Data read: scaling scaler columns')
    train_partition = int(df.shape[0] * 0.7)

    # scikit-learn works on host memory, so both partitions are copied to pandas.
    train_df = df.iloc[:train_partition].to_pandas()
    val_df = df.iloc[train_partition:].to_pandas()
    # The unscaled GPU frame is not needed again; drop it so its memory is free
    # while the scaled frame is rebuilt.
    del df

    scaler = MinMaxScaler(feature_range=(-1, 1))
    logger.info('Fitting on train dataset')
    train_df[scaler_cols] = scaler.fit_transform(train_df[scaler_cols])

    # Validation rows reuse the train statistics, so they may fall outside (-1, 1).
    logger.info('Applying scalar to validation dataset')
    val_df[scaler_cols] = scaler.transform(val_df[scaler_cols])
    train_df, val_df = gpu_pd.from_pandas(train_df), gpu_pd.from_pandas(val_df)
    df = gpu_pd.concat([train_df, val_df]).sort_values(by=['date_time'])
    # Release the per-partition copies before the memory-heavy shifting loop.
    del train_df, val_df
    logger.info('DF concatenated')
    logger.info(f'Shape after features: {df.shape}')

    logger.info('Scaling complete: implement sessionize')

    output_dir = os.path.join(args.output_path, f'files_used_{args.data_subset}_window_{args.window}')
    for seq_index in args.seq_list:

        sessionize = SessionizeData(
            df,
            seq_index,
            output_dir,
            LOAD_COLS,
            OUT_FEATURE_COLUMNS,
            GROUPBY_COLS,
            args.save_s3
        )

        logger.info(f'Generating sequence for {seq_index}')
        sessionize.generate_sequence()

    logger.info(f'Sessionize complete for sequences {args.seq_list}')
   




In [10]:
class Arguments:
    """Plain settings holder passed to ``main``; only ``window`` varies between runs."""

    def __init__(self, window) -> None:
        # Per-run setting: selects calculated_features_window_<window>.parquet.
        self.window = window

        # Fixed settings for this notebook.
        self.input_path, self.output_path = 'calculated_features', 'torch_ready_data'
        self.data_subset = 30
        self.seq_list = [10, 20, 30, 40]
        self.save_s3 = True

# Run the whole pipeline once per feature window; each window reads its own
# calculated_features_window_<window>.parquet. main() returns nothing (it only
# writes files), so its result is deliberately not bound to a name.
for window in [10, 20]:
    args = Arguments(window)
    main(args)

2023-07-10 10:46:11,611 - __main__ - INFO - Starting sessionize_users_cpu.py with arguments
2023-07-10 10:46:11,612 - __main__ - INFO - {'data_subset': 30,
 'input_path': 'calculated_features',
 'output_path': 'torch_ready_data',
 'save_s3': True,
 'seq_list': [10, 20, 30, 40],
 'window': 10}
2023-07-10 10:46:11,613 - __main__ - INFO - Reading data from calculated_features/files_used_30/calculated_features_window_10.parquet
2023-07-10 10:46:18,735 - __main__ - INFO - Data read: (38500990, 36)
2023-07-10 10:46:18,736 - __main__ - INFO - Casting date time and sorting by date time
2023-07-10 10:46:18,806 - __main__ - INFO - Data read: scaling scaler columns
2023-07-10 10:46:29,673 - __main__ - INFO - Fitting on train dataset
2023-07-10 10:46:45,662 - __main__ - INFO - Applying scalar to validation dataset
2023-07-10 10:46:55,152 - __main__ - INFO - DF concatenated
2023-07-10 10:46:55,153 - __main__ - INFO - Shape after features: (38500990, 36)
2023-07-10 10:46:55,153 - __main__ - INFO - S

| ID | GPU | MEM |
------------------
|  0 | 96% | 55% |


2023-07-10 10:47:35,741 - __main__ - INFO - Initial clickstream shape: (38500990, 35)
2023-07-10 10:47:35,956 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_10/sequence_index_0.npz
2023-07-10 10:49:41,618 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_10/sequence_index_0.npz
2023-07-10 10:50:31,759 - __main__ - INFO - Writing sequence to disk: 10
2023-07-10 10:50:44,338 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_10/sequence_index_10.npz
2023-07-10 10:54:43,329 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_10/sequence_index_10.npz
2023-07-10 10:54:57,738 - __main__ - INFO - Generating sequence for 20
2023-07-10 10:54:57,739 - __main__ - INFO - Generating shifted clickstreams from 11 -> 21
2023-07-10 10:54:57,740 - __main__ - INFO - Loading sequence: 11 -> 21
2023-07-10 10:55:01,486 - __main__ - INFO - Loading sequence: 12 

| ID | GPU | MEM |
------------------
|  0 | 96% | 55% |


2023-07-10 10:55:45,997 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_10/sequence_index_20.npz
2023-07-10 10:59:39,905 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_10/sequence_index_20.npz
2023-07-10 10:59:53,330 - __main__ - INFO - Generating sequence for 30
2023-07-10 10:59:53,331 - __main__ - INFO - Generating shifted clickstreams from 21 -> 31
2023-07-10 10:59:53,332 - __main__ - INFO - Loading sequence: 21 -> 31
2023-07-10 10:59:57,117 - __main__ - INFO - Loading sequence: 22 -> 31
2023-07-10 11:00:00,768 - __main__ - INFO - Loading sequence: 23 -> 31
2023-07-10 11:00:04,307 - __main__ - INFO - Loading sequence: 24 -> 31
2023-07-10 11:00:07,829 - __main__ - INFO - Loading sequence: 25 -> 31
2023-07-10 11:00:11,369 - __main__ - INFO - Loading sequence: 26 -> 31
2023-07-10 11:00:14,904 - __main__ - INFO - Loading sequence: 27 -> 31
2023-07-10 11:00:18,436 - __main__ - INFO - Loading sequence: 28 -> 3

| ID | GPU | MEM |
------------------
|  0 | 91% | 55% |


2023-07-10 11:00:41,894 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_10/sequence_index_30.npz
2023-07-10 11:04:28,329 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_10/sequence_index_30.npz
2023-07-10 11:04:41,659 - __main__ - INFO - Generating sequence for 40
2023-07-10 11:04:41,660 - __main__ - INFO - Generating shifted clickstreams from 31 -> 41
2023-07-10 11:04:41,663 - __main__ - INFO - Loading sequence: 31 -> 41
2023-07-10 11:04:45,469 - __main__ - INFO - Loading sequence: 32 -> 41
2023-07-10 11:04:49,147 - __main__ - INFO - Loading sequence: 33 -> 41
2023-07-10 11:04:52,668 - __main__ - INFO - Loading sequence: 34 -> 41
2023-07-10 11:04:56,186 - __main__ - INFO - Loading sequence: 35 -> 41
2023-07-10 11:04:59,718 - __main__ - INFO - Loading sequence: 36 -> 41
2023-07-10 11:05:03,239 - __main__ - INFO - Loading sequence: 37 -> 41
2023-07-10 11:05:06,764 - __main__ - INFO - Loading sequence: 38 -> 4

| ID | GPU | MEM |
------------------
|  0 | 61% | 55% |


2023-07-10 11:05:30,091 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_10/sequence_index_40.npz
2023-07-10 11:09:12,710 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_10/sequence_index_40.npz
2023-07-10 11:09:27,798 - __main__ - INFO - Sessionize complete for sequences [10, 20, 30, 40]
2023-07-10 11:09:27,833 - __main__ - INFO - Starting sessionize_users_cpu.py with arguments
2023-07-10 11:09:27,834 - __main__ - INFO - {'data_subset': 30,
 'input_path': 'calculated_features',
 'output_path': 'torch_ready_data',
 'save_s3': True,
 'seq_list': [10, 20, 30, 40],
 'window': 20}
2023-07-10 11:09:27,834 - __main__ - INFO - Reading data from calculated_features/files_used_30/calculated_features_window_20.parquet
2023-07-10 11:09:35,425 - __main__ - INFO - Data read: (38500990, 36)
2023-07-10 11:09:35,425 - __main__ - INFO - Casting date time and sorting by date time
2023-07-10 11:09:35,494 - __main__ - INFO - Dat

| ID | GPU | MEM |
------------------
|  0 | 60% | 55% |


2023-07-10 11:10:49,830 - __main__ - INFO - Initial clickstream shape: (38500990, 35)
2023-07-10 11:10:49,840 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_20/sequence_index_0.npz
2023-07-10 11:17:28,751 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_20/sequence_index_10.npz
2023-07-10 11:17:53,985 - __main__ - INFO - Generating sequence for 20
2023-07-10 11:17:53,986 - __main__ - INFO - Generating shifted clickstreams from 11 -> 21
2023-07-10 11:17:53,986 - __main__ - INFO - Loading sequence: 11 -> 21
2023-07-10 11:17:57,731 - __main__ - INFO - Loading sequence: 12 -> 21
2023-07-10 11:18:01,436 - __main__ - INFO - Loading sequence: 13 -> 21
2023-07-10 11:18:04,926 - __main__ - INFO - Loading sequence: 14 -> 21
2023-07-10 11:18:08,419 - __main__ - INFO - Loading sequence: 15 -> 21
2023-07-10 11:18:11,918 - __main__ - INFO - Loading sequence: 16 -> 21
2023-07-10 11:18:15,433 - __main__ - INFO - Loading seq

| ID | GPU | MEM |
------------------
|  0 | 59% | 55% |


2023-07-10 11:18:41,953 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_20/sequence_index_20.npz
2023-07-10 11:22:28,478 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_20/sequence_index_20.npz
2023-07-10 11:22:52,566 - __main__ - INFO - Generating sequence for 30
2023-07-10 11:22:52,567 - __main__ - INFO - Generating shifted clickstreams from 21 -> 31
2023-07-10 11:22:52,567 - __main__ - INFO - Loading sequence: 21 -> 31
2023-07-10 11:22:56,190 - __main__ - INFO - Loading sequence: 22 -> 31
2023-07-10 11:22:59,779 - __main__ - INFO - Loading sequence: 23 -> 31
2023-07-10 11:23:03,267 - __main__ - INFO - Loading sequence: 24 -> 31
2023-07-10 11:23:06,721 - __main__ - INFO - Loading sequence: 25 -> 31
2023-07-10 11:23:10,181 - __main__ - INFO - Loading sequence: 26 -> 31
2023-07-10 11:23:13,664 - __main__ - INFO - Loading sequence: 27 -> 31
2023-07-10 11:23:17,124 - __main__ - INFO - Loading sequence: 28 -> 3

| ID | GPU | MEM |
------------------
|  0 | 75% | 55% |


2023-07-10 11:23:40,031 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_20/sequence_index_30.npz
2023-07-10 11:27:21,736 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_20/sequence_index_30.npz
2023-07-10 11:27:49,548 - __main__ - INFO - Generating sequence for 40
2023-07-10 11:27:49,549 - __main__ - INFO - Generating shifted clickstreams from 31 -> 41
2023-07-10 11:27:49,549 - __main__ - INFO - Loading sequence: 31 -> 41
2023-07-10 11:27:53,333 - __main__ - INFO - Loading sequence: 32 -> 41
2023-07-10 11:27:57,004 - __main__ - INFO - Loading sequence: 33 -> 41
2023-07-10 11:28:00,531 - __main__ - INFO - Loading sequence: 34 -> 41
2023-07-10 11:28:04,054 - __main__ - INFO - Loading sequence: 35 -> 41
2023-07-10 11:28:07,589 - __main__ - INFO - Loading sequence: 36 -> 41
2023-07-10 11:28:11,112 - __main__ - INFO - Loading sequence: 37 -> 41
2023-07-10 11:28:14,624 - __main__ - INFO - Loading sequence: 38 -> 4

| ID | GPU | MEM |
------------------
|  0 | 51% | 55% |


2023-07-10 11:28:37,727 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30_window_20/sequence_index_40.npz
2023-07-10 11:32:16,647 - __main__ - INFO - Uploading to s3: dissertation-data-dmiller/torch_ready_data/files_used_30_window_20/sequence_index_40.npz
2023-07-10 11:32:31,517 - __main__ - INFO - Sessionize complete for sequences [10, 20, 30, 40]
