In [1]:

!python -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 --quiet
!python -m pip install boto3 python-dotenv GPUtil --quiet



[0m

In [2]:
# Enable the python-dotenv IPython extension and load variables from the file named `env`
# in the working directory into the process environment.
# NOTE(review): presumably this is where AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# (read by SessionizeData when save_s3 is enabled) come from — confirm.
%load_ext dotenv
%dotenv env

In [None]:
# Column(s) loaded as the label.
LABEL = ["continue_work_session_30_minutes"]

# Columns carried alongside the features; they are loaded but are not part of
# OUT_FEATURE_COLUMNS, so they are neither scaled nor shifted.
METADATA = [
    "user_id", "session_30_raw", "cum_platform_event_raw",
    "cum_platform_time_raw", "cum_session_time_raw", "global_session_time",
    "year", "month", "day",
    "hour", "minute", "second",
]

# Feature columns that are shifted per group to build the sequences.
OUT_FEATURE_COLUMNS = [
    "user_count", "project_count", "country_count",
    # sin/cos columns: skipped by the scaler-column filter in main()
    "date_hour_sin", "date_hour_cos", "date_minute_sin", "date_minute_cos",
    "session_30_count", "session_5_count", "cum_session_event_count",
    "delta_last_event", "cum_session_time",
    "expanding_click_average", "cum_platform_time", "cum_platform_events",
    "cum_projects", "average_event_time",
    "rolling_session_time", "rolling_session_events", "rolling_session_gap",
    "previous_session_time", "previous_session_events",
]

# Key(s) used to group rows before shifting.
GROUPBY_COLS = ["user_id"]

# Everything read from the parquet files, in label -> metadata -> feature order.
LOAD_COLS = [*LABEL, *METADATA, *OUT_FEATURE_COLUMNS]



: 

In [4]:
import cudf as gpu_pd
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as cpu_np
import cupy as gpu_np
import pandas as cpu_pd
from pprint import pformat
import logging
import os
import glob
import boto3
import os
import numpy 
import logging
import GPUtil

In [5]:
class SessionizeData:
    """Build per-group lagged copies of the feature columns and save them as ``.npz`` partitions.

    For a given ``max_sequence_index`` the instance covers the ``SEQUENCE_WINDOW``
    shift offsets ending at that index (10 -> offsets 1..10, 20 -> offsets 11..20, ...).
    Every offset produces one float16 block with a column per feature; the blocks are
    concatenated column-wise and written to
    ``<write_path>/sequence_index_<max_sequence_index>.npz``.  The first window
    (offsets starting at 1) additionally writes the unshifted ``load_cols`` as
    ``sequence_index_0.npz``.

    Args:
        df: Frame supporting column selection, ``groupby(...)[col].shift(n)``,
            ``fillna`` and ``.values`` (a cudf DataFrame in this notebook).
        max_sequence_index: Largest shift offset of the window to generate.
        write_path: Local directory the partitions are written to; created if missing.
        load_cols: Columns saved as the unshifted "initial clickstream".
        feature_cols: Columns that are shifted.
        grouper: Column name(s) passed to ``groupby`` before shifting.
        save_s3: When true, every saved partition is also uploaded to the
            ``dissertation-data-dmiller`` bucket under the same relative path.
    """
    logger = logging.getLogger(__name__)

    # Number of consecutive shift offsets written into one partition.
    SEQUENCE_WINDOW = 10

    def __init__(self, df, max_sequence_index, write_path, load_cols, feature_cols, grouper, save_s3=True):
        self.df = df
        # Stored as an exclusive upper bound so it can be fed straight into range()/arange().
        self.max_sequence_index = max_sequence_index + 1
        self.min_sequence_index = self.max_sequence_index - self.SEQUENCE_WINDOW
        self.device = self._device()
        self.sequences = cpu_np.arange(self.min_sequence_index, self.max_sequence_index).tolist()
        self.seq_container = []
        self.torch_sequences = None
        self.output_path = write_path
        self.save_s3 = save_s3
        self.load_columns = load_cols
        self.feature_cols = feature_cols
        self.grouper = grouper

    def _device(self):
        """Return the CUDA device when one is available, otherwise the CPU device."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _sequence_lazy(self):
        """Compute and return the block for the next pending shift offset."""
        return next(self._lazy_load_shifted_index())

    def _shifters(self):
        """Compute every offset in the window and return them concatenated along dim 1 (float16)."""
        # Rebuild the work queue and the container on every run: the queue is consumed
        # with pop(0), so without this a second generate_sequence() call would raise
        # IndexError and the container would keep the previous run's blocks.
        self.sequences = cpu_np.arange(self.min_sequence_index, self.max_sequence_index).tolist()
        self.seq_container = []

        for shift_index in range(self.min_sequence_index, self.max_sequence_index):
            self.logger.info(f'Loading sequence: {shift_index} -> {self.max_sequence_index}')
            self.seq_container.append(self._sequence_lazy())
        if torch.cuda.is_available():
            GPUtil.showUtilization()

        return torch.cat(self.seq_container, dim=1).half()

    def generate_sequence(self):
        """Generate the shifted window and write it (plus, for the first window, the raw columns) to disk."""
        self.logger.info(f'Generating shifted clickstreams from {self.min_sequence_index} -> {self.max_sequence_index}')
        sequence = self._shifters()

        self.logger.info(f'Shifter shape: {sequence.shape}')

        self.logger.info(f'Loading intial clickstream to {self.device}')

        # Only the first window (offsets starting at 1) also persists the unshifted columns.
        if self.min_sequence_index == 1:
            self.logger.info('Initial clickstream writing to disk')
            # Select the columns once; an extra throwaway selection would copy the whole frame.
            initial_clickstream = self.df[self.load_columns].values.astype(gpu_np.float32)

            self.logger.info(f'Initial clickstream shape: {initial_clickstream.shape}')
            self._sequence_to_disk(initial_clickstream, 0)

        self.logger.info(f'Writing sequence to disk: {self.max_sequence_index - 1}')
        self._sequence_to_disk(sequence.cpu().numpy(), self.max_sequence_index - 1)

    def _sequence_to_disk(self, partition, sequence_index):
        """Save ``partition`` as ``sequence_index_<sequence_index>.npz`` and optionally upload it.

        Raises:
            KeyError: if ``save_s3`` is set and the AWS credential variables are
                missing from the environment.
        """
        s3_client = None
        if self.save_s3:
            # Built before the compression step so missing credentials fail immediately.
            s3_client = boto3.client(
                's3',
                aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            )

        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(self.output_path, exist_ok=True)

        partition_path = os.path.join(self.output_path, f'sequence_index_{sequence_index}.npz')
        self.logger.info(f'Saving to disk: {partition_path}')
        numpy.savez_compressed(partition_path, partition)

        if s3_client is not None:
            self.logger.info(f'Uploading to s3: dissertation-data-dmiller/{partition_path}')
            # The local relative path doubles as the object key.
            s3_client.upload_file(partition_path, 'dissertation-data-dmiller', partition_path)

    def _lazy_load_shifted_index(self):
        """Generator that pops the next offset and yields its ``(rows, n_features)`` float16 block once."""
        torch.cuda.empty_cache()
        indx = self.sequences.pop(0)
        torch_container = []
        for col in self.feature_cols:
            # Shift within each group; rows without enough history become 0.
            sequence = self.df.groupby(self.grouper)[col].shift(indx).fillna(0).values.astype(gpu_np.float16)
            sequence_tensor = torch.tensor(sequence).to(self.device).half()
            torch_container.append(sequence_tensor.unsqueeze(1))
            # Release cached blocks between columns to keep peak GPU memory down.
            torch.cuda.empty_cache()

        yield torch.cat(torch_container, dim=1).half()

In [6]:
# Notebook-wide logging and print formatting.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Fixed-point tensor printing with wide rows.
torch.set_printoptions(precision=4, sci_mode=False, linewidth=400)

# Fixed-point NumPy printing with four decimals.
cpu_np.set_printoptions(suppress=True, precision=4)

def get_logger():
    """Return the logger registered under this module's ``__name__``."""
    return logging.getLogger(__name__)


def scale_feature_cols(df, scaler, scaler_columns):
    """Fit ``scaler`` on the given columns and overwrite them with the scaled values.

    The frame is modified in place and the same object is returned.
    """
    scaled_values = scaler.fit_transform(df[scaler_columns])
    df[scaler_columns] = scaled_values
    return df

def main(args):
    """Scale the feature columns of one data subset and write its shifted sequence partitions.

    Steps: read ``<input_path>/files_used_<data_subset>`` as parquet, sort by
    ``date_time``, fit a MinMaxScaler (range -1..1) on the first 70% of rows and apply
    it to the remaining 30%, then run ``SessionizeData.generate_sequence`` once per
    entry of ``args.seq_list``.

    Args:
        args: Object exposing ``input_path``, ``output_path``, ``data_subset``,
            ``seq_list`` and ``save_s3`` attributes.

    Returns:
        None. Results are written to disk (and optionally S3) by ``SessionizeData``.
    """
    logger = get_logger()
    logger.info('Starting sessionize_users_cpu.py with arguments')
    # args.__dict__ only holds instance attributes, so defaults declared on the class
    # would be missing from the log; collect every public, non-callable attribute instead.
    arg_values = {
        name: getattr(args, name)
        for name in dir(args)
        if not name.startswith('_') and not callable(getattr(args, name))
    }
    logger.info(pformat(arg_values))

    data_read = os.path.join(args.input_path, f'files_used_{args.data_subset}')

    logger.info(f'Reading data from {data_read}')
    # Substring filter: every feature column without 'sin'/'cos' in its name gets scaled.
    scaler_cols = [
        col for col in OUT_FEATURE_COLUMNS if 'sin' not in col and 'cos' not in col
    ]

    df = gpu_pd.read_parquet(data_read, columns=LOAD_COLS + ['date_time'])
    logger.info(f'Data read: {df.shape}')
    logger.info('Casting date time and sorting by date time')
    df['date_time'] = gpu_pd.to_datetime(df['date_time'])
    df = df.sort_values(by=['date_time'])
    logger.info('Data read: scaling scaler columns')

    # Chronological 70/30 split; the scaler is a scikit-learn object, so both parts
    # are moved to pandas for scaling and back to the GPU afterwards.
    train_partition = int(df.shape[0] * 0.7)
    train_df = df.iloc[:train_partition].to_pandas()
    val_df = df.iloc[train_partition:].to_pandas()

    scaler = MinMaxScaler(feature_range=(-1, 1))
    logger.info('Fitting on train dataset')
    train_df[scaler_cols] = scaler.fit_transform(train_df[scaler_cols])

    # Transform only: the validation rows must not influence the fitted min/max.
    logger.info('Applying scalar to validation dataset')
    val_df[scaler_cols] = scaler.transform(val_df[scaler_cols])

    train_df, val_df = gpu_pd.from_pandas(train_df), gpu_pd.from_pandas(val_df)
    df = gpu_pd.concat([train_df, val_df]).sort_values(by=['date_time'])
    logger.info('DF concatenated')
    logger.info(f'Shape after features: {df.shape}')

    logger.info('Scaling complete: implement sessionize')

    output_dir = os.path.join(args.output_path, f'files_used_{args.data_subset}')
    for seq_index in args.seq_list:
        sessionize = SessionizeData(
            df,
            seq_index,
            output_dir,
            LOAD_COLS,
            OUT_FEATURE_COLUMNS,
            GROUPBY_COLS,
            args.save_s3
        )

        logger.info(f'Generating sequence for {seq_index}')
        sessionize.generate_sequence()

    logger.info(f'Sessionize complete for sequences {args.seq_list}')
   




In [7]:
class Arguments:
    """Run configuration consumed by ``main``; stands in for command-line arguments."""

    # Input / output directories (relative to the working directory).
    input_path = "calculated_features"
    output_path = "torch_ready_data"

    # Suffix of the ``files_used_<n>`` directory to process; set per run.
    data_subset = None

    # Largest shift offset of each partition to generate.
    seq_list = [10, 20, 30, 40]

    # Whether each saved partition is also uploaded to S3.
    save_s3 = False

# Run the pipeline once per data subset. main() persists its results to disk and has
# no return value, so nothing is bound here (assigning its result would only create
# a misleading `df = None`).
for data_subset in [30]:
    args = Arguments()
    args.data_subset = data_subset
    main(args)

2023-06-12 11:40:16,546 - __main__ - INFO - Starting sessionize_users_cpu.py with arguments
2023-06-12 11:40:16,548 - __main__ - INFO - {'data_subset': 30}
2023-06-12 11:40:16,549 - __main__ - INFO - Reading data from calculated_features/files_used_30
2023-06-12 11:40:18,860 - __main__ - INFO - Data read: (38500990, 36)
2023-06-12 11:40:18,861 - __main__ - INFO - Casting date time and sorting by date time
2023-06-12 11:40:18,924 - __main__ - INFO - Data read: scaling scaler columns
2023-06-12 11:40:30,745 - __main__ - INFO - Fitting on train dataset
2023-06-12 11:40:47,147 - __main__ - INFO - Applying scalar to validation dataset
2023-06-12 11:40:56,457 - __main__ - INFO - DF concatenated
2023-06-12 11:40:56,458 - __main__ - INFO - Shape after features: (38500990, 36)
2023-06-12 11:40:56,458 - __main__ - INFO - Scaling complete: implement sessionize
2023-06-12 11:40:56,460 - __main__ - INFO - Generating sequence for 10
2023-06-12 11:40:56,460 - __main__ - INFO - Generating shifted clic

| ID | GPU | MEM |
------------------
|  0 | 55% | 57% |


2023-06-12 11:41:34,992 - __main__ - INFO - Initial clickstream shape: (38500990, 35)
2023-06-12 11:41:34,993 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30/sequence_index_0.npz
2023-06-12 11:43:45,143 - __main__ - INFO - Writing sequence to disk: 10
2023-06-12 11:43:56,464 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30/sequence_index_10.npz
2023-06-12 11:47:56,465 - __main__ - INFO - Generating sequence for 20
2023-06-12 11:47:56,466 - __main__ - INFO - Generating shifted clickstreams from 11 -> 21
2023-06-12 11:47:56,466 - __main__ - INFO - Loading sequence: 11 -> 21
2023-06-12 11:47:59,978 - __main__ - INFO - Loading sequence: 12 -> 21
2023-06-12 11:48:03,448 - __main__ - INFO - Loading sequence: 13 -> 21
2023-06-12 11:48:06,979 - __main__ - INFO - Loading sequence: 14 -> 21
2023-06-12 11:48:10,428 - __main__ - INFO - Loading sequence: 15 -> 21
2023-06-12 11:48:13,827 - __main__ - INFO - Loading sequence: 16 -> 21
2023-06-12 11:48:17,209 - __m

| ID | GPU | MEM |
------------------
|  0 | 99% | 57% |


2023-06-12 11:48:42,094 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30/sequence_index_20.npz
2023-06-12 11:52:36,705 - __main__ - INFO - Generating sequence for 30
2023-06-12 11:52:36,706 - __main__ - INFO - Generating shifted clickstreams from 21 -> 31
2023-06-12 11:52:36,706 - __main__ - INFO - Loading sequence: 21 -> 31
2023-06-12 11:52:40,903 - __main__ - INFO - Loading sequence: 22 -> 31
2023-06-12 11:52:44,262 - __main__ - INFO - Loading sequence: 23 -> 31
2023-06-12 11:52:47,628 - __main__ - INFO - Loading sequence: 24 -> 31
2023-06-12 11:52:50,991 - __main__ - INFO - Loading sequence: 25 -> 31
2023-06-12 11:52:54,380 - __main__ - INFO - Loading sequence: 26 -> 31
2023-06-12 11:52:57,791 - __main__ - INFO - Loading sequence: 27 -> 31
2023-06-12 11:53:01,190 - __main__ - INFO - Loading sequence: 28 -> 31
2023-06-12 11:53:04,605 - __main__ - INFO - Loading sequence: 29 -> 31
2023-06-12 11:53:07,981 - __main__ - INFO - Loading sequence: 30 -> 31
2023-06-12 11:53

| ID | GPU | MEM |
------------------
|  0 | 44% | 57% |


2023-06-12 11:53:22,668 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30/sequence_index_30.npz
2023-06-12 11:57:13,480 - __main__ - INFO - Generating sequence for 40
2023-06-12 11:57:13,480 - __main__ - INFO - Generating shifted clickstreams from 31 -> 41
2023-06-12 11:57:13,481 - __main__ - INFO - Loading sequence: 31 -> 41
2023-06-12 11:57:16,982 - __main__ - INFO - Loading sequence: 32 -> 41
2023-06-12 11:57:20,575 - __main__ - INFO - Loading sequence: 33 -> 41
2023-06-12 11:57:24,125 - __main__ - INFO - Loading sequence: 34 -> 41
2023-06-12 11:57:27,498 - __main__ - INFO - Loading sequence: 35 -> 41
2023-06-12 11:57:30,866 - __main__ - INFO - Loading sequence: 36 -> 41
2023-06-12 11:57:34,314 - __main__ - INFO - Loading sequence: 37 -> 41
2023-06-12 11:57:37,773 - __main__ - INFO - Loading sequence: 38 -> 41
2023-06-12 11:57:41,264 - __main__ - INFO - Loading sequence: 39 -> 41
2023-06-12 11:57:44,758 - __main__ - INFO - Loading sequence: 40 -> 41
2023-06-12 11:57

| ID | GPU | MEM |
------------------
|  0 | 96% | 57% |


2023-06-12 11:57:59,330 - __main__ - INFO - Saving to disk: torch_ready_data/files_used_30/sequence_index_40.npz
2023-06-12 12:01:45,594 - __main__ - INFO - Sessionize complete for sequences [10, 20, 30, 40]


In [9]:
# Mirror the local artefact directories to the S3 bucket.
# WARNING: --delete removes objects in the destination that do not exist locally,
# so running this from a machine with an incomplete local copy deletes remote data.
!aws s3 sync calculated_features/ s3://dissertation-data-dmiller/calculated_features/ --delete
!aws s3 sync torch_ready_data/ s3://dissertation-data-dmiller/torch_ready_data/ --delete
!aws s3 sync rl_ready_data_conv s3://dissertation-data-dmiller/rl_ready_data_conv/ --delete

delete: s3://dissertation-data-dmiller/calculated_features/.ipynb_checkpoints/core_dict_30_files-checkpoint.json
delete: s3://dissertation-data-dmiller/calculated_features/core_dict_10_files.json
delete: s3://dissertation-data-dmiller/calculated_features/core_dict_30_files.json
delete: s3://dissertation-data-dmiller/calculated_features/files_used_30
delete: s3://dissertation-data-dmiller/calculated_features/files_used_10
delete: s3://dissertation-data-dmiller/calculated_features/files_used_10_description.csv
delete: s3://dissertation-data-dmiller/calculated_features/files_used_2
upload: calculated_features/files_used_2/calculated_features.parquet to s3://dissertation-data-dmiller/calculated_features/files_used_2/calculated_features.parquet
upload: calculated_features/files_used_10/calculated_features.parquet to s3://dissertation-data-dmiller/calculated_features/files_used_10/calculated_features.parquet
upload: calculated_features/files_used_30/calculated_features.parquet to s3://disser