In [1]:

!python -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 --quiet
!python -m pip install boto3 python-dotenv --quiet



[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.4.0 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.29.135 which is incompatible.[0m[31m
[0m

In [2]:
%load_ext dotenv
%dotenv env

In [3]:
LABEL = [
    "continue_work_session_30_minutes"
]

METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "global_events_user",
    "global_session_time",
    
    "year",
    "month",
    "day",
    "hour",
    "minute",
    "second"
]

OUT_FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time",
    
    "expanding_click_average",
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]


GROUPBY_COLS = ['user_id']

LOAD_COLS = LABEL + METADATA + OUT_FEATURE_COLUMNS



In [4]:
import cudf as gpu_pd
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as cpu_np
import cupy as gpu_np
import pandas as cpu_pd
from pprint import pformat
import logging
import os
import glob
import boto3
import os
import numpy 
import logging

In [5]:
class SessionizeData:
    logger = logging.getLogger(__name__)
    def __init__(self, df, max_sequence_index, write_path, load_cols, feature_cols, grouper, save_s3=True):
        self.df = df
        self.max_sequence_index = max_sequence_index + 1
        self.min_sequence_index = self.max_sequence_index - 10
        self.device = self._device()
        self.sequences = cpu_np.arange(self.min_sequence_index, self.max_sequence_index).tolist()
        self.seq_container = []
        self.torch_sequences = None
        self.output_path = write_path
        self.save_s3 = save_s3
        self.load_columns = load_cols
        self.feature_cols = feature_cols
        self.grouper = grouper
        

    def _device(self):
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _sequence_lazy(self):
         return next(self._lazy_load_shifted_index())

    def _shifters(self):
        for _ in range(self.min_sequence_index, self.max_sequence_index):
            self.logger.info(f'Loading sequence: {_} -> {self.max_sequence_index}')
            self.seq_container.append(self._sequence_lazy())
        if torch.cuda.is_available():
            GPUtil.showUtilization()

        sequences = torch.cat(self.seq_container, dim=1).half()
        return sequences

    def generate_sequence(self):

        self.logger.info(f'Generating shifted clickstreams from {self.min_sequence_index} -> {self.max_sequence_index}')
        sequence = self._shifters()

        self.logger.info(f'Shifter shape: {sequence.shape}')
        

        self.logger.info(f'Loading intial clickstream to {self.device}')

        if self.max_sequence_index == 11:
            self.logger.info('Initial clickstream writing to disk')
            initial_clickstream = self.df[self.load_columns]
            initial_clickstream = self.df[self.load_columns].values.astype(gpu_np.float32)
            
            self.logger.info(f'Initial clickstream shape: {initial_clickstream.shape}')
            self._sequence_to_disk(initial_clickstream, 0)

        self.logger.info(f'Writing sequence to disk: {self.max_sequence_index - 1}')
        self._sequence_to_disk(sequence.cpu().numpy(), self.max_sequence_index - 1)


    def _sequence_to_disk(self, partition, sequence_index):
        if self.save_s3:
            s3_client = boto3.client(
                's3',
                aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            )

        if not os.path.exists(self.output_path):
            os.makedirs(self.output_path)

        partition_path = os.path.join(self.output_path, f'sequence_index_{sequence_index}.npz')
        self.logger.info(f'Saving to disk: {partition_path}')
        numpy.savez_compressed(partition_path, partition)

        if self.save_s3:
            self.logger.info(f'Uploading to s3: dissertation-data-dmiller/{partition_path}')
            s3_client.upload_file(partition_path, 'dissertation-data-dmiller', partition_path)

    def _lazy_load_shifted_index(self):

        torch.cuda.empty_cache()
        indx = self.sequences.pop(0)
        torch_container = []
        for col in self.feature_cols:
            sequence = self.df.groupby(self.grouper)[col].shift(indx).fillna(0).values.astype(gpu_np.float16)
            sequence_tensor = torch.tensor(sequence).to(self.device).half()
            torch_container.append(sequence_tensor.unsqueeze(1))
            torch.cuda.empty_cache()

        yield torch.cat(torch_container, dim=1).half()

In [6]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

torch.set_printoptions(sci_mode=False, precision=4, linewidth=400)

cpu_np.set_printoptions(suppress=True)
cpu_np.set_printoptions(precision=4)

def get_logger():
    logger = logging.getLogger(__name__)
    return logger


def scale_feature_cols(df, scaler, scaler_columns):
    df[scaler_columns] = scaler.fit_transform(df[scaler_columns])
    return df

def main(args):
    
    logger = get_logger()
    logger.info('Starting sessionize_users_cpu.py with arguments')
    logger.info(pformat(args.__dict__))
    
    data_read = os.path.join(args.input_path, f'files_used_{args.data_subset}')

    logger.info(f'Reading data from {data_read}')
    scaler_cols = [
        col for col in OUT_FEATURE_COLUMNS if 'sin' not in col and 'cos' not in col
    ]

    df = gpu_pd.read_parquet(data_read, columns=LOAD_COLS + ['date_time'])
    logger.info(f'Data read: {df.shape}')
    logger.info('Casting date time and sorting by date time')
    df['date_time'] = gpu_pd.to_datetime(df['date_time'])
    df = df.sort_values(by=['date_time'])
    logger.info('Data read: scaling scaler columns')
    scaler = MinMaxScaler(feature_range=(-1, 1))
    df = df.to_pandas()
    logger.info(f'Shape before features: {df.shape}')
    df[scaler_cols] = scaler.fit_transform(df[scaler_cols])
    df = gpu_pd.from_pandas(df)
    logger.info(f'Shape after features: {df.shape}')

    logger.info('Scaling complete: implement sessionize')
    
    for seq_index in args.seq_list:

        sessionize = SessionizeData(
            df,
            seq_index,
            os.path.join(args.output_path, f'files_used_{args.data_subset}'),
            LOAD_COLS,
            OUT_FEATURE_COLUMNS,
            GROUPBY_COLS,
            args.save_s3
        )
    
        logger.info(f'Generating sequence for {seq_index}')
        sessionize.generate_sequence()
    
    logger.info(f'Sessionize complete for sequences {args.seq_list}')
   




In [7]:
class Arguments:
    seq_list = [10, 20, 30, 40]
    input_path = 'calculated_features'
    output_path = 'torch_ready_data'
    data_subset = None
    save_s3 = True

for data_subset in [30]:
    args = Arguments()
    args.data_subset = data_subset
    df = main(args)

2023-05-18 10:30:47,712 - __main__ - INFO - Starting sessionize_users_cpu.py with arguments
2023-05-18 10:30:47,713 - __main__ - INFO - {'data_subset': 30}
2023-05-18 10:30:47,713 - __main__ - INFO - Reading data from calculated_features/files_used_30
