In [1]:
!pip install torch boto3 python-dotenv gputil --quiet


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.4.0 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.29.64 which is incompatible.

In [2]:
import atexit
import sys
import logging
# Imported under an alias: a later cell runs `from datetime import datetime`,
# and sharing the bare name `datetime` between the module and the class breaks
# whichever of the two cells is re-run second.
import datetime as dt

# Mirror notebook output into a log file named after the current local time
# (yyyy_mm_dd_hh_mm_ss).
# Original author's note: using A6000 or bigger.
current_time = dt.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
so = open(f"logs_{current_time}.log", 'w', 10)
# The handle has to outlive this cell, so close (and flush) it at interpreter exit
# instead of leaving it to be dropped unflushed.
atexit.register(so.close)

# NOTE(review): presumably relies on the kernel's stdout/stderr stream objects
# copying their writes to an `echo` attribute - confirm for the kernel in use.
sys.stdout.echo = so
sys.stderr.echo = so

# Point IPython's own logger at the same file. If it has no handler yet, attach
# one rather than failing with IndexError on handlers[0].
ipython_log = get_ipython().log
if ipython_log.handlers:
    ipython_log.handlers[0].stream = so
else:
    ipython_log.addHandler(logging.StreamHandler(so))
ipython_log.setLevel(logging.INFO)

In [3]:
# Load the dotenv IPython extension, then read variables from the file named `env`.
# NOTE(review): presumably this supplies the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# values that the S3 upload code reads from os.environ - confirm.
%load_ext dotenv
%dotenv env

In [4]:
# %load sessionization_pandas.py
import argparse
import glob
import os
import pdb
import pprint as pp
from datetime import datetime

import boto3
import torch
import tqdm

if torch.cuda.is_available():
    import cupy as np
    import numpy
    import cudf as pd
    import dask_cudf as dd
    import pandas as cpu_pd
    from cuml.preprocessing import MinMaxScaler
    import GPUtil
else:
    import numpy as np
    import numpy
    import pandas as pd
    import dask.dataframe as dd
    from sklearn.preprocessing import MinMaxScaler

# Fixed-point, 4-decimal printing for tensors and arrays, with wide lines so
# long rows are not wrapped.
torch.set_printoptions(sci_mode=False, precision=4, linewidth=400)
np.set_printoptions(suppress=True, precision=4, linewidth=200)

# Numeric feature columns (scaled together with GENERATED_COLS further down).
SCALED_COLS = [
    'timestamp', 'time_diff_seconds',
    '30_minute_session_count', '5_minute_session_count',
    'task_within_session_count',
    'user_count', 'project_count', 'country_count',
]

# Cumulative / averaged / rolling feature columns.
GENERATED_COLS = [
    'cum_events', 'cum_projects', 'cum_time', 'cum_time_within_session',
    'av_time_across_clicks', 'av_time_across_clicks_session',
    'rolling_average_tasks_within_session',
    'rolling_av_time_within_session',
    'rolling_time_between_sessions',
]

# Identifier-like columns.
ENCODED_COLS = ['user_id', 'project_id', 'country']

# Shifts are computed within each group defined by these columns.
GROUPBY_COLS = ['user_id']

TIMESTAMP_INDEX = 1

# Columns requested when the CSV files are first read: the encoded columns,
# 'label', 'date_time', and every scaled column whose name contains neither
# 'timestamp' nor 'project_count'.
INITIAL_LOAD_COLUMNS = [
    *ENCODED_COLS,
    'label',
    'date_time',
    *(
        col for col in SCALED_COLS
        if not ('timestamp' in col or 'project_count' in col)
    ),
]

# Named partition presets.
# NOTE(review): 'size' is presumably a row count, with None meaning "no limit" -
# confirm against the code that consumes this list.
# NOTE(review): the '125m' preset has size 1_250_000 (1.25 million); check whether
# the name is intended.
# NOTE(review): only the first two presets carry an 'indexes' key; consumers must
# not assume it is present on every entry.
PARTITION_LIST = [
    dict(name='125k', size=125_000, indexes=None),
    dict(name='125m', size=1_250_000, indexes=None),
    dict(name='5m', size=5_000_000),
    dict(name='10m', size=10_000_000),
    dict(name='20m', size=20_000_000),
    dict(name='full', size=None),
]

class SessionizeData:
    """Build per-group shifted feature tensors from a dataframe and save them as ``.npz``.

    For each of ten consecutive shift offsets ending at ``max_sequence_index``,
    every feature column is shifted by that offset within each ``GROUPBY_COLS``
    group, missing values are filled with 0, and the columns are stacked side
    by side. The ten per-offset tensors are then concatenated column-wise,
    written to ``write_path`` and, when ``save_s3`` is true, uploaded to S3.

    Args:
        df: dataframe holding the label, encoded, scaled and generated columns.
        max_sequence_index: largest shift offset of the window (inclusive).
        write_path: directory that receives the ``.npz`` files (created if missing).
        partition_list: stored on the instance; not used by this class itself.
        save_s3: when true, each saved file is also uploaded to ``S3_BUCKET``.
    """

    # Bucket that receives uploads when ``save_s3`` is true.
    S3_BUCKET = 'dissertation-data-dmiller'

    def __init__(self, df, max_sequence_index, write_path, partition_list=PARTITION_LIST, save_s3=True):
        self.df = df
        # Stored as an exclusive upper bound; the window is the ten offsets below it.
        self.max_sequence_index = max_sequence_index + 1
        self.min_sequence_index = self.max_sequence_index - 10
        self.device = self._device()
        # Shift offsets still to be processed; consumed front-to-back.
        self.sequences = numpy.arange(self.min_sequence_index, self.max_sequence_index).tolist()
        self.seq_container = []
        self.torch_sequences = None
        self.output_path = write_path
        self.partition_list = partition_list
        self.save_s3 = save_s3

    def _device(self):
        """Return the CUDA device when one is available, otherwise the CPU device."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _sequence_lazy(self):
        """Return the feature tensor for the next pending shift offset."""
        return self._lazy_load_shifted_index()

    def _shifters(self):
        """Load every offset in the window and concatenate the results column-wise (half precision)."""
        for shift in range(self.min_sequence_index, self.max_sequence_index):
            print(f'Loading sequence: {shift} -> {self.max_sequence_index}')
            self.seq_container.append(self._sequence_lazy())
            if torch.cuda.is_available():
                GPUtil.showUtilization()

        return torch.cat(self.seq_container, dim=1).half()

    def generate_sequence(self):
        """Compute the shifted window and write it (and, for the first window, the unshifted data) to disk."""
        print(f'Generating shifted clickstreams from {self.min_sequence_index} -> {self.max_sequence_index}')
        sequence = self._shifters()

        print(f'Shifters shape: {sequence.shape}')

        cols_required = ['label', 'total_events', 'timestamp_raw'] + ENCODED_COLS + SCALED_COLS + GENERATED_COLS
        print(f'Columns required: {cols_required}')
        print(f'Loading intial clickstream to {self.device}')

        # True only for the window built with max_sequence_index=10 (offsets 1..10):
        # that run also stores the unshifted rows under index 0.
        if self.max_sequence_index == 11:
            print('Initial clickstream writing to disk')
            initial_clickstream = self.df[cols_required].values.astype(np.float32)
            self._sequence_to_disk(initial_clickstream, 0)

        print(f'Writing sequence to disk: {self.max_sequence_index - 1}')
        self._sequence_to_disk(sequence.cpu().numpy(), self.max_sequence_index - 1)

    def _sequence_to_disk(self, partition, sequence_index):
        """Save ``partition`` as ``sequence_index_<n>.npz`` under the output path; upload it if enabled.

        Raises:
            KeyError: if ``save_s3`` is true and the AWS credential variables are not set.
        """
        s3_client = None
        if self.save_s3:
            # Built before any disk work so missing credentials fail before a file is written.
            s3_client = boto3.client(
                's3',
                aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            )

        # exist_ok avoids the check-then-create race of exists() followed by makedirs().
        os.makedirs(self.output_path, exist_ok=True)

        partition_path = os.path.join(self.output_path, f'sequence_index_{sequence_index}.npz')
        print(f'Saving to disk: {partition_path}')
        np.savez_compressed(partition_path, partition)

        if s3_client is not None:
            # The local relative path doubles as the object key.
            print(f'Uploading to s3: {self.S3_BUCKET}/{partition_path}')
            s3_client.upload_file(partition_path, self.S3_BUCKET, partition_path)

    def _lazy_load_shifted_index(self):
        """Pop the next shift offset and return its feature tensor, one column per feature.

        Raises:
            IndexError: if every offset of the window has already been consumed.
        """
        if not self.sequences:
            raise IndexError('no shift offsets left: this window has already been fully loaded')

        torch.cuda.empty_cache()
        indx = self.sequences.pop(0)
        torch_container = []
        for col in ['project_id'] + SCALED_COLS + GENERATED_COLS:
            # Shift within each group; rows without enough history become 0.
            sequence = self.df.groupby(GROUPBY_COLS)[col].shift(indx).fillna(0).values.astype(np.float16)
            sequence_tensor = torch.tensor(sequence).to(self.device).half()
            torch_container.append(sequence_tensor.unsqueeze(1))
            torch.cuda.empty_cache()

        return torch.cat(torch_container, dim=1)


def parse_args():
    """Parse the command-line options for the sessionization script.

    Returns:
        argparse.Namespace with ``max_sequence_index``, ``input_path``,
        ``output_path``, ``data_subset`` and ``seq_list``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--max_sequence_index', type=int, default=10)
    parser.add_argument('--input_path', type=str, default='../datasets/frequency_encoded_data')
    parser.add_argument('--output_path', type=str, default='torch_ready_data')
    parser.add_argument('--data_subset', type=int, default=60, help='Number of files to read from input path')
    # main() reports args.seq_list, so the parser has to provide it.
    parser.add_argument(
        '--seq_list',
        type=int,
        nargs='+',
        default=[10, 20, 30, 40],
        help='Sequence indexes to generate',
    )
    return parser.parse_args()

def main(args):
    """Report the run settings and select the input CSV files.

    Args:
        args: object exposing ``input_path``, ``output_path`` and ``data_subset``;
            ``seq_list`` is reported when present.

    Returns:
        The first ``args.data_subset`` ``*.csv`` paths under ``args.input_path``,
        in sorted order.
    """
    torch.set_printoptions(sci_mode=False, precision=4)
    np.set_printoptions(suppress=True, precision=4)

    current_time = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")

    # A namespace from a parser without --seq_list must not crash the banner.
    seq_list = getattr(args, 'seq_list', None)
    print(f"Starting {current_time}\nsubset of data: {args.data_subset}\nreading from {args.input_path}\nwrite_path {args.output_path}\nseq_list {seq_list}")

    files = sorted(glob.glob(f'{args.input_path}/*.csv'))[:args.data_subset]
    # Returned so the selection is not computed and then thrown away.
    return files

class Arguments:
    """Settings object passed to ``main`` in place of parsed command-line arguments.

    Args:
        seq_list: sequence indexes to generate.
        data_subset: number of input files to read. Optional (default 5) so both
            the one-argument and the two-argument call style work.
    """

    def __init__(self, seq_list, data_subset=5):
        self.seq_list = seq_list
        self.input_path = 'datasets/frequency_encoded_data'
        self.output_path = 'datasets/torch_ready_data_2'
        self.data_subset = data_subset
        self.save_s3 = False



In [5]:
class Arguments:
    """Run settings handed to ``main``: two caller-supplied values plus fixed paths."""

    def __init__(self, seq_list, data_subset):
        # Supplied by the caller.
        self.seq_list, self.data_subset = seq_list, data_subset
        # Fixed for this run.
        self.input_path, self.output_path = 'frequency_encoded_data', 'torch_ready_data_4'
        self.save_s3 = True


In [8]:
# Run main once per file-count setting, always with the same sequence indexes.
seq_list = [10, 20, 30, 40]
for data_subset in (5, 30, 45, 61):
    print(f'Generating data for {data_subset} files: seq_list = {seq_list}')
    # Each run gets its own copy of the list.
    args = Arguments(list(seq_list), data_subset)
    main(args)


Generating data for 5 files: seq_list = [10, 20, 30, 40]


NameError: name 'Arguments' is not defined

In [13]:
import cupy as np
import numpy
import cudf as pd
import dask_cudf as dd
import pandas as cpu_pd
from cuml.preprocessing import MinMaxScaler
import glob

# Numeric feature columns.
SCALED_COLS = [
    'timestamp', 'time_diff_seconds',
    '30_minute_session_count', '5_minute_session_count',
    'task_within_session_count',
    'user_count', 'project_count', 'country_count',
]

# Cumulative / averaged / rolling feature columns.
GENERATED_COLS = [
    'cum_events', 'cum_projects', 'cum_time', 'cum_time_within_session',
    'av_time_across_clicks', 'av_time_across_clicks_session',
    'rolling_average_tasks_within_session',
    'rolling_av_time_within_session',
    'rolling_time_between_sessions',
]

# Identifier-like columns.
ENCODED_COLS = ['user_id', 'project_id', 'country']

GROUPBY_COLS = ['user_id']

TIMESTAMP_INDEX = 1

# Columns requested from the CSV files: the encoded columns, 'label', 'date_time',
# and every scaled column whose name contains neither 'timestamp' nor 'project_count'.
_scaled_cols_in_csv = [
    col for col in SCALED_COLS
    if not ('timestamp' in col or 'project_count' in col)
]
INITIAL_LOAD_COLUMNS = [*ENCODED_COLS, 'label', 'date_time', *_scaled_cols_in_csv]

In [14]:
def prepare_for_sessionization(data_paths: list, scaler: MinMaxScaler, load_only: bool = True):
    """Read the input CSV files and, optionally, prepare them for sessionization.

    Args:
        data_paths: CSV paths to read; only ``INITIAL_LOAD_COLUMNS`` are loaded.
        scaler: scaler fitted on the scaled + generated columns. Only used when
            ``load_only`` is false.
        load_only: when true (the default, matching the previous behaviour of
            this function) return the frame straight after reading it.

    Returns:
        The loaded frame, or the encoded, sorted, scaled float32 frame when
        ``load_only`` is false.
    """
    df = dd.read_csv(data_paths, usecols=INITIAL_LOAD_COLUMNS)
    if load_only:
        # Previously an unconditional `return df`, which left everything below unreachable.
        return df

    # NOTE(review): this path was dead code before and has not been exercised;
    # it depends on `join_for_encodings`, which is not defined in this notebook.
    df = join_for_encodings(df)
    df['timestamp_raw'] = df['date_time'].astype('int64') // 10**9
    df = df.sort_values(by='date_time')

    df['timestamp'] = df['timestamp_raw']
    print(f'Loaded data: shape = {df.shape}, min_date, max_date: {df.date_time.min()}, {df.date_time.max()}')
    print(f'Data label true: {df.label.value_counts() / len(df)}')
    df = df[SCALED_COLS + GENERATED_COLS + ENCODED_COLS + ['timestamp_raw', 'label', 'total_events']]
    df[SCALED_COLS + GENERATED_COLS] = scaler.fit_transform(df[SCALED_COLS + GENERATED_COLS].values)
    return df.astype('float32')


In [19]:

# Take the first 61 CSV files in sorted path order and load them.
files = sorted(glob.glob('frequency_encoded_data/*.csv'))[:61]
df = prepare_for_sessionization(files, MinMaxScaler())



In [20]:
# Show the number of distinct values for each identifier column.
for id_column in ('user_id', 'project_id'):
    display(df[id_column].nunique().compute())

104744

846