In [3]:

import argparse
import glob
import os
import pdb
import pprint as pp
from datetime import datetime

import boto3
import torch
import tqdm

if torch.cuda.is_available():
    import cupy as np
    import numpy
    import cudf as pd
    import dask_cudf as dd
    from cuml.preprocessing import MinMaxScaler
    import GPUtil
else:
    import numpy as np
    import numpy
    import pandas as pd
    import dask.dataframe as dd
    from sklearn.preprocessing import MinMaxScaler

# Tensor display: 400-character lines, four decimals, no scientific notation.
torch.set_printoptions(linewidth=400, precision=4, sci_mode=False)

# Array display: scientific notation suppressed, four decimals, 200-character lines.
np.set_printoptions(suppress=True, precision=4, linewidth=200)

# Numeric feature columns (presumably the ones passed through MinMaxScaler — confirm).
SCALED_COLS = [
    'timestamp',
    'time_diff_seconds',
    '30_minute_session_count',
    '5_minute_session_count',
    'task_within_session_count',
    'user_count',
    'project_count',
    'country_count',
]

# Categorical columns that get replaced by integer codes.
ENCODED_COLS = ['user_id', 'project_id', 'country']

GROUPBY_COLS = ['user_id']

TIMESTAMP_INDEX = 1

# Columns requested from the CSV files: the categorical columns, three extra
# raw columns, and every scaled column whose name contains neither
# 'timestamp' nor 'project_count'.
INITIAL_LOAD_COLUMNS = (
    ENCODED_COLS
    + ['label', 'date_time', 'workflow_id']
    + [
        col
        for col in SCALED_COLS
        if not any(token in col for token in ('timestamp', 'project_count'))
    ]
)

# Integer code assigned to each known country name.
COUNTRY_ENCODING = {
    'Finland': 1,
    'United States': 2,
    'China': 3,
    'Singapore': 4,
}

# Dataset partitions by row count; a size of None marks the complete dataset.
# NOTE(review): the '125m' entry has size 1,250,000 — the name looks like it
# was meant to be '1.25m'; left unchanged because the string may be used in paths.
# NOTE(review): only the first two entries carry an 'indexes' key — confirm
# that consumers read it with .get() rather than [].
PARTITION_LIST = [
    {'name': '125k', 'size': 125_000, 'indexes': None},
    {'name': '125m', 'size': 1_250_000, 'indexes': None},
    {'name': '5m', 'size': 5_000_000},
    {'name': '10m', 'size': 10_000_000},
    {'name': '20m', 'size': 20_000_000},
    {'name': 'full', 'size': None},
]


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def prepare_for_sessionization(data_paths: list, scaler: MinMaxScaler):
    """Open the CSV files at ``data_paths`` as a dask DataFrame.

    Only the columns named in ``INITIAL_LOAD_COLUMNS`` are read.

    Args:
        data_paths: Paths handed straight to ``dd.read_csv``.
        scaler: Accepted as part of the signature but not used by this function.

    Returns:
        The DataFrame produced by ``dd.read_csv``.
    """
    return dd.read_csv(data_paths, usecols=INITIAL_LOAD_COLUMNS)

def _encode_countries(x):
        if x == 'Finland':
            return 1
        elif x == 'United States':
            return 2
        elif x == 'China':
            return 3
        else:
            return 4

In [5]:
def join_for_encodings(df):
    """Add project frequency counts and replace categorical columns with integer codes.

    Steps, in order:
      1. Count rows per ``project_id`` and left-join the counts back as
         ``project_count``.
      2. Materialise the frame with ``.compute()``.
      3. Encode ``country`` through ``_encode_countries``.
      4. Print the mean number of rows per
         (``user_id``, ``30_minute_session_count``) group.
      5. Replace ``user_id`` and ``project_id`` with 1-based integer codes taken
         from each value's position in the column's ``unique()`` result.

    Args:
        df: A dask (or dask_cudf) DataFrame exposing at least the columns
            ``user_id``, ``project_id``, ``country`` and
            ``30_minute_session_count``.

    Returns:
        The materialised DataFrame (converted back with ``pd.from_pandas`` when
        CUDA is available) in which ``user_id`` and ``project_id`` hold the
        integer codes and ``project_count`` has been added.
    """
    # Label the two columns by position. pandas < 2.0 names the reset_index()
    # output ('index', 'project_id') while pandas >= 2.0 names it
    # ('project_id', 'count'); a rename keyed on the old labels would destroy
    # the join key on newer versions and make the merge below fail.
    project_counts = df['project_id'].value_counts().reset_index()
    project_counts.columns = ['project_id', 'project_count']
    df = df.merge(project_counts, on='project_id', how='left')

    on_gpu = torch.cuda.is_available()

    # The country encoder is a plain Python function, so on GPU the frame is
    # moved to pandas for the apply() and converted back afterwards.
    df = df.compute()
    if on_gpu:
        df = df.to_pandas()

    df['country'] = df['country'].apply(_encode_countries)

    if on_gpu:
        df = pd.from_pandas(df)

    # size() yields one row count per session; the previous code indexed a
    # non-existent column 0 of a count() frame and discarded the message.
    events_per_session = df.groupby(['user_id', '30_minute_session_count']).size()
    print(f'Average number of events per session: {events_per_session.mean()}')

    # Build both lookup tables before merging so that neither merge can affect
    # the order in which the other column's unique values are enumerated.
    user_id_hash = pd.DataFrame(df['user_id'].unique()).reset_index().rename(
        columns={'index': 'user_id_hash', 0: 'user_id'}
    )
    project_id_hash = pd.DataFrame(df['project_id'].unique()).reset_index().rename(
        columns={'index': 'project_id_hash', 0: 'project_id'}
    )

    # Shift to 1-based codes so that 0 is never assigned to a real value.
    user_id_hash['user_id_hash'] = user_id_hash['user_id_hash'] + 1
    project_id_hash['project_id_hash'] = project_id_hash['project_id_hash'] + 1

    df = df.merge(user_id_hash, on='user_id', how='left')
    df = df.merge(project_id_hash, on='project_id', how='left')

    # Swap the raw identifiers for their integer codes under the original names.
    df = df.drop(columns=['user_id', 'project_id'])
    df = df.rename(columns={'user_id_hash': 'user_id', 'project_id_hash': 'project_id'})
    return df

In [6]:
class Arguments:
    """Holder for run settings: a sequence list, input/output paths, a subset selector and an S3 flag.

    NOTE(review): presumably a notebook stand-in for argparse-style options — confirm.
    """
    def __init__(self, seq_list):
        """Store ``seq_list`` and set the other settings to fixed values."""
        # Caller-supplied value, stored unchanged.
        self.seq_list = seq_list
        # Input location, given as a relative path.
        self.input_path = '../../../datasets/frequency_encoded_data'
        # Output location, given as a relative path.
        self.output_path = 'datasets/torch_ready_data'
        # NOTE(review): the meaning of 5 is not visible here (possibly tied to
        # PARTITION_LIST) — confirm against the code that reads data_subset.
        self.data_subset = 5
        # S3 saving is off by default; presumably gates a boto3 upload — confirm.
        self.save_s3 = False