In [17]:
!pip install --upgrade pip --quiet
!pip install --upgrade awscli python-dotenv --quiet
%load_ext dotenv
%dotenv env



[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 22.10.1+2.gca9a422da9 requires cupy-cuda115<12.0.0a0,>=9.5.0, which is not installed.
aiobotocore 2.4.0 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.29.142 which is incompatible.[0m[31m
[0m

In [19]:
!aws s3 sync rl_ready_data s3://dissertation-data-dmiller/rl_ready_data --quiet
!aws s3 sync calculated_features s3://dissertation-data-dmiller/calculated_features --quiet
!aws s3 sync labelled_session_count_data s3://dissertation-data-dmiller/labelled_session_count_data --quiet
!aws s3 sync torch_ready_data s3://dissertation-data-dmiller/torch_ready_data --quiet

In [None]:
!rm -r rl_ready_data calculated_features labelled_session_count_data torch_ready_data

In [1]:
# %load rl_constant
LABEL = [
    "continue_work_session_30_minutes",
    
]

METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "cum_session_event_raw",
    "global_events_user",
    "global_session_time",
    "date_time",
]

OUT_FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time",
    
    "expanding_click_average",
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]


RL_STAT_COLS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'sim_minutes',
    'cum_session_event_raw',
    'cum_platform_time_raw',
    'reward',
    'session_30_raw',
    'cum_platform_time_raw',
    'global_session_time',
]



In [2]:
import os
import pandas as pd
import logging

from tqdm import tqdm
import numpy as np
global logger
logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('rl_results_eval')
from functools import reduce
from pprint import pformat
import cudf as gpu_pd
import cupy as gpu_np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
ALL_COLS = LABEL + METADATA + OUT_FEATURE_COLUMNS  + RL_STAT_COLS
from cuml.preprocessing import MinMaxScaler

def convolve_delta_events(df, window):
    
    before_resample = df.shape
    df = df.to_pandas()
    
    logger.info(f'Convolution over delta last event')
    
    df['convolved_delta_event'] = (
        df.sort_values(by=['session_30_raw', 'cum_session_event_raw']) \
            .set_index('date_time') \
            .groupby(by=['user_id', 'session_30_raw'], group_keys=False) \
            .rolling(f'{window}T', min_periods=1)['delta_last_event'] \
            .mean()
            .reset_index(name='convolved_event_delta')['convolved_event_delta'] \
    ) 

    df['delta_last_event'] = df['convolved_delta_event']
    
    df['row_count'] = df.index.values 
    logger.info(f'Convolution complete: resampling at {window} minute intervals')
    resampled_df = df \
        .sort_values(by=['row_count']) \
        .set_index('date_time') \
        .groupby(by=['user_id']) \
        .resample(f'{window}T') \
        .max()['row_count'] \
        .reset_index() \
        .sort_values(by=['row_count']) 
    
    
    logger.info(f'Generated resample df, merging back into original df')
        
    df_out = df[df['row_count'] \
        .isin(resampled_df['row_count'])] \
        .drop(columns=['row_count']) \
        .reset_index(drop=True) 
    
    
    logger.info(f'Convolution complete: {before_resample} -> {df_out.shape}')
    return gpu_pd.DataFrame(df_out)

def generate_metadata(dataset):
    
    session_size = dataset.groupby(['user_id', 'session_30_raw'])['cum_session_event_raw'].max().reset_index(name='session_size')
    session_minutes = dataset.groupby(['user_id', 'session_30_raw'])['cum_session_time_raw'].max().reset_index(name='session_minutes')
    
    sim_minutes = dataset.groupby(['user_id', 'session_30_raw'])['cum_session_time_raw'].quantile(.7, interpolation='nearest').reset_index(name='sim_minutes')
    sim_size = dataset.groupby(['user_id', 'session_30_raw'])['cum_session_event_raw'].quantile(.7, interpolation='nearest').reset_index(name='sim_size')
    
    
    sessions = [session_size, session_minutes, sim_minutes, sim_size]
    sessions = reduce(lambda left, right: gpu_pd.merge(left, right, on=['user_id', 'session_30_raw']), sessions)
    dataset = gpu_pd.merge(dataset, sessions, on=['user_id', 'session_30_raw'])
    dataset['reward'] = dataset['cum_session_time_raw']
    return dataset



def setup_data_at_window(df, window):
    logger.info(f'Convolution over {window} minute window')
    df = df.sort_values(by='date_time')
    df['cum_session_event_raw'] = df.groupby(['user_id', 'session_30_raw'])['cum_platform_event_raw'].cumcount() + 1
    df = df.sort_values(by='date_time').reset_index(drop=True)
    df = convolve_delta_events(df, window)
    
    df['year'] = df['date_time'].dt.year
    df['month'] = df['date_time'].dt.month
    df['day'] = df['date_time'].dt.day
    df['hour'] = df['date_time'].dt.hour
    df['minute'] = df['date_time'].dt.minute
    df['second'] = df['date_time'].dt.second
    
    df['date_time'] = gpu_pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'second']])
    logger.info(f'Convolving over {window} minute window complete: generating metadata')
    df = df.sort_values(by='date_time')
    df = generate_metadata(df) 
    logger.info(f'Generating metadata complete')
    return df

def get_dataset(read_path, conv_path, n_files, window):
    
    conv_path, read_path = (
        os.path.join(conv_path, f'files_used_{n_files}'),
        os.path.join(read_path, f'files_used_{n_files}', 'predicted_data.parquet')
    )

    if not os.path.exists(conv_path):
        logger.info(f'Creating directory {conv_path}')
        os.makedirs(conv_path)
    
    logger.info(f'Convolutional dataset not found at {conv_path}: creating')
    logger.info(f'Getting dataset from {read_path}')
    df = gpu_pd.read_parquet(read_path)
    
        
    df['date_time'] = gpu_pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'second']])
    df = df.sort_values(by='date_time')
    subsets = [
        { "name": "train", "start": 0, "end": int(df.shape[0] * .7) },
        { "name": "eval", "start": int(df.shape[0] * .7), "end": int(df.shape[0]) }
    ]

    
    logger.info(f'Initial shape: {df.shape}')
    
    for s in subsets:
        logger.info(f'Running conv on {s["name"]} subset')
        subset_df = df[s['start']:s['end']].copy()
        subset_df = subset_df.sort_values(by='date_time')
        subset_df = setup_data_at_window(subset_df, window)
        updated_conv_path = os.path.join(conv_path, f'window_{window}_{s["name"]}.parquet')
        logger.info(f'Saving convolutional dataset to {updated_conv_path}')
        subset_df.to_parquet(updated_conv_path)


In [4]:
class Arguments:
    read_path = 'rl_ready_data'
    conv_path = 'rl_ready_data_conv'
    n_files = 30
    window = 2

In [5]:
get_dataset(Arguments.read_path, Arguments.conv_path, Arguments.n_files, Arguments.window)


2023-05-29 23:04:39,427 - rl_results_eval - INFO - Convolutional dataset not found at rl_ready_data_conv/files_used_30: creating
2023-05-29 23:04:39,428 - rl_results_eval - INFO - Getting dataset from rl_ready_data/files_used_30/predicted_data.parquet
2023-05-29 23:04:43,055 - rl_results_eval - INFO - Initial shape: (38500990, 41)
2023-05-29 23:04:43,056 - rl_results_eval - INFO - Running conv on train subset
2023-05-29 23:04:43,171 - rl_results_eval - INFO - Convolution over 2 minute window
2023-05-29 23:04:43,391 - numba.cuda.cudadrv.driver - INFO - init
2023-05-29 23:04:46,881 - rl_results_eval - INFO - Convolution over delta last event
2023-05-29 23:05:20,829 - rl_results_eval - INFO - Convolution complete: resampling at 2 minute intervals
2023-05-29 23:07:14,054 - rl_results_eval - INFO - Convolution complete: (26950693, 42) -> (320104, 43)
2023-05-29 23:07:14,166 - rl_results_eval - INFO - Convolving over 2 minute window complete: generating metadata
2023-05-29 23:07:14,294 - rl_

In [5]:
df = gpu_pd.read_parquet('rl_ready_data_conv/files_used_30/window_2_train.parquet')

In [14]:
subset = df[df['user_id'] == 2232].sort_values(by=['session_30_raw', 'cum_session_event_raw'])[['date_time', 'session_30_raw', 'cum_platform_event_raw', 'cum_session_time_raw', 'cum_session_event_raw', 'reward', 'session_size', 'session_minutes', 'sim_minutes', 'sim_size']]

In [15]:
subset.head(100)

Unnamed: 0,date_time,session_30_raw,cum_platform_event_raw,cum_session_time_raw,cum_session_event_raw,reward,session_size,session_minutes,sim_minutes,sim_size
34803,2020-12-10 08:55:44,1.0,3.0,1.099609,3,1.099609,14,37.25,32.9375,12.0
41457,2020-12-10 08:57:53,1.0,6.0,3.267578,6,3.267578,14,37.25,32.9375,12.0
9384,2020-12-10 09:04:03,1.0,8.0,9.429688,8,9.429688,14,37.25,32.9375,12.0
12611,2020-12-10 09:25:17,1.0,9.0,30.65625,9,30.65625,14,37.25,32.9375,12.0
28588,2020-12-10 09:27:33,1.0,12.0,32.9375,12,32.9375,14,37.25,32.9375,12.0
13694,2020-12-10 09:31:52,1.0,14.0,37.25,14,37.25,14,37.25,32.9375,12.0
31214,2020-12-10 10:44:24,2.0,15.0,0.0,1,0.0,1,0.0,0.0,1.0
3352,2020-12-10 11:36:57,3.0,17.0,0.783203,2,0.783203,19,36.25,29.34375,17.0
33871,2020-12-10 11:49:48,3.0,20.0,13.617188,5,13.617188,19,36.25,29.34375,17.0
16471,2020-12-10 11:53:54,3.0,23.0,17.71875,8,17.71875,19,36.25,29.34375,17.0
