In [1]:
# %load rl_constant
# Column name(s) placed first in ALL_COLS; presumably the prediction target
# (inferred from the name — confirm against rl_constant).
LABEL = ['continue_work_session_30_minutes']

# Non-feature columns carried alongside the features. `user_id` and
# `session_30_raw` are the grouping keys used by the functions in this notebook;
# `date_time` is the event timestamp they sort and resample on.
METADATA = [
    'user_id', 'session_30_raw',
    'cum_platform_event_raw', 'cum_platform_time_raw', 'cum_session_time_raw',
    'global_events_user', 'global_session_time',
    'date_time',
]

# Feature columns, kept in their original order. The blank-line groups below
# mirror the grouping of the original list; what each group means is not stated
# anywhere in this notebook (presumably context / session / platform / history
# features — confirm against rl_constant).
OUT_FEATURE_COLUMNS = [
    'country_count',
    'date_hour_sin', 'date_hour_cos',
    'date_minute_sin', 'date_minute_cos',

    'session_30_count', 'session_5_count',
    'cum_session_event_count',
    'delta_last_event',
    'cum_session_time',

    'expanding_click_average',
    'cum_platform_time', 'cum_platform_events',
    'cum_projects',
    'average_event_time',

    'rolling_session_time', 'rolling_session_events', 'rolling_session_gap',
    'previous_session_time', 'previous_session_events',
]


# Additional per-row / per-session columns written next to the features.
# `session_size`, `sim_size`, `session_minutes`, `sim_minutes` and `reward` are
# produced by generate_metadata; `cum_session_event_raw` by setup_data_at_window.
# Each name appears exactly once so the list can be used directly as a column
# selector without producing duplicate columns. Some names also appear in
# METADATA; that overlap is left in place.
RL_STAT_COLS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'sim_minutes',
    'cum_session_event_raw',
    'cum_platform_time_raw',
    'reward',
    'session_30_raw',
    'global_session_time',
]


In [2]:
import os
import pandas as pd
import logging

# Configure root logging for the notebook and create the module-level logger
# used by every function below. A `global` declaration is not needed (and has
# no effect) at module scope, so the name is simply assigned.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger('rl_results_eval')
from functools import reduce
from pprint import pformat
import cudf as gpu_pd

# Concatenation of the four column lists, in this order. Names shared between
# the lists are repeated here; get_dataset de-duplicates before selecting.
ALL_COLS = [*LABEL, *METADATA, *OUT_FEATURE_COLUMNS, *RL_STAT_COLS]

def convolve_delta_events(df, window):
    """Smooth ``delta_last_event`` over a time window, then thin the rows.

    Steps:
      1. Per ``(user_id, session_30_raw)`` group, compute a time-based rolling
         mean of ``delta_last_event`` over ``window`` minutes (rows containing
         any null are excluded from this computation only). The result is
         stored in ``convolved_delta_event`` and copied over
         ``delta_last_event``.
      2. Convert the frame to pandas and, per user, take the maximum
         ``cum_platform_event_raw`` inside every ``window``-minute bucket.
      3. Keep only rows whose ``(user_id, cum_platform_event_raw)`` pair equals
         one of those per-bucket maxima.

    Args:
        df: frame exposing ``to_pandas()`` (a cuDF DataFrame in this notebook)
            with columns ``user_id``, ``session_30_raw``, ``date_time``,
            ``delta_last_event`` and ``cum_platform_event_raw``. It is mutated:
            ``convolved_delta_event`` is added and ``delta_last_event`` is
            overwritten.
        window: window / bucket length in minutes.

    Returns:
        A ``gpu_pd.DataFrame`` holding the retained rows, with ``user_id`` and
        ``cum_platform_event_raw`` as the leading columns.

    NOTE(review): the rolled series gets a fresh 0..n-1 index from
    ``reset_index`` and is then assigned to ``df``. Presumably that assignment
    aligns on index labels (as in pandas), which puts values on the right rows
    only if ``df`` already has a default index in the same
    (user, session, time) order as the rolling output; rows removed by
    ``dropna()`` would additionally shift the positions. Confirm against the
    upstream row ordering before relying on the smoothed values.
    """
    before_resample = df.shape
    session_keys = ['user_id', 'session_30_raw']

    # 'min' is the non-deprecated spelling of the minute offset alias and
    # matches the resample call further down.
    rolled = (
        df.dropna()
        .sort_values(by='date_time')
        .set_index('date_time')
        .groupby(by=session_keys, group_keys=False)
        .rolling(f'{window}min', min_periods=1)['delta_last_event']
        .mean()
        .reset_index(name='convolved_event_delta')['convolved_event_delta']
    )
    df['convolved_delta_event'] = rolled
    df['delta_last_event'] = df['convolved_delta_event']

    # The resample / MultiIndex membership steps below run on the pandas copy.
    df = df.to_pandas()

    logger.info(f'Convolution complete: resampling with max at {window} minute intervals')
    # Series indexed by (user_id, date_time); reset_index turns both levels
    # into columns, dropna removes buckets that contained no events.
    sampled_events = (
        df.sort_values(by=['date_time'])
        .set_index('date_time')
        .groupby(['user_id'])
        .resample(f'{window}min')['cum_platform_event_raw']
        .max()
        .reset_index()
        .dropna()
    )

    logger.info(f'Resampling complete: {before_resample} -> {sampled_events.shape}')
    logger.info(f'Joining back to original df')

    # Semi-join on (user_id, cum_platform_event_raw): keep the original rows
    # whose key pair is one of the per-bucket maxima.
    join_keys = ['user_id', 'cum_platform_event_raw']
    sampled_events = sampled_events.set_index(join_keys)
    df = df.set_index(join_keys)

    resampled_df = df[df.index.isin(sampled_events.index)].reset_index()
    resampled_df = gpu_pd.DataFrame(resampled_df)

    logger.info(f'Joining complete: {resampled_df.shape}')
    return resampled_df
    


def generate_metadata(dataset):
    """Attach per-session summary columns and a ``reward`` column.

    For every ``(user_id, session_30_raw)`` group four values are computed and
    merged back onto each row of that group:

      * ``session_size``    - max of ``size_of_session``
      * ``session_minutes`` - max of ``cum_session_time_raw``
      * ``sim_minutes``     - 0.7 quantile (nearest) of ``cum_session_time_raw``
      * ``sim_size``        - 0.7 quantile (nearest) of ``cum_session_event_raw``

    ``reward`` is then set to a copy of ``cum_session_time_raw``.

    Args:
        dataset: frame containing the columns named above.

    Returns:
        The merged frame with the five additional columns.
    """
    keys = ['user_id', 'session_30_raw']

    def per_session(column):
        # Fresh groupby per statistic, exactly one column selected.
        return dataset.groupby(keys)[column]

    summaries = [
        per_session('size_of_session').max().reset_index(name='session_size'),
        per_session('cum_session_time_raw').max().reset_index(name='session_minutes'),
        per_session('cum_session_time_raw')
        .quantile(.7, interpolation='nearest')
        .reset_index(name='sim_minutes'),
        per_session('cum_session_event_raw')
        .quantile(.7, interpolation='nearest')
        .reset_index(name='sim_size'),
    ]

    # Fold the four summaries into one frame keyed by (user, session).
    combined = summaries[0]
    for summary in summaries[1:]:
        combined = gpu_pd.merge(combined, summary, on=keys)

    dataset = gpu_pd.merge(dataset, combined, on=keys)
    dataset['reward'] = dataset['cum_session_time_raw']
    return dataset



def setup_data_at_window(df, window):
    """Prepare one subset: timestamps, session counters, smoothing, metadata.

    Adds ``date_time`` (built from the year..second columns, unparseable rows
    coerced), ``size_of_session`` (row count per user/session) and
    ``cum_session_event_raw`` (1-based running row number within each
    user/session), then delegates to ``convolve_delta_events`` and
    ``generate_metadata``.

    Args:
        df: frame with ``user_id``, ``session_30_raw`` and the six date-part
            columns; ``date_time`` is written onto it in place.
        window: window length in minutes, forwarded to the convolution step.

    Returns:
        The frame returned by ``generate_metadata``.
    """
    session_keys = ['user_id', 'session_30_raw']
    date_parts = ['year', 'month', 'day', 'hour', 'minute', 'second']

    df['date_time'] = gpu_pd.to_datetime(df[date_parts], errors='coerce')

    session_sizes = df.groupby(session_keys).size().reset_index(name='size_of_session')
    df = gpu_pd.merge(df, session_sizes, on=session_keys)

    # NOTE(review): cumcount numbers rows in their current order; nothing here
    # sorts by date_time first, so this presumably relies on the merged frame
    # already being in chronological order within each session — confirm.
    running_index = df.groupby(session_keys)['date_time'].cumcount()
    df['cum_session_event_raw'] = running_index + 1

    logger.info(f'Convolution over {window} minute window')
    convolved = convolve_delta_events(df, window)
    logger.info(f'Convolving over {window} minute window complete: generating metadata')
    with_metadata = generate_metadata(convolved)
    logger.info('Generating metadata complete')
    return with_metadata

def get_dataset(read_path, conv_path, n_files, window):
    """Build and save the windowed train / eval parquet files.

    Reads ``<read_path>/files_used_<n_files>/predicted_data.parquet``, splits
    the rows by position (first 70% -> ``train``, remainder -> ``eval``), runs
    ``setup_data_at_window`` on each part and writes
    ``<conv_path>/files_used_<n_files>/window_<window>_<name>.parquet``.

    The written columns are the unique names in ``ALL_COLS`` (first-seen order,
    so the file layout is the same on every run) followed by every input column
    whose name contains ``'seq'``.

    Args:
        read_path: root directory of the input data.
        conv_path: root directory for the output; created if missing.
        n_files: value used in the ``files_used_<n_files>`` directory name.
        window: window length in minutes, forwarded to ``setup_data_at_window``.

    Returns:
        None. The results are written to disk.
    """
    conv_path, read_path = (
        os.path.join(conv_path, f'files_used_{n_files}'),
        os.path.join(read_path, f'files_used_{n_files}', 'predicted_data.parquet'),
    )

    if not os.path.exists(conv_path):
        logger.info(f'Creating directory {conv_path}')
        os.makedirs(conv_path)

    # NOTE(review): this message is logged unconditionally; there is no check
    # for previously written output, so the dataset is always rebuilt.
    logger.info(f'Convolutional dataset not found at {conv_path}: creating')
    logger.info(f'Getting dataset from {read_path}')
    df = gpu_pd.read_parquet(read_path)

    n_rows = int(df.shape[0])
    split_at = int(n_rows * .7)
    subsets = [
        {"name": "train", "start": 0, "end": split_at},
        {"name": "eval", "start": split_at, "end": n_rows},
    ]

    # Loop-invariant: de-duplicate while preserving order (dict keys keep
    # insertion order) so the column layout does not depend on set iteration.
    output_cols = list(dict.fromkeys(ALL_COLS)) + [col for col in df.columns if 'seq' in col]

    logger.info(f'Initial shape: {df.shape}')

    for s in subsets:
        logger.info(f'Running conv on {s["name"]} subset')
        subset_df = df.iloc[s['start']:s['end']]
        subset_df = setup_data_at_window(subset_df, window)
        updated_conv_path = os.path.join(conv_path, f'window_{window}_{s["name"]}.parquet')
        logger.info(f'Saving convolutional dataset to {updated_conv_path}')

        subset_df = subset_df[output_cols]
        subset_df.to_parquet(updated_conv_path)

In [3]:
class Arguments:
    """Run settings; each attribute is passed to ``get_dataset`` in the next cell."""

    # Output and input root directories (relative paths).
    conv_path = "rl_ready_data_conv"
    read_path = "rl_ready_data"

    # Window length in minutes and the `files_used_<n>` directory suffix.
    window = 2
    n_files = 30

In [4]:
# Build and save the train / eval parquet files for the configured run.
get_dataset(
    read_path=Arguments.read_path,
    conv_path=Arguments.conv_path,
    n_files=Arguments.n_files,
    window=Arguments.window,
)

2023-05-29 11:44:26,056 - rl_results_eval - INFO - Convolutional dataset not found at rl_ready_data_conv/files_used_30: creating
2023-05-29 11:44:26,057 - rl_results_eval - INFO - Getting dataset from rl_ready_data/files_used_30/predicted_data.parquet
2023-05-29 11:44:29,840 - rl_results_eval - INFO - Initial shape: (38500990, 40)
2023-05-29 11:44:29,840 - rl_results_eval - INFO - Running conv on train subset
2023-05-29 11:44:30,088 - rl_results_eval - INFO - Convolution over 2 minute window
2023-05-29 11:44:30,305 - numba.cuda.cudadrv.driver - INFO - init
2023-05-29 11:44:38,087 - rl_results_eval - INFO - Convolution complete: resampling with max at 2 minute intervals
2023-05-29 11:45:04,273 - rl_results_eval - INFO - Resampling complete: (26950693, 43) -> (324029, 3)
2023-05-29 11:45:04,274 - rl_results_eval - INFO - Joining back to original df
2023-05-29 11:45:28,129 - rl_results_eval - INFO - Joining complete: (3173698, 44)
2023-05-29 11:45:28,969 - rl_results_eval - INFO - Convolv

In [6]:
# Install the AWS CLI (used by the `!aws s3 sync` cell) and python-dotenv
# (used by the `%load_ext dotenv` cell).
# NOTE(review): versions are not pinned, and `!pip` uses whichever pip is first
# on PATH, which may not be the kernel's environment — confirm before reuse.
!pip install awscli python-dotenv --quiet

[0m

In [8]:
# Enable the python-dotenv IPython extension, then load variables from the file
# named `env` in the working directory.
# NOTE(review): presumably this supplies the AWS credentials for the sync cell
# below — confirm, and make sure `env` is never committed.
%load_ext dotenv
%dotenv env

In [10]:
# Upload the local `rl_ready_data_conv` directory tree to the S3 bucket path
# below. The bucket name is hard-coded in this cell.
!aws s3 sync rl_ready_data_conv s3://dissertation-data-dmiller/rl_ready_data_conv

upload: rl_ready_data_conv/files_used_10/window_2_eval.parquet to s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_10/window_2_eval.parquet
upload: rl_ready_data_conv/files_used_10/rl_conv_2.parquet to s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_10/rl_conv_2.parquet
upload: rl_ready_data_conv/files_used_2/window_2_eval.parquet to s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_2/window_2_eval.parquet
upload: rl_ready_data_conv/files_used_2/window_2_train.parquet to s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_2/window_2_train.parquet
upload: rl_ready_data_conv/files_used_30/window_2_eval.parquet to s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_30/window_2_eval.parquet
upload: rl_ready_data_conv/files_used_10/window_2_train.parquet to s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_10/window_2_train.parquet
upload: rl_ready_data_conv/files_used_2/rl_conv_2.parquet to s3://dissertation-data-dmiller/rl_r