In [1]:
!pip install --upgrade pip --quiet
!pip install --upgrade awscli python-dotenv --quiet
%load_ext dotenv
%dotenv env



[0m

In [2]:
# !aws s3 sync rl_ready_data_conv s3://dissertation-data-dmiller/rl_ready_data_conv --delete
# !aws s3 sync rl_ready_data s3://dissertation-data-dmiller/rl_ready_data --delete

In [3]:
# ! rm *.csv *.json

In [4]:
# %load rl_constant
# Column-name constants pasted in from the `rl_constant` module.
# Every constant below is a plain list of column names.

# Single-element list; presumably the training label column — not referenced
# elsewhere in this notebook.
LABEL = [
    "continue_work_session_30_minutes"
]

# Not referenced elsewhere in this notebook; presumably bookkeeping columns
# carried alongside the features — TODO confirm against the consumers.
METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "cum_session_event_raw",
    "global_events_user",
    "global_session_time",
    "date_time"
]

# Only used in this notebook to build COLS_FOR_INFLECTION.
# NOTE(review): "cum_session_time" and "cum_projects" each appear twice, so
# selecting a frame with this list yields duplicated columns.
# NOTE(review): "cum_platform_events" (plural) does not match the
# "cum_platform_event" column produced by reset_intra_session / listed in
# RESCALER_COLS — confirm which spelling downstream code expects.
OUT_FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event",
    "cum_session_time",
    "cum_session_time",
   
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    "cum_projects",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]

# Columns that get_dataset copies to `<col>_raw` and then min-max scales
# to the range (-1, 1).
RESCALER_COLS = [
    'session_30_count',
    'session_5_count',
    'cum_session_event',
    'cum_session_time',
    'cum_platform_time',
    'cum_platform_event'
]

# Appended to COLS_FOR_INFLECTION and also loaded via TORCH_LOAD_COLS.
PREDICTION_COLS = [
    'seq_40',
]


# Not referenced elsewhere in this notebook.
GROUPBY_COLS = ['user_id']

# Not referenced elsewhere in this notebook; several of these names
# (session_size, sim_size, session_minutes, sim_minutes, reward) are created
# by generate_metadata_session / reset_intra_session.
RL_STAT_COLS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'sim_minutes',
    'reward',
    'cum_session_time_raw'
]

# Columns get_dataset reads from predicted_data.parquet:
# identifiers + features + prediction/session columns.
TORCH_LOAD_COLS = [
    'user_id',
    'date_time'
] + [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    "expanding_click_average",
    "cum_projects",
    "average_event_time",
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
    "delta_last_event"
] + [
    'seq_40',
    'session_30_raw',
]

In [5]:
import logging
import os
import cudf
import pandas as pd
import numpy as np
from pprint import pprint, pformat
import cudf as gpu_pd

logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


# Identifier columns followed by the feature and prediction column lists.
# NOTE(review): not referenced anywhere else in the visible notebook, and it
# inherits the duplicated names present in OUT_FEATURE_COLUMNS.
COLS_FOR_INFLECTION = [
    'user_id',
    'date_time',
] + OUT_FEATURE_COLUMNS + PREDICTION_COLS

class SessionCalculate:
    """Label an event log with 5-minute and 30-minute session counters.

    An "inflection" is an event whose gap to the same user's next event is
    not shorter than the threshold (5 or 30 minutes), or that has no next
    event at all; it closes a session. Every event is then tagged with the
    per-user ordinal of the session it belongs to (``session_5_count`` /
    ``session_30_count``), the timestamp of its closing event
    (``session_end_time``) and whether that close is less than 30 minutes
    away (``session_terminates_30_minutes``).
    """

    logger = logging.getLogger('rl_results_eval')

    def __init__(self, df, use_gpu) -> None:
        """
        Args:
            df: event frame with at least ``user_id`` and ``date_time``
                columns. Must be a cuDF frame when ``use_gpu`` is true and a
                pandas frame otherwise. It is expected to be ordered by
                ``date_time`` within each user, because the next event is
                found with a positional ``shift(-1)``.
            use_gpu: whether ``df`` lives on the GPU. Steps without a cuDF
                equivalent (``merge_asof``) round-trip through pandas.
        """
        self.df = df
        self.use_gpu = use_gpu

    @staticmethod
    def _label_with_inflections(events, inflections_5, inflections_30):
        """Attach session counters to every event (pandas frames only).

        A forward ``merge_asof`` on ``row_count`` per ``user_id`` gives each
        event the first inflection at or after it, i.e. the event that
        closes its session. Each inflection frame is merged exactly once so
        the counter columns keep their plain names (no ``_x`` / ``_y``).
        """
        events = events.sort_values(by=['row_count', 'user_id'])
        events = pd.merge_asof(events, inflections_5, on='row_count', by='user_id', direction='forward')
        events = pd.merge_asof(events, inflections_30, on='row_count', by='user_id', direction='forward')
        minutes_to_session_end = (events['session_end_time'] - events['date_time']).dt.total_seconds() / 60
        events['session_terminates_30_minutes'] = minutes_to_session_end < 30
        return events

    def calculate_inflections(self):
        """Compute the session labels and return the labelled frame.

        Returns:
            The labelled frame (cuDF when ``use_gpu`` is true, else pandas),
            ordered by ``row_count`` (global ``date_time`` order). It is also
            stored on ``self.df``.

        Note:
            The frame passed to the constructor gains a ``next_date_time``
            column as a side effect.
        """
        self.logger.info('Calculating subsequent date time')
        self.df['next_date_time'] = self.df.groupby('user_id')['date_time'].shift(-1)
        # drop=True: the previous index carries no information and keeping it
        # would leak 'index' / 'level_0' columns into the output.
        self.df = self.df.drop_duplicates(subset=['user_id', 'date_time'], keep='last').reset_index(drop=True)

        if self.use_gpu:
            self.logger.info('Bringing to CPU for second calculation')
            self.df = self.df.to_pandas()

        # Vectorised timedelta -> seconds; a missing next event stays NaN.
        self.df['diff_seconds'] = (self.df['next_date_time'] - self.df['date_time']).dt.total_seconds()

        self.logger.info('Diff seconds calculated')
        if self.use_gpu:
            self.logger.info('Bringing back to GPU for final calculations')
            self.df = cudf.from_pandas(self.df)

        self.df['diff_minutes'] = (self.df['diff_seconds'] / 60)
        self.df['session_5'] = (self.df['diff_minutes'] < 5)
        self.df['session_30'] = self.df['diff_minutes'] < 30

        # A user's last event has no successor; treat it as closing the session.
        self.df['session_30'] = self.df['session_30'].fillna(False)
        self.df['session_5'] = self.df['session_5'].fillna(False)
        # NOTE(review): no rows are removed here despite what the next two
        # log messages say; the messages are kept unchanged.
        self.logger.info(f'Labels calculated: removing rows with diff seconds > 0')

        self.logger.info(f'Number of rows following drop: {self.df.shape[0]}')
        self.logger.info(f'Sorting rows by date time and applying row count')
        self.df = self.df.sort_values(['date_time']).reset_index(drop=True)
        # row_count is the global chronological position, used as merge key.
        self.df['row_count'] = self.df.index.values
        self.logger.info(f'Sorted rows and applied row count on updated index')
        self.logger.info('Calculating inflection points')
        self.df['user_id'] = self.df['user_id'].astype('int32')

        inflections_5_merge = self.df[self.df['session_5'] == False].sort_values(by=['date_time'])
        inflections_30_merge = self.df[self.df['session_30'] == False].sort_values(by=['date_time'])

        self.logger.info('Calculating session 5 inflections')
        inflections_5_merge['session_5_count'] = inflections_5_merge.groupby('user_id').cumcount() + 1

        self.logger.info('Calculating session 30 inflections')
        inflections_30_merge['session_30_count'] = inflections_30_merge.groupby('user_id').cumcount() + 1

        inflections_5_merge = (
            inflections_5_merge[['user_id', 'row_count', 'session_5_count']]
            .sort_values(by=['row_count', 'user_id'])
        )
        inflections_30_merge = (
            inflections_30_merge[['user_id', 'date_time', 'row_count', 'session_30_count']]
            .sort_values(by=['row_count', 'user_id'])
            .rename(columns={'date_time': 'session_end_time'})
        )

        if self.use_gpu:
            # merge_asof is pandas-only, so labelling happens on the CPU.
            self.logger.info('Bringing to CPU for labelling')
            labelled = self._label_with_inflections(
                self.df.to_pandas(),
                inflections_5_merge.to_pandas(),
                inflections_30_merge.to_pandas(),
            )
            self.df = cudf.from_pandas(labelled)
        else:
            self.logger.info('Labelling on CPU')
            self.df = self._label_with_inflections(self.df, inflections_5_merge, inflections_30_merge)

        self.logger.info('Inflections calculated')

        # NOTE(review): this counts rows whose session does NOT end within
        # 30 minutes, while the message reads as the opposite — confirm intent.
        total_rows = self.df.shape[0]
        session_end_30_minutes = self.df[self.df['session_terminates_30_minutes'] == False].shape[0]
        ratio = session_end_30_minutes / total_rows if total_rows else float('nan')
        self.logger.info(f'Percent sessions end in 30 minutes: {ratio}')
        self.logger.info(f'Columns for df')
        self.logger.info(pformat(self.df.columns))

        return self.df
        

In [6]:
import os
import pandas as pd
import logging

from tqdm import tqdm
import numpy as np
# `global` at module scope has no effect; `logger` is already a module-level name.
global logger
# basicConfig only configures the root logger if it has no handlers yet, so
# after the identical call in the previous cell this one changes nothing.
logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Same logger name as SessionCalculate.logger, so both write to one stream.
logger = logging.getLogger('rl_results_eval')
from functools import reduce
from pprint import pformat
import cudf as gpu_pd
import cupy as gpu_np
# Notebook display limits for pandas frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
# Makes pandas treat +/-inf as missing. This is a pandas option; whether cuDF
# honours it is not shown here.
# NOTE(review): the option is deprecated in recent pandas releases — confirm
# against the pinned pandas version.
pd.set_option('mode.use_inf_as_na', True)
from cuml.preprocessing import MinMaxScaler

def convolve_delta_events(df, window, write_path):
    """Smooth ``delta_last_event`` per session, resample and re-sessionise.

    Steps:
      1. Replace ``delta_last_event`` with its trailing ``window``-minute
         rolling mean, computed inside each (``user_id``, ``session_30_raw``)
         group.
      2. Keep only the last event per user per calendar minute.
      3. Recompute session inflections on the resampled events with
         ``SessionCalculate``.

    Args:
        df: cuDF frame containing at least ``user_id``, ``date_time``,
            ``session_30_raw``, ``cum_session_event_raw`` and
            ``delta_last_event``.
        window: rolling window length in minutes.
        write_path: path prefix; ``_convolve.parquet`` and
            ``_session.parquet`` are appended for the two files written.

    Returns:
        The frame returned by ``SessionCalculate.calculate_inflections`` for
        the resampled events (a cuDF frame, since ``use_gpu=True``).
    """
    # Fresh 0..n-1 index so the rolling result can be matched back to rows
    # by label; the index is discarded again further down.
    df = df.to_pandas().reset_index(drop=True)

    df = df.replace([np.inf, -np.inf], np.nan)
    before_resample = df.shape
    logger.info(f'Convolutional shape before resample: {before_resample}')
    logger.info(f'Convolution over delta last event')

    # The grouped rolling result comes back in group order with an index of
    # (user_id, session_30_raw, original row label). Re-index it by the
    # original row label so the assignment aligns each smoothed value with
    # the row it was computed for; assigning a freshly re-numbered result
    # would scatter values across unrelated rows.
    rolled = (
        df.sort_values(by=['session_30_raw', 'cum_session_event_raw'])
        .groupby(by=['user_id', 'session_30_raw'])
        .rolling(f'{window}min', on='date_time', min_periods=1)['delta_last_event']
        .mean()
    )
    df['delta_last_event'] = pd.Series(
        rolled.to_numpy(),
        index=rolled.index.get_level_values(-1),
    )

    df = df.drop(columns=['session_30_raw'])
    # remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()].reset_index(drop=True)

    df = gpu_pd.from_pandas(df)

    # Calendar parts used as the per-minute resampling key.
    df['year'] = df['date_time'].dt.year
    df['month'] = df['date_time'].dt.month
    df['day'] = df['date_time'].dt.day
    df['hour'] = df['date_time'].dt.hour
    df['minute'] = df['date_time'].dt.minute
    df['second'] = df['date_time'].dt.second

    df['user_id'] = df['user_id'].astype('int32')

    # Keep the last event per user per minute, then restore time order.
    # reset_index() without drop keeps the previous index as an 'index'
    # column in the written parquet, exactly as before.
    resampled_df = df.sort_values(by='date_time') \
        .drop_duplicates(subset=['user_id', 'year', 'month', 'day', 'hour', 'minute'], keep='last') \
        .sort_values(by=['date_time']) \
        .reset_index()

    logger.info(f'Convolution complete: {before_resample} -> {resampled_df.shape}')
    logger.info(f'Writing intermediate results to {write_path}_convolve.parquet')

    resampled_df.to_parquet(write_path + '_convolve.parquet')

    logger.info(f'Recalculating inflections')
    resample_events = SessionCalculate(resampled_df, use_gpu=True)
    resampled_event_out = resample_events.calculate_inflections()

    logger.info(f'Events sessionized: writing to {write_path}_session.parquet')
    resampled_event_out.to_parquet(write_path + '_session.parquet')
    logger.info(f'Events resampled')
    # Sort the column names, not the characters of their formatted repr.
    logger.info(pformat(sorted(resampled_event_out.columns)))

    return resampled_event_out
     
     

def generate_metadata_session(dataset):
    """Append per-session summary columns to every row of ``dataset``.

    For each (``user_id``, ``session_30_count``) group four values are
    computed and joined back onto the rows of that group:

    * ``session_size``    - max of ``cum_session_event``
    * ``session_minutes`` - max of ``cum_session_time``
    * ``sim_minutes``     - 0.7 quantile (nearest) of ``cum_session_time``
    * ``sim_size``        - 0.7 quantile (nearest) of ``cum_session_event``

    Args:
        dataset: pandas frame holding the columns named above.

    Returns:
        ``dataset`` inner-merged with the four summary columns.
    """
    session_keys = ['user_id', 'session_30_count']
    per_session = dataset.groupby(session_keys)

    logger.info('Calculating session size and minutes')
    size_max = per_session['cum_session_event'].max().reset_index(name='session_size')
    minutes_max = per_session['cum_session_time'].max().reset_index(name='session_minutes')

    logger.info('Calculating sim size and minutes')
    minutes_q70 = (
        per_session['cum_session_time']
        .quantile(.7, interpolation='nearest')
        .reset_index(name='sim_minutes')
    )
    size_q70 = (
        per_session['cum_session_event']
        .quantile(.7, interpolation='nearest')
        .reset_index(name='sim_size')
    )

    logger.info('Merging metadata')
    # Fold the four summaries into one frame, left to right.
    summary = size_max
    for extra in (minutes_max, minutes_q70, size_q70):
        summary = pd.merge(summary, extra, on=session_keys)

    logger.info('Merging metadata complete')
    return pd.merge(dataset, summary, on=session_keys)



def setup_data_at_window(df, window, write_path):
    """Number events within each raw session, then run the convolution step.

    Args:
        df: event frame with ``user_id``, ``session_30_raw`` and ``date_time``.
        window: rolling window in minutes, forwarded to
            ``convolve_delta_events``.
        write_path: output path prefix, forwarded to
            ``convolve_delta_events``.

    Returns:
        Whatever ``convolve_delta_events`` returns for the prepared frame.
    """
    logger.info(f'Convolution over {window} minute window')

    chronological = df.sort_values(by='date_time')
    # 1-based position of each event inside its (user, raw session) group.
    position_in_session = chronological.groupby(['user_id', 'session_30_raw'])['date_time'].cumcount()
    chronological['cum_session_event_raw'] = position_in_session + 1
    chronological = chronological.sort_values(by='date_time').reset_index(drop=True)

    convolved = convolve_delta_events(chronological, window, write_path)

    logger.info(f'Convolving over {window} minute window complete: generating metadata')
    logger.info('Generating metadata complete')
    return convolved


def reset_intra_session(subset_df):
    """Recompute the per-session and per-user cumulative counters.

    Columns written:

    * ``cum_session_event``  - 1-based event position inside each
      (``user_id``, ``session_30_count``) group.
    * ``reward``             - minutes since the previous event in the same
      session; 0 for the first event of a session.
    * ``cum_session_time``   - running sum of ``reward`` within the session.
    * ``cum_platform_time``  - running sum of ``reward`` per user.
    * ``cum_platform_event`` - 0-based event position per user.

    Args:
        subset_df: cuDF frame with ``user_id``, ``session_30_count`` and
            ``date_time``, already in the row order the counters should
            follow.

    Returns:
        A cuDF frame with the columns above added or overwritten.
    """
    session_keys = ['user_id', 'session_30_count']

    # NOTE(review): nothing is dropped here despite the message.
    logger.info(f'Dropping sessions with less than one event')

    logger.info(f'Resetting cum_session_event_count')
    subset_df['cum_session_event'] = subset_df.groupby(session_keys)['date_time'].cumcount() + 1
    logger.info(f'Resetting cum_session_time and setting reward')
    subset_df = subset_df.to_pandas()

    minutes_since_previous = (
        subset_df.groupby(session_keys)['date_time'].diff().dt.total_seconds().fillna(0) / 60
    )
    # Zero the reward on the first event of every session. Vectorised: a
    # row-wise DataFrame.apply here runs a Python call per row.
    subset_df['reward'] = minutes_since_previous.where(subset_df['cum_session_event'] > 1, 0)
    subset_df['cum_session_time'] = subset_df.groupby(session_keys)['reward'].cumsum()

    logger.info(f'Resetting cum_platform_time and cum_platform_events')
    subset_df['cum_platform_time'] = subset_df.groupby(['user_id'])['reward'].cumsum()
    # NOTE(review): cumcount() is 0-based here while cum_session_event above
    # is 1-based — confirm whether the offset is intended.
    subset_df['cum_platform_event'] = subset_df.groupby(['user_id'])['cum_session_event'].cumcount()

    subset_df = gpu_pd.from_pandas(subset_df)

    return subset_df

def _log_monotonic_check(frame, group_cols, value_col, failure_label, success_message):
    """Log whether ``value_col`` is non-decreasing inside every group of ``frame``."""
    flags = (
        frame.groupby(group_cols)[value_col]
        .is_monotonic_increasing
        .reset_index(name='is_monotic_increasing')
    )
    violations = flags[flags['is_monotic_increasing'] == False]
    if violations.shape[0] > 0:
        logger.info(
            f'Non monotonic increasing {failure_label} found: perc {violations.shape[0] / flags.shape[0]}'
        )
        logger.info(violations)
    else:
        logger.info(success_message)


def get_dataset(read_path, conv_path, n_files, window):
    """Build the windowed train/eval datasets and write them as parquet.

    Reads ``<read_path>/files_used_<n_files>/predicted_data.parquet``, drops
    rows with missing values, splits the time-ordered rows 70/30 into
    ``train`` and ``eval`` and, for each subset: runs the convolution /
    re-sessionising step, recomputes the intra-session counters, attaches
    per-session metadata, logs monotonicity checks, min-max scales
    ``RESCALER_COLS`` to (-1, 1) (keeping ``<col>_raw`` copies) and writes
    the result under ``<conv_path>/files_used_<n_files>/``.

    Args:
        read_path: root directory of the input data.
        conv_path: root directory for the output data (created if missing).
        n_files: selects the ``files_used_<n_files>`` sub-directory.
        window: rolling window in minutes for the convolution step.

    Returns:
        None. All results are written to disk.
    """
    conv_path, read_path = (
        os.path.join(conv_path, f'files_used_{n_files}'),
        os.path.join(read_path, f'files_used_{n_files}', 'predicted_data.parquet')
    )

    if not os.path.exists(conv_path):
        logger.info(f'Creating directory {conv_path}')
    # exist_ok avoids a failure if the directory appears between check and create.
    os.makedirs(conv_path, exist_ok=True)

    # NOTE(review): there is no cache lookup; the dataset is always rebuilt.
    logger.info(f'Convolutional dataset not found at {conv_path}: creating')
    logger.info(f'Getting dataset from {read_path}')
    df = gpu_pd.read_parquet(read_path, columns=TORCH_LOAD_COLS)

    df['date_time'] = gpu_pd.to_datetime(df['date_time'])
    logger.info(f'Non nan values: {df.count().min()}: dropping na')

    df = df.sort_values(by='date_time')

    # NOTE(review): dropna() is what runs here; whether it also removes inf
    # values on a cuDF frame is not established by this code.
    logger.info(f'Initial shape: {df.shape}: dropping na and inf')
    df = df.dropna()
    logger.info(f'NA values dropped: {df.count().min()}')
    logger.info(f'Final shape: {df.shape}: dropping na and inf')

    # Split boundaries are derived from the cleaned frame so the 70/30 split
    # refers to the rows that are actually sliced below.
    n_rows = int(df.shape[0])
    split_at = int(n_rows * .7)
    subsets = [
        {"name": "train", "start": 0, "end": split_at},
        {"name": "eval", "start": split_at, "end": n_rows},
    ]

    for s in subsets:

        base_conv_path = os.path.join(conv_path, f'window_{window}_{s["name"]}')
        updated_conv_path = os.path.join(conv_path, f"window_{window}_{s['name']}_final.parquet")
        logger.info(f'Running conv on {s["name"]} subset')
        subset_df = df[s['start']:s['end']].copy()
        subset_df = subset_df.sort_values(by='date_time')
        logger.info(f'Subset setup complete: {subset_df.shape}')
        subset_df = setup_data_at_window(subset_df, window, base_conv_path)
        logger.info(f'Subset convolution complete: {subset_df.shape}, resetting stats')
        subset_df = subset_df.sort_values(by='date_time')
        subset_df = reset_intra_session(subset_df)
        logger.info(f'Intra session stats calculated: {subset_df.shape}, saving intermediate')
        # NOTE(review): this yields '..._final.parquet_intra_session.parquet';
        # the name is kept as-is because other code may read that path.
        subset_df.to_parquet(updated_conv_path + '_intra_session.parquet')
        logger.info(f'Intra session reset complete: {subset_df.shape}')
        logger.info(f'Stats reset complete, resetting metadata')
        subset_df = subset_df.to_pandas()
        subset_df = generate_metadata_session(subset_df)
        logger.info(f'Metadata reset complete: {subset_df.shape}')
        logger.info(f'Saving convolutional dataset to {updated_conv_path}')

        # Rounded once and reused by both checks; rounding absorbs float
        # noise in the cumulative sums.
        rounded = subset_df.round(3)
        _log_monotonic_check(
            rounded,
            ['user_id', 'session_30_count'],
            'cum_session_time',
            'reward',
            'All rewards are monotonic increasing and no errors',
        )
        _log_monotonic_check(
            rounded,
            ['user_id'],
            'date_time',
            'date time',
            'All date times are monotonic increasing and no errors',
        )

        logger.info(f'Rescaling feature cols: {RESCALER_COLS}')

        # Keep the unscaled values next to the scaled ones.
        for col in RESCALER_COLS:
            subset_df[f'{col}_raw'] = subset_df[col]
        subset_df[RESCALER_COLS] = MinMaxScaler(feature_range=(-1, 1)).fit_transform(subset_df[RESCALER_COLS])

        logger.info(f'Writing to disk: {subset_df.shape}')
        subset_df.to_parquet(updated_conv_path)


In [7]:
class Arguments:
    """Run settings handed to ``get_dataset`` as class attributes."""

    # Input root and output root; get_dataset appends files_used_<n_files>.
    read_path, conv_path = "rl_ready_data", "rl_ready_data_conv"
    # Selects the files_used_<n_files> sub-directory.
    n_files = 30
    # Rolling window length in minutes.
    window = 1

In [8]:
get_dataset(Arguments.read_path, Arguments.conv_path, Arguments.n_files, Arguments.window)

2023-06-03 10:39:55,145 - rl_results_eval - INFO - Convolutional dataset not found at rl_ready_data_conv/files_used_30: creating
2023-06-03 10:39:55,146 - rl_results_eval - INFO - Getting dataset from rl_ready_data/files_used_30/predicted_data.parquet
2023-06-03 10:39:58,210 - rl_results_eval - INFO - Non nan values: 36241442: 3ropping na
2023-06-03 10:39:58,212 - rl_results_eval - INFO - NA values dropped: 36241442
2023-06-03 10:39:58,335 - rl_results_eval - INFO - Initial shape: (36241442, 18): dropping na and inf
2023-06-03 10:39:58,431 - rl_results_eval - INFO - Final shape: (36241442, 18): dropping na and inf
2023-06-03 10:39:58,431 - rl_results_eval - INFO - Running conv on train subset
2023-06-03 10:39:58,550 - rl_results_eval - INFO - Subset setup complete: (25369009, 18)
2023-06-03 10:39:58,551 - rl_results_eval - INFO - Convolution over 1 minute window
2023-06-03 10:40:05,397 - rl_results_eval - INFO - Convolutional shape before resample: (25369009, 19)
2023-06-03 10:40:05,39

In [None]:
!rm -r rl_ready_data/files_used_10

In [None]:
!aws s3 sync rl_ready_data s3://dissertation-data-dmiller/rl_ready_data --delete