In [2]:
import glob
import os
from datetime import datetime

from pprint import pformat, pprint

In [3]:
import pandas as pd

In [4]:
import cupy as np
import cudf as pd
import pandas as cpu_pd


In [5]:
# Install helper packages into the kernel's environment (note: versions are not pinned),
# then load the python-dotenv IPython extension.
# `%dotenv env` presumably reads environment variables from a file named `env` — TODO confirm.
%pip install python-dotenv awscli --quiet
%load_ext dotenv
%dotenv env

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
# !aws s3 sync s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_30/window_1/batched_train rl_ready_data_conv/files_used_30/window_1/batched_train --delete 

In [7]:
# !aws s3 sync s3://dissertation-data-dmiller/labelled_session_count_data labelled_session_count_data --delete

In [8]:
import logging

# Array printing: no scientific notation, 4 decimal places, lines up to 200 chars.
np.set_printoptions(suppress=True, precision=4, linewidth=200)

# CPU pandas display: never truncate rows/columns, wide console, and floats
# rendered with thousands separators and 4 decimals.
cpu_pd.set_option('display.max_columns', None)
cpu_pd.set_option('display.max_rows', None)
cpu_pd.set_option('display.width', 500)
cpu_pd.set_option('display.float_format', '{:20,.4f}'.format)

# Root logger at INFO with timestamp, logger name and level in every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

In [9]:
# Columns requested from the input parquet file when main() loads the data.
INITIAL_LOAD_COLUMNS = [
    'user_id',
    'project_id',
    'date_time',
    'session_5_count',
    'session_count',
    'session_terminates',
    'country',
]


In [10]:
# Subset of identifier and feature columns. Not referenced by any function in
# this part of the notebook — presumably kept for ad-hoc inspection (TODO confirm).
# NOTE(review): 'cum_session_time_minutes' and 'cum_platform_time_minutes' do not
# match the column names the pipeline writes ('cum_session_time', 'cum_platform_time').
DEBUG_COLS = [
    'user_id',
    'date_time',
    'session_count',
    'session_terminates',
    'cum_session_event_count',
    'cum_session_time_minutes',
    'expanding_click_average',
    'cum_platform_time_minutes',
    'cum_projects',
    'rolling_session_time',
    'rolling_session_events',
    'rolling_session_gap',
    'session_event_count',
    'session_time_minutes'
]

In [15]:
def get_logger():
    """Return the logger named after the current module (`__name__`)."""
    return logging.getLogger(__name__)

def encode_counts(df, logger):
    """
    Attach frequency-count columns for country, user and project.

    For each of `country`, `user_id` and `project_id`, the number of rows that
    share the value is computed with `value_counts()` and left-merged back onto
    the frame as `country_count`, `user_count` and `project_count`.

    Args:
        df: frame containing `country`, `user_id` and `project_id` columns.
        logger: logger used for progress messages.

    Returns:
        A new frame with the three count columns appended, in that order.
    """
    def _count_table(frame, key, count_name):
        # The rename covers the case where `reset_index` labels the key column
        # 'index' rather than `key`; otherwise it is a no-op.
        table = frame[key].value_counts().reset_index(name=count_name)
        return table.rename(columns={'index': key})

    logger.info('Encoding country counts')
    by_country = _count_table(df, 'country', 'country_count')

    logger.info('Encoding counts complete: joining users to df')
    df = df.merge(by_country, on='country', how='left')

    logger.info('Encoding user counts')
    by_user = _count_table(df, 'user_id', 'user_count')
    df = df.merge(by_user, on='user_id', how='left')

    logger.info('Encoding project counts')
    by_project = _count_table(df, 'project_id', 'project_count')
    df = df.merge(by_project, on='project_id', how='left')

    return df
   
def time_encodings(df):
    """
    Add hour/minute columns and their cyclical (sine/cosine) encodings.

    `date_time` is parsed to datetimes, `date_hour` and `date_minute` are
    extracted from it, and each is mapped onto the unit circle using a period
    of 24 (hours) and 60 (minutes) respectively.

    The frame is modified in place and also returned.
    """
    df['date_time'] = pd.to_datetime(df['date_time'])

    clock = df['date_time'].dt
    df['date_hour'] = clock.hour
    df['date_minute'] = clock.minute

    for unit, period in (('hour', 24), ('minute', 60)):
        angle = 2 * np.pi * df[f'date_{unit}'] / period
        df[f'date_{unit}_sin'] = np.sin(angle)
        df[f'date_{unit}_cos'] = np.cos(angle)

    return df



def rolling_time_between_events_session(df, logger):
    """
    Add `expanding_click_average`: a rolling mean of `delta_last_event` per session.

    Despite the "expanding" wording in the column name and log messages, the
    code uses a fixed rolling window of 10 rows (`min_periods=1`) inside each
    (`user_id`, `session_count`) group, after sorting by `date_time`.

    Args:
        df: frame with `user_id`, `session_count`, `date_time` and
            `delta_last_event` columns.
        logger: logger used for progress messages.

    Returns:
        The frame indexed by `row_count`, with `expanding_click_average` joined
        on and rows sorted by `date_time`.
    """
    logger.info('Calculating expanding session time averages')
    # Move the current index into a column and tag every row with its position,
    # so the rolling result can be joined back row-for-row below.
    df = df.reset_index()
    df['row_count'] = df.index.values
    # Per-session rolling mean over the last 10 events; `row_count` travels
    # along as the index and is recovered by `reset_index()`.
    expanding_window = df.set_index('row_count') \
        .sort_values(by=['date_time']) \
        .groupby(['user_id', 'session_count']) \
        .rolling(10, min_periods=1)['delta_last_event'].mean() \
        .reset_index().rename(columns={'delta_last_event': 'expanding_click_average'}) \
        .sort_values(by='row_count')
    
    logger.info('Expanding averages calculated: joining to df')
    # Index-to-index join on `row_count` attaches the new column to its source row.
    df = df.set_index('row_count').join(expanding_window[['row_count', 'expanding_click_average']].set_index('row_count'))
    logger.info('Expanding averages joined to df')
    df = df.sort_values(by='date_time')
    return df


def cum_time_in_session(df):
    """
    Add per-event time gaps and cumulative in-session time.

    Expects `user_id`, `session_count`, `date_time` and
    `cum_session_event_count` columns. The per-user `diff()` relies on the rows
    already being in chronological order (the caller in this notebook sorts by
    `date_time` first).

    Adds:
        delta_last_event: seconds since the same user's previous event; 0 for a
            user's first event and for the first event of every session.
        cum_session_time: running sum of `delta_last_event` within each
            (`user_id`, `session_count`) group, converted to minutes.

    Returns:
        A GPU frame (rebuilt via `pd.from_pandas`) with the two columns added.
    """
    print('Calculating cum session time')
    df['delta_last_event'] = df.groupby(['user_id'])['date_time'].diff()

    # The timedelta -> seconds conversion is done on a CPU pandas copy.
    cpu_frame = df.to_pandas()
    cpu_frame['delta_last_event'] = cpu_frame['delta_last_event'].dt.total_seconds().fillna(0)
    print('Cum session time calculated: applying zeroing of first event')
    df = pd.from_pandas(cpu_frame)

    # The gap before the first event of a session belongs to the break between
    # sessions, not to the session itself, so it is zeroed. A boolean mask does
    # this in one vectorized pass instead of a row-wise apply.
    is_session_start = df['cum_session_event_count'] == 1
    df['delta_last_event'] = df['delta_last_event'].where(~is_session_start, 0.0)

    elapsed_seconds = df.groupby(['user_id', 'session_count'])['delta_last_event'].cumsum()
    df['cum_session_time'] = elapsed_seconds / 60
    print('Zero of first event applied: cum session time calculated')
    return df

def intra_session_stats(df, logger):
    """
    Order events, drop duplicate (user, timestamp) rows and add in-session counters.

    Rows are sorted by `date_time` then `user_id`; only the first row for each
    (`user_id`, `date_time`) pair is kept. `cum_session_event_count` is the
    1-based position of the event inside its (`user_id`, `session_count`) group.
    The result is then passed to `cum_time_in_session`, whose return value is
    returned unchanged.
    """
    logger.info('Sorting by date_time and user_id')
    ordered = df.sort_values(by=['date_time', 'user_id'])
    unique_events = ordered.drop_duplicates(subset=['user_id', 'date_time'], keep='first')

    logger.info('Calculating cum_event_count')
    position_in_session = unique_events.groupby(['user_id', 'session_count'])['date_time'].cumcount()
    unique_events['cum_session_event_count'] = position_in_session + 1

    logger.info('Cum_event_count calculated: calculating cum session time (minutes)')
    return cum_time_in_session(unique_events)


def cum_platform_time(df):
    """
    Add `cum_platform_time`: the per-user running sum of `cum_session_time`.

    The frame is modified in place and also returned.

    NOTE(review): `cum_session_time` is itself produced as a running total
    within each session, so this is a running sum of running totals — confirm
    that this is the intended definition of platform time.
    """
    per_user_running_total = df.groupby(['user_id'])['cum_session_time'].cumsum()
    df['cum_platform_time'] = per_user_running_total
    return df
    
    
def running_user_stats(df, logger):
    """
    Add per-user running statistics to the event frame.

    Columns added:
        cum_platform_time: from `cum_platform_time(df)`.
        cum_platform_events: 1-based running event number per user.
        cum_projects: 1 + the running number of times a user's `project_id`
            differs from that user's previous row. This counts project
            switches, so returning to an earlier project increments it again.
        average_event_time: per-user rolling mean of `delta_last_event` over
            the last 1000 rows (`min_periods=1`).

    `project_id` and `user_id` are cast to int along the way.

    Args:
        df: frame with `user_id`, `project_id`, `date_time`,
            `cum_session_time` and `delta_last_event` columns.
        logger: logger used for progress messages.

    Returns:
        The frame indexed by `row_count` with the columns above added.
    """
    logger.info('Calculating cumulative platform time')
    df = df.sort_values(by=['date_time'])
    df = cum_platform_time(df)
    logger.info('Calculating cumulative platform events')
    df['cum_platform_events'] = df.groupby(['user_id']).cumcount() + 1
    logger.info('Calculating running unique projects: shifting projects to find unique')
    
    df['project_id'] = df['project_id'].astype(int)
    df['user_id'] = df['user_id'].astype(int)
    # Previous project of the same user; null on each user's first row.
    df['previous_user_project'] = df.groupby('user_id')['project_id'].shift(1)
    df['previous_project_exists'] = df['previous_user_project'].notna()
    
    # Where no previous project exists, substitute the row's own project so the
    # first row is not counted as a project change.
    # NOTE(review): this row-wise apply looks equivalent to filling nulls with
    # `project_id`; a vectorized fill would avoid the per-row UDF — confirm.
    df['previous_user_project'] = df[['previous_user_project', 'previous_project_exists', 'project_id']].apply(
        lambda x: x['previous_user_project'] if x['previous_project_exists'] else x['project_id'], axis=1)
    logger.info('Calculating running unique projects: calculating unique projects')
    
    df['previous_user_project'] = df['previous_user_project'].astype(int)
    df['project_change'] = df['project_id'] != df['previous_user_project']
    
    # Running count of changes per user, offset by 1 for the starting project.
    df['cum_projects'] = df.groupby('user_id')['project_change'].cumsum() + 1
    
   
    df = df.drop(columns=['previous_user_project', 'previous_project_exists', 'project_change'])
    logger.info('Calculated running unique projects: calculating average event time delta')
    # `reset_index()` without `drop=True` keeps the old index as a column.
    df = df.reset_index()
    # Positional row id used to join the rolling result back row-for-row.
    df['row_count'] = df.index.values
    
    average_event_time = df.set_index('row_count') \
        .sort_values(by=['date_time']) \
        .groupby('user_id') \
        .rolling(1000, min_periods=1)['delta_last_event'].mean() \
        .reset_index().rename(columns={'delta_last_event': 'average_event_time'}) \
        .sort_values(by='row_count')
    df = df.set_index('row_count').join(average_event_time[['row_count', 'average_event_time']].set_index('row_count'))
    logger.info('Calculated average event time delta')
    return df


def time_from_previous_session_minutes(session_inflection_times, logger):
    """
    Compute the gap, in minutes, between each session and the user's previous one.

    Sessions are ordered by `session_count` then `user_id`. For every session
    the previous session's `date_time_max` (same user) is subtracted from this
    session's `date_time_min`; a user's first session gets 0.

    Args:
        session_inflection_times: one row per (user, session) with `user_id`,
            `session_count`, `date_time_min` and `date_time_max`.
        logger: accepted for signature consistency; not used here.

    Returns:
        Frame with `user_id`, `session_count`, `time_between_session_minutes`,
        `date_time_min` and `date_time_max`. The input frame is not modified.
    """
    ordered = session_inflection_times.sort_values(by=['session_count', 'user_id'])

    prior_end = ordered.groupby(['user_id'])['date_time_max'].shift(1)
    ordered['previous_session_end'] = prior_end

    gap_minutes = (ordered['date_time_min'] - ordered['previous_session_end']).dt.total_seconds() / 60
    ordered['time_between_session_minutes'] = gap_minutes
    ordered['time_between_session_minutes'] = ordered['time_between_session_minutes'].fillna(0)

    wanted = ['user_id', 'session_count', 'time_between_session_minutes', 'date_time_min', 'date_time_max']
    return ordered[wanted]

def rolling_average_session_statistics(df, session_inflection_times, logger):
    """
    Join per-session rolling averages onto the event frame.

    Computed per user over sessions ordered by `session_count`:
        rolling_session_time: mean `session_time_minutes` of up to the 10
            previous sessions (`closed='left'`, so the current session is
            excluded); 0 for a user's first session.
        rolling_session_events: same, for `session_event_count`.
        rolling_session_gap: mean `time_between_session_minutes` over up to the
            last 10 sessions, including the current one.

    `session_event_count` and `session_time_minutes` of the current session are
    joined on as well.

    Args:
        df: GPU event frame with `user_id` and `session_count`.
        session_inflection_times: CPU pandas frame, one row per (user, session),
            with `date_time_min`, `date_time_max`, `session_event_count` and
            `session_time_minutes`.
        logger: logger used for progress messages.

    Returns:
        `df` inner-merged with the session statistics on
        (`user_id`, `session_count`).
    """
    def _mean_of_previous_sessions(column, result_name):
        # closed='left' leaves the current session out of its own average; the
        # resulting NaN for a user's first session is replaced by 0.
        rolled = session_inflection_times.sort_values(by=['session_count', 'user_id']) \
            .set_index(['session_count', 'date_time_min', 'date_time_max']) \
            .groupby(['user_id'])[column] \
            .rolling(10, min_periods=1, closed='left') \
            .mean() \
            .reset_index() \
            .rename(columns={column: result_name})
        rolled[result_name] = rolled[result_name].fillna(0)
        return rolled

    logger.info('Session inflection times calculated: calculating expanding session time')
    average_session_minutes = _mean_of_previous_sessions('session_time_minutes', 'rolling_session_time')

    logger.info('Calculating average events per session')
    average_events_session = _mean_of_previous_sessions('session_event_count', 'rolling_session_events')

    logger.info('Calculating time from previous session')
    time_between_session = time_from_previous_session_minutes(session_inflection_times, logger)

    time_between_session = time_between_session.sort_values(by=['session_count', 'user_id']) \
        .set_index(['session_count']) \
        .groupby(['user_id']) \
        ['time_between_session_minutes'] \
        .rolling(10, min_periods=1) \
        .mean() \
        .reset_index() \
        .rename(columns={'time_between_session_minutes': 'rolling_session_gap'})

    logger.info('Joining dataframes')

    # The session-level merges run on CPU pandas; the result is moved to the GPU
    # once, just before the merge with the event frame.
    session_keys = ['user_id', 'session_count']
    session_stats = cpu_pd.merge(average_events_session, average_session_minutes, on=session_keys)
    session_stats = cpu_pd.merge(session_stats, session_inflection_times, on=session_keys)
    session_stats = cpu_pd.merge(session_stats, time_between_session, on=session_keys)
    session_stats = pd.from_pandas(session_stats)

    # Each output column is listed exactly once ('rolling_session_gap' used to
    # be selected twice).
    session_columns = session_keys + [
        'rolling_session_time',
        'rolling_session_events',
        'rolling_session_gap',
        'session_event_count',
        'session_time_minutes',
    ]
    df = pd.merge(df, session_stats[session_columns], on=session_keys)
    logger.info('Dataframes joined::returning')
    return df


def assign_metadata(df, logger):
    """
    Attach per-user totals, calendar parts and `_raw` copies of key columns.

    Adds:
        global_session_time: each user's maximum `cum_platform_time`.
        year, month, day, hour, minute, second: parts of `date_time`.
        *_raw columns: plain copies of the session/platform time and event
            counters, `session_count`, `user_count` and `project_count`.

    Args:
        df: event frame containing the columns listed above.
        logger: logger used for progress messages.

    Returns:
        A new frame (result of a left merge) with the extra columns.
    """
    logger.info('Obtaining global session time and user events')
    per_user_total = (
        df.groupby('user_id')['cum_platform_time']
        .max()
        .reset_index()
        .rename(columns={'cum_platform_time': 'global_session_time'})
    )

    logger.info('Joining session_time to df')
    df = pd.merge(df, per_user_total, on='user_id', how='left')

    logger.info('Calculating time meta [year, month, day, hour, minute, second]')
    for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        df[part] = getattr(df['date_time'].dt, part)

    logger.info('Assigning raw session time and counts, platform time and counts and session count')
    # (new column, source column) — order determines the output column order.
    raw_copies = (
        ('cum_session_time_raw', 'cum_session_time'),
        ('cum_platform_time_raw', 'cum_platform_time'),
        ('cum_session_event_raw', 'cum_session_event_count'),
        ('cum_platform_event_raw', 'cum_platform_events'),
        ('session_count_raw', 'session_count'),
        ('user_count_raw', 'user_count'),
        ('project_count_raw', 'project_count'),
    )
    for target, source in raw_copies:
        df[target] = df[source]

    return df
    
   
def hash_user_id(df):
    user_id = df[['user_id']].drop_duplicates().reset_index().rename(columns={'index': 'user_id_hash'})
    df = pd.merge(df, user_id, on='user_id')
    df = df.drop(columns=['user_id'])
    df = df.rename(columns={'user_id_hash': 'user_id'})
    return df

def hash_project_id(df):
    project_id = df[['project_id']].drop_duplicates().reset_index().rename(columns={'index': 'project_id_hash'})
    df = pd.merge(df, project_id, on='project_id')
    df = df.drop(columns=['project_id'])
    df = df.rename(columns={'project_id_hash': 'project_id'})
    return df


def generate_summary_session_stats(df, logger):
    """
    Summarise every (user, session) pair into one row.

    For each (`user_id`, `session_count`) group the first and last `date_time`
    and the number of events are collected; the session length in minutes is
    the difference between last and first timestamp.

    Returns:
        A CPU pandas frame with columns `user_id`, `session_count`,
        `date_time_min`, `date_time_max`, `session_event_count` and
        `session_time_minutes`.
    """
    logger.info('Generating session statistics')
    per_session = df.groupby(['user_id', 'session_count']).agg({'date_time': ['min', 'max', 'count']})
    per_session = per_session.reset_index()
    # Flatten the two-level column labels produced by the multi-aggregation.
    per_session.columns = ['user_id', 'session_count', 'date_time_min', 'date_time_max', 'session_event_count']

    summary = per_session.to_pandas()
    duration = summary['date_time_max'] - summary['date_time_min']
    summary['session_time_minutes'] = duration.dt.total_seconds() / 60
    return summary

def calculate_immediate_session_stats(df, session_inflections, logger):
    """
    Join the previous session's duration and event count onto every event.

    Sessions are ordered by `session_count` then `user_id`; per user, the
    preceding session's `session_time_minutes` and `session_event_count` are
    shifted onto the current session as `previous_session_time` and
    `previous_session_events`. Missing values (e.g. a user's first session) are
    filled with 0.

    Args:
        df: GPU event frame with `user_id` and `session_count`.
        session_inflections: CPU pandas per-session summary frame.
        logger: logger used for progress messages.

    Returns:
        `df` inner-merged with the two new columns on
        (`user_id`, `session_count`).
    """
    sessions = pd.from_pandas(session_inflections).sort_values(by=['session_count', 'user_id'])

    # (log message, new column, source column)
    shifted_columns = (
        ('Calculating immediate previous session time', 'previous_session_time', 'session_time_minutes'),
        ('Calculate immmediate previous session events', 'previous_session_events', 'session_event_count'),
    )
    for message, target, source in shifted_columns:
        logger.info(message)
        sessions[target] = sessions.groupby(['user_id'])[source].shift(1)

    sessions = sessions.fillna(0)
    logger.info('Joining immediate session stats to df')
    previous_stats = sessions[['user_id', 'session_count', 'previous_session_time', 'previous_session_events']]
    return pd.merge(df, previous_stats, on=['user_id', 'session_count'])
   
def _pretty_print_columns(df):
    for col in df.columns:
        print(f'    "{col}"')
def main(args):
    """
    Run the full feature-engineering pipeline and write the result to parquet.

    Reads `<input_path>/files_used_{n_files}_window_{data_window}.parquet`
    (only `INITIAL_LOAD_COLUMNS`), replaces user/project ids with surrogate
    ids, adds count, time, in-session, per-user and per-session features plus a
    `label` column (True where `session_terminates` is False), and writes the
    frame, sorted by `date_time`, to
    `<output_path>/files_used_{n_files}/calculated_features_window_{data_window}.parquet`.

    Args:
        args: object exposing `input_path`, `output_path`, `n_files` and
            `data_window` attributes (a plain class or an instance both work).
    """
    np.set_printoptions(suppress=True, precision=4)

    logger = get_logger()
    logger.info('Running feature calculation with args')
    # When `args` is a plain class its __dict__ also holds dunder bookkeeping
    # entries; log only the actual settings.
    settings = {key: value for key, value in args.__dict__.items() if not key.startswith('__')}
    logger.info(pformat(settings))

    input_path = os.path.join(
        args.input_path,
        f'files_used_{args.n_files}_window_{args.data_window}.parquet'
    )

    output_path = os.path.join(
        args.output_path,
        f'files_used_{args.n_files}',
        f'calculated_features_window_{args.data_window}.parquet')

    logger.info(f'Input path: {input_path}')
    logger.info(f'Output path: {output_path}')

    logger.info('Loading data from input_path')
    df = pd.read_parquet(input_path, columns=INITIAL_LOAD_COLUMNS)
    df = hash_user_id(df)
    logger.info(f'User id hashed: {df.shape}')
    df = hash_project_id(df)
    logger.info(f'Project id hashed: {df.shape}')

    logger.info(f'Loaded data: shape = {df.shape}, min_date, max_date: {df.date_time.min()}, {df.date_time.max()}')
    # Share of rows flagged as session-terminating; guarded so an empty input
    # does not raise ZeroDivisionError.
    total_rows = df.shape[0]
    terminating_rows = df[df['session_terminates'] == True].shape[0]
    label_count = terminating_rows / total_rows if total_rows else 0.0
    logger.info(f'Perc ending in {args.data_window}: {label_count}')
    df['date_time'] = pd.to_datetime(df['date_time'])
    logger.info('Sorting data by date_time')
    df = df.sort_values(by='date_time')
    logger.info('Finished sorting data: encoding value counts for user project and country')
    df = encode_counts(df, logger)

    logger.info(f'Finished encoding value counts: encoding time features: {df.shape}')
    df = time_encodings(df)

    # NOTE: no categorical-encoding step runs between the next two messages.
    logger.info('Time encodings complete: encoding categorical features')

    logger.info('Categorical features encoded: calculating intra-session stats')
    df = intra_session_stats(df, logger)

    logger.info('Intra session stats calculated: calculating running user stats')
    df = running_user_stats(df, logger)
    logger.info('Beginning rolling window 10 calculation')

    df = rolling_time_between_events_session(df, logger)
    logger.info('Rolling window 10 calculation complete: beginning expanding window calculation')

    session_inflection_times = generate_summary_session_stats(df, logger)
    logger.info('Session inflection times calculated: columns')
    logger.info(pformat(session_inflection_times.columns))
    df = rolling_average_session_statistics(df, session_inflection_times, logger)
    df = calculate_immediate_session_stats(df, session_inflection_times, logger)
    logger.info('Time within session and average session clicks calculated:: calculating time between session')
    df['session_raw'] = df['session_count']

    logger.info('Assigning metadata')
    df = assign_metadata(df, logger)
    logger.info('Metadata assigned: dropping columns')

    logger.info('Returning df to dask for writing to disk')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    logger.info('Final out columns:')
    _pretty_print_columns(df)

    df = df.sort_values(by='date_time').reset_index(drop=True)
    # Vectorized comparison instead of a row-wise apply.
    df['label'] = df['session_terminates'] == False
    df = df.to_pandas()
    logger.info('Brought back to CPU for writing to disk')
    logger.info(f'Writing to: {output_path}')
    df.to_parquet(output_path)

    logger.info('Finished writing to disk')
    

In [16]:
def test_is_monotonic_increasing_glob(df):
    df = df.to_pandas()
    df = df.sort_values(by=['user_id', 'date_time'])
    monotonic_date_time = df.groupby(['user_id'])['date_time'].is_monotonic_increasing.reset_index(name='is_monotonic_increasing')
    if monotonic_date_time['is_monotonic_increasing'].all() == True:
        print('monotonic date time')
    else:
        print(monotonic_date_time[monotonic_date_time['is_monotonic_increasing'] == False])
        return
    monotonic_plat_event = df.groupby(['user_id'])['cum_platform_event_raw'].is_monotonic_increasing.reset_index(name='is_monotonic_increasing')
    if monotonic_plat_event['is_monotonic_increasing'].all() == True:
        print('monotonic plat event')
    else:
        print(monotonic_plat_event[monotonic_plat_event['is_monotonic_increasing'] == False])
        return


    df['cum_platform_time_raw'] = df['cum_platform_time_raw'].round(2)

    monotonic_plat_time = df.groupby(['user_id'])['cum_platform_time_raw'].is_monotonic_increasing.reset_index(name='is_monotonic_increasing')
    if monotonic_plat_time['is_monotonic_increasing'].all() == True:
        print('monotonic plat time')
    else:
        print(monotonic_plat_time[monotonic_plat_time['is_monotonic_increasing'] == False])
        return


    
    monotonic_cum_projects = df.groupby(['user_id'])['cum_projects'].is_monotonic_increasing.reset_index(name='is_monotonic_increasing')
    
    if monotonic_cum_projects['is_monotonic_increasing'].all() == True:
        print('monotonic plat proj')
    else:
        print(monotonic_cum_projects[monotonic_cum_projects['is_monotonic_increasing'] == False])
        return

def test_is_monotonic_increasing_session(df):
    df = df.to_pandas()
    df = df.sort_values(by=['user_id', 'date_time'])
    delta_last_check = df.groupby(['user_id', 'session_30_count_raw'])['delta_last_event'].max().reset_index(name='delta_last_event_max')


    monotonic_session_event = df.groupby(['user_id', 'session_30_count_raw'])['cum_session_event_raw'].is_monotonic_increasing.reset_index(name='is_monotonic_increasing')
    monotonic_session_time = df.groupby(['user_id', 'session_30_count_raw'])['cum_session_time_raw'].is_monotonic_increasing.reset_index(name='is_monotonic_increasing')
    
    for df_grouper, msg in zip([monotonic_session_event, monotonic_session_time], ['event', 'time']):
        if df_grouper['is_monotonic_increasing'].all() == True:
            print(f'monotonic session {msg}')
        else:
            print(df_grouper[df_grouper['is_monotonic_increasing'] == False])
            return
    
    if delta_last_check['delta_last_event_max'].max() <= 30:
        print('session time diff max less than 30')
    else:
        print(delta_last_check[delta_last_check['delta_last_event_max'] > 30])
    

def test_df(df):
    print(f'Testing monotonic increasing')
    # test_is_monotonic_increasing_glob(df)
    print('Monotonic increasing global passed')
    test_is_monotonic_increasing_session(df)
    print('Monotonic increasing local passed')



In [17]:
# Run settings handed to main() in place of parsed command-line arguments.
# The class object itself is passed, so these are read as class attributes.
class Args:
    # Directory containing the input parquet file.
    # NOTE(review): the commented-out `aws s3 sync` cell earlier in the notebook
    # targets `labelled_session_count_data` (no `_2` suffix) — confirm this path.
    input_path = 'labelled_session_count_data_2'
    # Root directory under which the calculated-features parquet file is written.
    output_path = 'calculated_features'
    # Fills the `files_used_{n_files}` part of the input and output names.
    n_files = 30
    # Fills the `window_{data_window}` part of the input and output names.
    data_window = 10

In [18]:
# Run the feature pipeline with the settings class defined above.
main(Args)

2023-07-10 10:26:20,173 - __main__ - INFO - Running feature calculation with args
2023-07-10 10:26:20,174 - __main__ - INFO - mappingproxy({'__dict__': <attribute '__dict__' of 'Args' objects>,
              '__doc__': None,
              '__module__': '__main__',
              '__weakref__': <attribute '__weakref__' of 'Args' objects>,
              'data_window': 10,
              'input_path': 'labelled_session_count_data_2',
              'n_files': 30,
              'output_path': 'calculated_features'})
2023-07-10 10:26:20,175 - __main__ - INFO - Input path: labelled_session_count_data_2/files_used_30_window_10.parquet
2023-07-10 10:26:20,175 - __main__ - INFO - Output path: calculated_features/files_used_30/calculated_features_window_10.parquet
2023-07-10 10:26:20,176 - __main__ - INFO - Loading data from input_path
2023-07-10 10:26:21,452 - __main__ - INFO - User id hashed: (38500990, 7)
2023-07-10 10:26:21,616 - __main__ - INFO - Project id hashed: (38500990, 7)
2023-07-10 10:

Calculating cum session time
Cum session time calculated: applying zeroing of first event


2023-07-10 10:26:30,331 - __main__ - INFO - Intra session stats calculated: calculating running user stats
2023-07-10 10:26:30,332 - __main__ - INFO - Calculating cumulative platform time


Zero of first event applied: cum session time calculated


2023-07-10 10:26:30,558 - __main__ - INFO - Calculating cumulative platform events
2023-07-10 10:26:30,650 - __main__ - INFO - Calculating running unique projects: shifting projects to find unique
2023-07-10 10:26:30,761 - __main__ - INFO - Calculating running unique projects: calculating unique projects
2023-07-10 10:26:30,936 - __main__ - INFO - Calculated running unique projects: calculating average event time delta
2023-07-10 10:26:31,837 - __main__ - INFO - Calculated average event time delta
2023-07-10 10:26:31,879 - __main__ - INFO - Beginning rolling window 10 calculation
2023-07-10 10:26:31,880 - __main__ - INFO - Calculating expanding session time averages
2023-07-10 10:26:32,601 - __main__ - INFO - Expanding averages calculated: joining to df
2023-07-10 10:26:32,857 - __main__ - INFO - Expanding averages joined to df
2023-07-10 10:26:33,093 - __main__ - INFO - Rolling window 10 calculation complete: beginning expanding window calculation
2023-07-10 10:26:33,094 - __main__ - 

    "index"
    "date_time"
    "session_5_count"
    "session_count"
    "session_terminates"
    "country"
    "user_id"
    "project_id"
    "country_count"
    "user_count"
    "project_count"
    "date_hour"
    "date_minute"
    "date_hour_sin"
    "date_hour_cos"
    "date_minute_sin"
    "date_minute_cos"
    "cum_session_event_count"
    "delta_last_event"
    "cum_session_time"
    "cum_platform_time"
    "cum_platform_events"
    "cum_projects"
    "average_event_time"
    "expanding_click_average"
    "rolling_session_time"
    "rolling_session_events"
    "rolling_session_gap"
    "session_event_count"
    "session_time_minutes"
    "previous_session_time"
    "previous_session_events"
    "session_raw"
    "global_session_time"
    "year"
    "month"
    "day"
    "hour"
    "minute"
    "second"
    "cum_session_time_raw"
    "cum_platform_time_raw"
    "cum_session_event_raw"
    "cum_platform_event_raw"
    "session_count_raw"
    "user_count_raw"
    "project_count_ra

2023-07-10 10:27:09,144 - __main__ - INFO - Brought back to CPU for writing to disk
2023-07-10 10:27:09,145 - __main__ - INFO - Writing to: calculated_features/files_used_30/calculated_features_window_10.parquet
2023-07-10 10:27:40,676 - __main__ - INFO - Finished writing to disk
