In [1]:
!pip install --upgrade pip --quiet
!pip install --upgrade awscli python-dotenv --quiet
%load_ext dotenv
%dotenv env



[0m

In [2]:
# %load rl_constant
LABEL = [
    "continue_work_session_30_minutes"
]

METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "cum_session_event_raw",
    "global_events_user",
    "global_session_time",
    "date_time",
]

OUT_FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time",
    
    "expanding_click_average",
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]

RESCALER_COLS = [
    'session_30_count',
    'session_5_count',
    'cum_session_event_count',
    'cum_session_time',
    'cum_platform_time',
    'cum_platform_event'
]

PREDICTION_COLS = [
    'seq_40',
]


GROUPBY_COLS = ['user_id']

RL_STAT_COLS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'sim_minutes',
    'reward',
]

TORCH_LOAD_COLS = list(set(LABEL + METADATA + OUT_FEATURE_COLUMNS + RL_STAT_COLS))

In [3]:
import logging
import os
import cudf
import pandas as pd
import numpy as np
from pprint import pprint, pformat

logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


class SessionCalculate:
    logger = logging.getLogger('rl_results_eval')
    def __init__(self, df, use_gpu) -> None:
        self.df = df
        self.use_gpu = use_gpu
            
    def _shift_time_comp(self):
        self.df['n_year'] = self.df.groupby('user_id')['year'].shift(-1)
        self.df['n_month'] = self.df.groupby('user_id')['month'].shift(-1)
        self.df['n_day'] = self.df.groupby('user_id')['day'].shift(-1)
        self.df['n_hour'] = self.df.groupby('user_id')['hour'].shift(-1)
        self.df['n_minute'] = self.df.groupby('user_id')['minute'].shift(-1)
        self.df['n_second'] = self.df.groupby('user_id')['second'].shift(-1)
        
        self._update_dates()
        self._next_date_time()

    def _update_dates(self):
        self.df['n_year'] = self.df[['n_year', 'year']].apply(lambda x: x['year'] if pd.isnull(x['n_year']) else x['n_year'], axis=1)
        self.df['n_month'] = self.df[['n_month', 'month']].apply(lambda x: x['month'] if pd.isnull(x['n_month']) else x['n_month'], axis=1)
        self.df['n_day'] = self.df[['n_day', 'day']].apply(lambda x: x['day'] if pd.isnull(x['n_day']) else x['n_day'], axis=1)
        self.df['n_hour'] = self.df[['n_hour', 'hour']].apply(lambda x: x['hour'] if pd.isnull(x['n_hour']) else x['n_hour'], axis=1)
        self.df['n_minute'] = self.df[['n_minute', 'minute']].apply(lambda x: x['minute'] if pd.isnull(x['n_minute']) else x['n_minute'], axis=1)
        self.df['n_second'] = self.df[['n_second', 'second']].apply(lambda x: x['second'] if pd.isnull(x['n_second']) else x['n_second'], axis=1)        
        
    
    def _next_date_time(self, cpu=True):
        
        if cpu:
            self.df['next_date_time'] = pd.to_datetime(dict(
                year=self.df['n_year'], month=self.df['n_month'], day=self.df['n_day'], 
                hour=self.df['n_hour'], minute=self.df['n_minute'], second=self.df['n_second'])
            )
        else:
            self.df['next_date_time'] = cudf.to_datetime(dict(
                year=self.df['n_year'], month=self.df['n_month'], day=self.df['n_day'], 
                hour=self.df['n_hour'], minute=self.df['n_minute'], second=self.df['n_second'])
            )
    
    def _calculate_inflection_30(self):
        inflections_30 = self.df[self.df['session_30'] == False]
        self.logger.info(f"Users with guaranteed > 1 30 minute sessions: {inflections_30.shape[0]}")


        self.logger.info('Grouping by user_id and counting session_30')
        inflections_30['session_30_count'] = inflections_30.groupby(['user_id'])['session_30'].cumcount() + 1
        inflections_30 = inflections_30[['user_id', 'date_time', 'session_30', 'session_30_count']]

        solo_user_session = self.df[~self.df['user_id'].isin(inflections_30['user_id'])]
        solo_user_session_end_time = solo_user_session.groupby(['user_id'])['date_time'].max().reset_index().rename(columns={'date_time': 'session_end_time'})
        self.logger.info(f'Calculating solo session: {solo_user_session.shape[0]}')
 
        solo_merged = pd.merge(solo_user_session, solo_user_session_end_time, on=['user_id'])
        solo_merged['session_30_count'] = 1
 
        self.logger.info(f'Solo session calculated: merging multiple sessions')
        merged_df = pd.merge_asof(
            self.df.sort_values(by='date_time'),
            inflections_30.sort_values(by='date_time'),
            on='date_time',
            by='user_id',
            direction='nearest',
        )
    

        self.logger.info('Multiple sessions calculated: generating session_end_time')
        session_end_times = merged_df.groupby(['user_id', 'session_30_count'])['date_time'].agg({'max'}).reset_index().rename(columns={'max': 'session_end_time'})
        merged_df = pd.merge(merged_df, session_end_times, on=['user_id', 'session_30_count']).drop(columns=['session_30_x', 'session_30_y'])
        self.logger.info(f'Merged df calculated. concatting with solo sessions')
        out_df = pd.concat([merged_df, solo_merged])
        out_df['label'] = (out_df['session_end_time'] - out_df['date_time']).dt.total_seconds() / 60 < 30
        self.logger.info(f'Label calculated. Returning df')
        return out_df

    def _calculate_inflection_5(self):
        inflections_5 = self.df[self.df['session_5'] == False]
        inflections_5['session_5_count'] = inflections_5.groupby(['user_id'])['session_5'].cumcount() + 1
        inflections_5 = inflections_5[['user_id', 'date_time', 'session_5', 'session_5_count']]
        
        solo_user_session = self.df[~self.df['user_id'].isin(inflections_5['user_id'])]
        solo_user_session_end_time = solo_user_session.groupby(['user_id'])['date_time'].max().reset_index().rename(columns={'date_time': '5_min_end_time'})


        solo_merged = pd.merge(solo_user_session, solo_user_session_end_time, on=['user_id'])
        solo_merged['session_5_count'] = 1
        
        merged_df = pd.merge_asof(
            self.df.sort_values(by='date_time'),
            inflections_5.sort_values(by='date_time'),
            on='date_time',
            by='user_id',
            direction='nearest',
        )
        
        
        end_time_5 = merged_df.groupby(['user_id', 'session_5_count'])['date_time'].agg({'max'}).reset_index().rename(columns={'max': '5_min_end_time'})
        merged_df = pd.merge(merged_df, end_time_5, on=['user_id', 'session_5_count']).drop(columns=['session_5_x', 'session_5_y'])
        self.logger.info(f'multiple 5 minute: {merged_df.shape[0]}: solo 5 minute: {solo_merged.shape[0]}')
        
        unique_cols = [
            col for col in merged_df.columns if col not in [col for col in solo_merged.columns]
        ]
        
        return pd.concat([merged_df, solo_merged])
        
    def calculate_inflections(self):
       
        self.logger.info('Calculating subsequent date time')
        self.df = self.df.sort_values(by=['date_time'])
        self.df = self.df.drop(columns=['session_30_raw', 'session_30_count', 'session_5_count']).sort_values(by=['date_time'])
        self._shift_time_comp()
        
        self.df = self.df.sort_values(by=['date_time'])
        self.df = self.df.drop_duplicates(subset=['date_time', 'next_date_time']) 
        
        self.df['diff_seconds'] = (self.df['next_date_time'] - self.df['date_time']).apply(lambda x: x.total_seconds())
        self.logger.info(f'Number of rows following diff seconds calculation: {self.df.shape[0]}') 
        self.logger.info(f'Removing duplicates')
        self.df = self.df[self.df['diff_seconds'] > 0]
        
        self.logger.info(f'Diff seconds calculated and duplicates removed: {self.df.shape[0]}')
        
        self.df['diff_minutes'] = self.df['diff_seconds'] / 60
        self.df['session_5'] = self.df['diff_minutes'] < 5
        self.df['session_30'] = self.df['diff_minutes'] < 30

        
        self.logger.info(f'Labels calculated: removing with diff seconds > 0')
        
        self.df = self.df.dropna() 
        self.logger.info(f'Number of rows following drop: {self.df.shape[0]}')
        self.logger.info(f'Sorting rows by date time and applying row count')
        self.df = self.df.sort_values(['date_time']).reset_index()
        self.logger.info(f'Sorted rows and applied row count on updated index')  
        self.logger.info('Calculating inflection points')
        self.df['user_id'] = self.df['user_id'].astype('int32')
        self.logger.info(f'Calculating inflection points for 30 minutes')
        self.df = self._calculate_inflection_30()
        
        self.logger.info(f'Calculating inflection points for 5 minutes')
        self.df = self._calculate_inflection_5()
        return self.df


In [4]:
import os
import pandas as pd
import logging

from tqdm import tqdm
import numpy as np
global logger
logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('rl_results_eval')
from functools import reduce
from pprint import pformat
import cudf as gpu_pd
import cupy as gpu_np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('mode.use_inf_as_na', True)
ALL_COLS = LABEL + METADATA + OUT_FEATURE_COLUMNS  + RL_STAT_COLS
from cuml.preprocessing import MinMaxScaler

def convolve_delta_events(df, window):
    
    df = df.to_pandas()
   
    df = df.dropna() 
    df = df.replace([np.inf, -np.inf], np.nan)
    before_resample = df.shape
    logger.info(f'Convolutional shape before resample: {before_resample}')
    logger.info(f'Convolution over delta last event')
    
    df['convolved_delta_event'] = (
        df.sort_values(by=['session_30_raw', 'cum_session_event_raw']) \
            .set_index('date_time') \
            .groupby(by=['user_id', 'session_30_raw'], group_keys=False) \
            .rolling(f'{window}T', min_periods=1)['delta_last_event'] \
            .mean()
            .reset_index(name='convolved_event_delta')['convolved_event_delta'] \
    ) 

    df['delta_last_event'] = df['convolved_delta_event']
    df = df.loc[:,~df.columns.duplicated()]  
  
    # remove duplicate columns
    df['year'] = df['date_time'].dt.year
    df['month'] = df['date_time'].dt.month
    df['day'] = df['date_time'].dt.day
    df['hour'] = df['date_time'].dt.hour
    df['minute'] = df['date_time'].dt.minute
    df['second'] = df['date_time'].dt.second
   
    df['date_time'] = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'second']])

    df = df[df['minute'] % 2 == 0].reset_index(drop=True).sort_values(by=['date_time'])
    
    resampled_df = df.drop_duplicates(subset=['user_id', 'year', 'month', 'day', 'hour', 'minute'], keep='last').sort_values(by=['date_time']) 
   
    df['user_id'] = df['user_id'].astype('int32') 
    logger.info(f'Convolution complete: {before_resample} -> {resampled_df.shape}')
    logger.info(f'Recalculating inflections')
    resample_events = SessionCalculate(resampled_df, use_gpu=True)
    resampled_event_out = resample_events.calculate_inflections()
    
    logger.info(f'Events resampled')
    logger.info(sorted(pformat(resampled_event_out.columns)))
    
    return resampled_event_out
     
     

def generate_metadata_session(dataset):
    
    logger.info(f'Calculating session size and minutes')
    session_size = dataset.groupby(['user_id', 'session_30_count'])['cum_session_event'].max().reset_index(name='session_size')
    session_minutes = dataset.groupby(['user_id', 'session_30_count'])['cum_session_time'].max().reset_index(name='session_minutes')
    
    
    logger.info(f'Calculating sim size and minutes')
    sim_minutes = dataset.groupby(['user_id', 'session_30_count'])['cum_session_time'].quantile(.7, interpolation='nearest').reset_index(name='sim_minutes')
    sim_size = dataset.groupby(['user_id', 'session_30_count'])['cum_session_event'].quantile(.7, interpolation='nearest').reset_index(name='sim_size')
    
    
    sessions = [session_size, session_minutes, sim_minutes, sim_size]
    logger.info(f'Merging metadata')
    sessions = reduce(lambda left, right: pd.merge(left, right, on=['user_id', 'session_30_count']), sessions)
    
    logger.info(f'Merging metadata complete')
    dataset = pd.merge(dataset, sessions, on=['user_id', 'session_30_count'])
    logger.info(f'Calculating reward')
    dataset['reward'] = dataset['cum_session_time']
    logger.info(f'Calculating reward complete')
    return dataset



def setup_data_at_window(df, window):
    logger.info(f'Convolution over {window} minute window')
    df = df.sort_values(by='date_time')
    df['cum_session_event_raw'] = df.groupby(['user_id', 'session_30_raw'])['cum_platform_event_raw'].cumcount() + 1
    df = df.sort_values(by='date_time').reset_index(drop=True)
    df  = convolve_delta_events(df, window)
    logger.info(f'Convolving over {window} minute window complete: generating metadata')
    df = df.sort_values(by='date_time')
    logger.info(f'Generating metadata complete')
    return df


def reset_intra_session(subset_df):
    
    logger.info(f'Resetting cum_session_event_count')
    subset_df['cum_session_event'] = subset_df.groupby(['user_id', 'session_30_count'])['cum_platform_event_raw'].cumcount() + 1
    
    logger.info(f'Resetting cum_session_time')
    subset_df['diff_minutes'] = subset_df[['diff_minutes', 'date_time', 'session_end_time']].apply(lambda x: 0 if x['date_time'] == x['session_end_time'] else x['diff_minutes'], axis=1)
    subset_df['cum_session_time'] = subset_df.groupby(['user_id', 'session_30_count'])['diff_minutes'].cumsum()
    
    logger.info(f'Resetting cum_platform_time and cum_platform_events')
    subset_df['cum_platform_time'] = subset_df.groupby(['user_id'])['diff_minutes'].cumsum()
    subset_df['cum_platform_event'] = subset_df.groupby(['user_id'])['cum_session_event'].cumcount() + 1
    return subset_df

def get_dataset(read_path, conv_path, n_files, window):
    
    conv_path, read_path = (
        os.path.join(conv_path, f'files_used_{n_files}'),
        os.path.join(read_path, f'files_used_{n_files}', 'predicted_data.parquet')
    )

    if not os.path.exists(conv_path):
        logger.info(f'Creating directory {conv_path}')
        os.makedirs(conv_path)
    
    logger.info(f'Convolutional dataset not found at {conv_path}: creating')
    logger.info(f'Getting dataset from {read_path}')
    df = gpu_pd.read_parquet(read_path)
    
    df['date_time'] = gpu_pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'second']])
    logger.info(f'Non nan values: {df.count().min()}: dropping na')
    logger.info(f'NA values dropped: {df.count().min()}')
    
    df = df.sort_values(by='date_time')
    subsets = [
        { "name": "train", "start": 0, "end": int(df.shape[0] * .7) },
        { "name": "eval", "start": int(df.shape[0] * .7), "end": int(df.shape[0]) }
    ]

   
    logger.info(f'Initial shape: {df.shape}: dropping na and inf')
    df = df.dropna()
    logger.info(f'Final shape: {df.shape}: dropping na and inf')
    
    for s in subsets:
        logger.info(f'Running conv on {s["name"]} subset')
        subset_df = df[s['start']:s['end']].copy()
        subset_df = subset_df.sort_values(by='date_time')
        logger.info(f'Subset setup complete: {subset_df.shape}')
        subset_df = setup_data_at_window(subset_df, window)  
        logger.info(f'Subset convolution complete: {subset_df.shape}, resetting stats')
        subset_df = reset_intra_session(subset_df)     
        logger.info(f'Intra session reset complete: {subset_df.shape}')
        logger.info(f'Stats reset complete, resetting metadata')
        updated_conv_path = os.path.join(conv_path, f'window_{window}_{s["name"]}.parquet')
        subset_df = generate_metadata_session(subset_df)
        logger.info(f'Metadata reset complete: {subset_df.shape}')
        logger.info(f'Saving convolutional dataset to {updated_conv_path}')
        
        is_monotic_increasing = subset_df.round(3).groupby(['user_id', 'session_30_count'])['reward'].is_monotonic_increasing.reset_index(name='is_monotic_increasing')
        
        if is_monotic_increasing[is_monotic_increasing['is_monotic_increasing'] == False].shape[0] > 0:
            logger.info(f'Non monotonic increasing reward found: perc {is_monotic_increasing[is_monotic_increasing["is_monotic_increasing"] == False].shape[0] / is_monotic_increasing.shape[0]}')
            logger.info(is_monotic_increasing[is_monotic_increasing["is_monotic_increasing"] == False])
        else:
            logger.info(f'All rewards are monotonic increasing and no errors')
       
        logger.info(f'Rescaling feature cols: {RESCALER_COLS}')
        
        for col in RESCALER_COLS:
            subset_df[f'{col}_raw'] = subset_df[col]
        subset_df[RESCALER_COLS] = MinMaxScaler(feature_range=(-1, 1)).fit_transform(subset_df[RESCALER_COLS])
       
        logger.info(f'Writing to disk: {subset_df.shape}') 
        subset_df.to_parquet(updated_conv_path)


In [5]:
class Arguments:
    read_path = 'rl_ready_data'
    conv_path = 'rl_ready_data_conv'
    n_files = 2
    window = 2

In [6]:
get_dataset(Arguments.read_path, Arguments.conv_path, Arguments.n_files, Arguments.window)


2023-05-30 17:47:10,031 - rl_results_eval - INFO - Convolutional dataset not found at rl_ready_data_conv/files_used_30: creating
2023-05-30 17:47:10,032 - rl_results_eval - INFO - Getting dataset from rl_ready_data/files_used_30/predicted_data.parquet
2023-05-30 17:47:13,409 - rl_results_eval - INFO - Non nan values: 38500990: dropping na
2023-05-30 17:47:13,411 - rl_results_eval - INFO - NA values dropped: 38500990
2023-05-30 17:47:13,490 - rl_results_eval - INFO - Initial shape: (38500990, 41): dropping na and inf
2023-05-30 17:47:13,588 - rl_results_eval - INFO - Final shape: (38500990, 41): dropping na and inf
2023-05-30 17:47:13,589 - rl_results_eval - INFO - Running conv on train subset
2023-05-30 17:47:13,704 - rl_results_eval - INFO - Subset setup complete: (26950693, 41)
2023-05-30 17:47:13,704 - rl_results_eval - INFO - Convolution over 2 minute window
2023-05-30 17:47:19,454 - rl_results_eval - INFO - Convolutional shape before resample: (416593, 42)
2023-05-30 17:47:19,455 

In [7]:
df = pd.read_parquet('rl_ready_data_conv/files_used_30/window_2_eval.parquet')

In [10]:
df.round(3).head(10)

Unnamed: 0,index,continue_work_session_30_minutes,user_id,cum_platform_event_raw,cum_platform_time_raw,cum_session_time_raw,global_events_user,global_session_time,year,month,day,hour,minute,second,country_count,date_hour_sin,date_hour_cos,date_minute_sin,date_minute_cos,cum_session_event_count,delta_last_event,cum_session_time,expanding_click_average,cum_platform_time,cum_platform_events,cum_projects,average_event_time,rolling_session_time,rolling_session_events,rolling_session_gap,previous_session_time,previous_session_events,seq_1,seq_10,seq_20,seq_30,seq_30_heuristic,seq_40,date_time,cum_session_event_raw,convolved_delta_event,n_year,n_month,n_day,n_hour,n_minute,n_second,next_date_time,diff_seconds,diff_minutes,session_30_count,session_end_time,session_30,label,session_5_count,5_min_end_time,session_5,cum_session_event,cum_platform_event,session_size,session_minutes,sim_minutes,sim_size,reward,session_30_count_raw,session_5_count_raw,cum_session_event_count_raw
0,0,0.0,2262,1,1.617,1.617,2864.0,22544.0,2021,12,6,1,40,57,1.0,0.259,0.966,-0.866,-0.5,-0.429,-1.0,-0.834,-0.94,-0.834,-0.991,-0.999,-0.961,-0.969,-0.991,-0.859,-0.966,-0.989,0.197,0.212,0.156,0.181,0.186,0.176,2021-12-06 01:40:57,1,-1.0,2021.0,12.0,6.0,1.0,42.0,34.0,2021-12-06 01:42:34,97.0,1.617,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,1,-1.0,10,19.533,15.967,7,1.617,1.0,1,-0.998
1,3,0.0,2262,2,5.95,5.95,2864.0,22544.0,2021,12,6,1,42,34,1.0,0.259,0.966,-0.951,-0.309,-0.143,0.905,-0.391,-0.948,-0.391,-0.991,-0.999,-0.961,-0.969,-0.991,-0.859,-0.966,-0.989,0.178,0.205,0.144,0.198,0.191,0.186,2021-12-06 01:42:34,8,0.905,2021.0,12.0,6.0,1.0,46.0,54.0,2021-12-06 01:46:54,260.0,4.333,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,2,-0.778,10,19.533,15.967,7,5.95,1.0,1,-0.997
2,13,0.0,2262,3,7.9,7.9,2864.0,22544.0,2021,12,6,1,46,54,1.0,0.259,0.966,-0.995,0.105,-0.857,-0.813,-0.191,-0.989,-0.191,-0.989,-0.999,-0.968,-0.944,-0.987,-0.99,-0.959,-0.994,0.211,0.217,0.232,0.236,0.268,0.228,2021-12-06 01:46:54,5,-0.813,2021.0,12.0,6.0,1.0,48.0,51.0,2021-12-06 01:48:51,117.0,1.95,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,3,-0.556,10,19.533,15.967,7,7.9,1.0,1,-1.0
3,22,0.0,2262,4,9.917,9.917,2864.0,22544.0,2021,12,6,1,48,51,1.0,0.259,0.966,-0.951,0.309,0.143,-0.985,0.015,-0.938,0.015,-0.991,-0.999,-0.961,-0.969,-0.991,-0.859,-0.966,-0.989,0.184,0.216,0.161,0.192,0.186,0.195,2021-12-06 01:48:51,15,-0.985,2021.0,12.0,6.0,1.0,50.0,52.0,2021-12-06 01:50:52,121.0,2.017,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,4,-0.333,10,19.533,15.967,7,9.917,1.0,1,-0.996
4,30,0.0,2262,5,11.917,11.917,2864.0,22544.0,2021,12,6,1,50,52,1.0,0.259,0.966,-0.866,0.5,0.143,-0.995,0.22,-0.984,0.22,-0.989,-0.999,-0.968,-0.944,-0.987,-0.99,-0.959,-0.994,0.172,0.191,0.163,0.209,0.223,0.219,2021-12-06 01:50:52,25,-0.995,2021.0,12.0,6.0,1.0,52.0,52.0,2021-12-06 01:52:52,120.0,2.0,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,5,-0.111,10,19.533,15.967,7,11.917,1.0,1,-0.996
5,40,0.0,2262,6,13.8,13.8,2864.0,22544.0,2021,12,6,1,52,52,1.0,0.259,0.966,-0.743,0.669,1.0,-1.0,0.413,-0.978,0.413,-0.991,-0.999,-0.961,-0.969,-0.991,-0.859,-0.966,-0.989,0.189,0.224,0.161,0.223,0.219,0.204,2021-12-06 01:52:52,33,-1.0,2021.0,12.0,6.0,1.0,54.0,45.0,2021-12-06 01:54:45,113.0,1.883,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,6,0.111,10,19.533,15.967,7,13.8,1.0,1,-0.993
6,46,0.0,2262,7,15.967,15.967,2864.0,22544.0,2021,12,6,1,54,45,1.0,0.259,0.966,-0.588,0.809,-0.571,-0.99,0.635,-0.975,0.635,-0.989,-0.999,-0.965,-0.953,-0.992,-0.99,-0.976,-0.998,0.189,0.203,0.182,0.207,0.221,0.218,2021-12-06 01:54:45,9,-0.99,2021.0,12.0,6.0,1.0,56.0,55.0,2021-12-06 01:56:55,130.0,2.167,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,7,0.333,10,19.533,15.967,7,15.967,1.0,1,-0.999
7,53,0.0,2262,8,17.85,17.85,2864.0,22544.0,2021,12,6,1,56,55,1.0,0.259,0.966,-0.407,0.914,-0.286,-0.991,0.828,-0.968,0.828,-0.989,-0.999,-0.965,-0.953,-0.992,-0.99,-0.976,-0.998,0.184,0.188,0.203,0.226,0.209,0.227,2021-12-06 01:56:55,15,-0.991,2021.0,12.0,6.0,1.0,58.0,48.0,2021-12-06 01:58:48,113.0,1.883,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,8,0.556,10,19.533,15.967,7,17.85,1.0,1,-0.998
8,58,0.0,2262,9,19.533,19.533,2864.0,22544.0,2021,12,6,1,58,48,1.0,0.259,0.966,-0.208,0.978,0.143,-0.989,1.0,-0.983,1.0,-0.989,-0.999,-0.965,-0.953,-0.992,-0.99,-0.976,-0.998,0.227,0.201,0.215,0.234,0.246,0.227,2021-12-06 01:58:48,25,-0.989,2021.0,12.0,6.0,2.0,0.0,29.0,2021-12-06 02:00:29,101.0,1.683,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,9,0.778,10,19.533,15.967,7,19.533,1.0,1,-0.996
9,60,0.0,2262,10,19.533,19.533,2864.0,22544.0,2021,12,6,2,0,29,1.0,0.5,0.866,0.0,1.0,0.429,-0.946,1.0,-0.98,1.0,-0.989,-0.999,-0.965,-0.953,-0.992,-0.99,-0.976,-0.998,0.198,0.209,0.204,0.259,0.237,0.235,2021-12-06 02:00:29,31,-0.946,2021.0,12.0,6.0,2.0,56.0,55.0,2021-12-06 02:56:55,3386.0,0.0,-1.0,2021-12-06 02:00:29,,True,-1.0,2021-12-06 02:00:29,,10,1.0,10,19.533,15.967,7,19.533,1.0,1,-0.995


In [9]:
for col in sorted(df.columns):
    print(col)

5_min_end_time
average_event_time
continue_work_session_30_minutes
convolved_delta_event
country_count
cum_platform_event
cum_platform_event_raw
cum_platform_events
cum_platform_time
cum_platform_time_raw
cum_projects
cum_session_event
cum_session_event_count
cum_session_event_count_raw
cum_session_event_raw
cum_session_time
cum_session_time_raw
date_hour_cos
date_hour_sin
date_minute_cos
date_minute_sin
date_time
day
delta_last_event
diff_minutes
diff_seconds
expanding_click_average
global_events_user
global_session_time
hour
index
label
minute
month
n_day
n_hour
n_minute
n_month
n_second
n_year
next_date_time
previous_session_events
previous_session_time
reward
rolling_session_events
rolling_session_gap
rolling_session_time
second
seq_1
seq_10
seq_20
seq_30
seq_30_heuristic
seq_40
session_30
session_30_count
session_30_count_raw
session_5
session_5_count
session_5_count_raw
session_end_time
session_minutes
session_size
sim_minutes
sim_size
user_id
year
