In [40]:
!apt-get update && apt-get install awscli dotenv zip -q

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu focal InRelease                         
Hit:3 http://archive.ubuntu.com/ubuntu focal-updates InRelease                 
Get:4 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:5 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Fetched 222 kB in 1s (417 kB/s)    
Reading package lists... Done
Reading package lists...
Building dependency tree...
Reading state information...
E: Unable to locate package dotenv


In [43]:
!pip install python-dotenv -q

[0m

In [44]:
%load_ext dotenv

In [45]:
%dotenv env

In [38]:
import argparse
import glob

import logging
import os
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
import pprint
import numpy as np
import cudf as pd
import pandas as cpu_pd

import dask_cudf as dd
import pprint
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_path', type=str, default='datasets/encoded_time_data')
    parser.add_argument('--write_path', type=str, default='datasets/frequency_encoded_2')
    parser.add_argument('--n_files', type=int, default=2)
    parser.add_argument('--use_gpu', type=bool, default=False)
    parser.add_argument('--test_env', type=bool, default=True)
    args = parser.parse_args()
    return args

ALL_COLUMNS = [
    "project_id",
    "workflow_id",
    "user_id",
    "country",
    "year",
    "month",
    "day",
    "hour",
    "minute",
    "second",
    "day_of_week",
    "date_time",
    "previous_date_time",
    "time_diff_seconds",
    "5_minute_session_count",
    "30_minute_session_count",
    "max_session_time",
    "time_until_end_of_session",
    "label",
    "task_within_session_count",
    "project_workflow_count",
    "user_count",
    "country_count"
]

LOAD_COLUMNS = [
    "project_id",
    "workflow_id",
    "user_id",
    "country",  
    "date_time",
]

In [13]:
TEST_U_ID = [2373355, 10, 4301]
SAMPLE_COLS = [
    'user_id',
    'date_time',
    'row_count',
    'label_30',
    'session_30',
]
def outer_apply_inflection_points(inflections):
    def inner_find_session(row):
        inflection_for_user = inflections[row['user_id']]
        index_for_row = row['row_count']
        return 1 + np.searchsorted(inflection_for_user, index_for_row, side='left')

    return inner_find_session

def get_inflection_points_5(subset):
    return subset[subset['label_5'] == False].index.values

def get_inflection_points_30(subset):
    return subset[subset['label_30'] == False].index.values

class SessionCalculate:
    logger = logging.getLogger(__name__)
    def __init__(self, df, write_path, use_gpu, test_env) -> None:
        self.df = df
        self.write_path = write_path
        self.use_gpu = use_gpu
        self.test_env = test_env
    
    def calculate_inflections(self):
       
        self.logger.info('Calculating subsequent date time')
        self.df['next_date_time'] = self.df.groupby('user_id')['date_time'].shift(-1)
        self.df = self.df.drop_duplicates(subset=['user_id', 'date_time'], keep='last').reset_index()
        if self.use_gpu:
            self.logger.info('Bringing to CPU for second calculation')
            self.df = self.df.to_pandas()
           
            
        self.df['diff_seconds'] = (self.df['next_date_time'] - self.df['date_time']).apply(lambda x: x.total_seconds())
        self.logger.info('Diff seconds calculated')

        self.df['diff_minutes'] = (self.df['diff_seconds'] / 60)
        self.df['label_5'] = (self.df['diff_minutes'] < 5)
        self.df['label_30'] = self.df['diff_minutes'] < 30

        
        self.logger.info(f'Labels calculated: removing rows with diff seconds > 0')
    
 
        self.df = self.df.drop(columns=['next_date_time', 'diff_seconds'])
        self.logger.info(f'Number of rows following drop: {self.df.shape[0]}')
        self.logger.info(f'Sorting rows by date time and applying row count')
        self.df = self.df.sort_values(['date_time']).reset_index()
        self.df['user_id'] = self.df['user_id'].astype('int32')
        self.df['row_count'] = self.df.index.values
        self.logger.info(f'Sorted rows and applied row count on updated index')  
        self.logger.info('Calculating inflection points')
        
        inflections_5_merge = self.df[self.df['label_5'] == False]
        inflections_30_merge = self.df[self.df['label_30'] == False]
        
        inflections_5_merge['session_5'] = inflections_5_merge.groupby('user_id').cumcount() + 1
        inflections_30_merge['session_30'] = inflections_30_merge.groupby('user_id').cumcount() + 1
        
        inflections_5_merge = inflections_5_merge[['user_id', 'row_count', 'session_5']].sort_values(['row_count'])
        inflections_30_merge = inflections_30_merge[['user_id', 'row_count', 'session_30']].sort_values(['row_count'])
        
   
        self.logger.info('Inflection points calcularting for 5') 
        self.df = cpu_pd.merge_asof(self.df, inflections_5_merge, on='row_count', by='user_id', direction='forward')
        self.logger.info('Inflection points calcularting for 30')
        self.df = cpu_pd.merge_asof(self.df, inflections_30_merge, on='row_count', by='user_id', direction='forward')
        self.logger.info('Inflection points calculated')

        if self.use_gpu:
            self.logger.info('Bringing back to GPU for final calculations')
            self.df = pd.from_pandas(self.df)
              
        self.logger.info('Inflections calculated')
   
    
    def write_inflections_parquet(self):
    
        self.df = self.df.drop(columns=['index', 'level_0'])
       
        if not self.test_env:
            self.df = self.df.drop(columns=['diff_minutes', 'row_count'])
        
        if self.use_gpu:
            import dask_cudf as ddf
            self.logger.info('Bringing back to dask GPU for final calculations')
            self.df = ddf.from_cudf(self.df, npartitions=30)
        else:
            import dask.dataframe as ddf
            self.logger.info('Bringing back to dask CPU for final calculations')
            self.df = ddf.from_pandas(self.df, npartitions=30)
        
        self.logger.info(f'Writing inflections to {self.write_path}')    
        # write_path = self.write_path + '.parquet.gzip'
        self.df.to_parquet(self.write_path)

In [14]:
def get_logger():
    logger = logging.getLogger(__name__)
    return logger

def main(args):
    cpu_pd.set_option('display.max_columns', None)
    cpu_pd.set_option('display.max_rows', None)
    cpu_pd.options.mode.chained_assignment = None  # default='warn'
    cpu_pd.set_option('display.width', 500)

    logger = get_logger()
    read_path, write_path, n_files, use_gpu, test_env = args.read_path, args.write_path, args.n_files, args.use_gpu, args.test_env
    logger.info(f'Starting Session Assignment')
    logger.info(f'Read: {read_path}, Write: {write_path}, N Files: {n_files}, GPU: {use_gpu}, Test Env: {test_env}')
    files_to_read = sorted(list(glob.iglob(f'{read_path}/*.csv')))
    logger.info(f'Found {len(files_to_read)} files to read')
    df = dd.read_csv(files_to_read[:n_files], usecols=LOAD_COLUMNS).compute()
    df['date_time'] = pd.to_datetime(df['date_time'])
    df = df.sort_values(by=['date_time'])
    session_calculator = SessionCalculate(df, args.write_path, args.use_gpu, args.test_env)
    session_calculator.calculate_inflections()
    
    session_calculator.write_inflections_parquet()


    


In [15]:
class Arguments:
    read_path = 'frequency_encoded_data'
    write_path = 'labelled_session_count_data'
    n_files = 61
    use_gpu = True

    test_env = True

In [16]:
main(Arguments)

2023-03-19 15:08:45,433 - __main__ - INFO - Starting Session Assignment
2023-03-19 15:08:45,434 - __main__ - INFO - Read: frequency_encoded_data, Write: frequency_encoded_data_2, N Files: 61, GPU: True, Test Env: True
2023-03-19 15:08:45,435 - __main__ - INFO - Found 60 files to read
2023-03-19 15:08:57,012 - __main__ - INFO - Calculating subsequent date time
2023-03-19 15:08:57,170 - __main__ - INFO - Bringing to CPU for second calculation
2023-03-19 15:10:07,440 - __main__ - INFO - Diff seconds calculated
2023-03-19 15:10:07,765 - __main__ - INFO - Labels calculated: removing rows with diff seconds > 0
2023-03-19 15:10:09,033 - __main__ - INFO - Number of rows following drop: 38500990
2023-03-19 15:10:09,034 - __main__ - INFO - Sorting rows by date time and applying row count
2023-03-19 15:10:18,104 - __main__ - INFO - Sorted rows and applied row count on updated index
2023-03-19 15:10:18,105 - __main__ - INFO - Calculating inflection points
2023-03-19 15:10:19,549 - __main__ - INFO 

In [17]:
def test_session_assignment(df):

    df['date_time'] = cpu_pd.to_datetime(df['date_time'])
    df = df.sort_values(['date_time'])

    less_5 = df[df['label_5'] == True]
    less_30 = df[df['label_30'] == True]
    
    assert less_5.shape[0] == df[df['diff_minutes'] < 5].shape[0]
    assert less_30.shape[0] == df[df['diff_minutes'] < 30].shape[0]
        
def test_session_boundary(calculated_df):
    calculated_df['date_time'] = cpu_pd.to_datetime(calculated_df['date_time'])
    calculated_df = calculated_df.sort_values(['date_time'])
    
    for user_stats, df_subset in calculated_df.groupby(['user_id', 'session_30']):
        df_subset_in_range = df_subset.iloc[:df_subset.shape[0] - 2]
        inflection = df_subset.iloc[-1]
        if df_subset_in_range.shape[0] > 2:
            df_subset_in_range = df_subset_in_range.iloc[0: df_subset_in_range.shape[0] - 2]
            assert df_subset_in_range[df_subset_in_range['label_30'] == False].shape[0] == 0
            assert inflection['label_30'] == False
        
        if inflection.session_30 > 1:
            user_id, session_30 = user_stats
            previous_inflections = calculated_df[
                (calculated_df['user_id'] == user_id) & (calculated_df['session_30'] == session_30 - 1)
            ]
            
            max_row = previous_inflections.iloc[-1]
            if max_row.shape[0] > 0:
                assert max_row['label_30'] == False
        



In [18]:
df = dd.read_parquet('frequency_encoded_data_2').compute().to_pandas()

In [19]:
sample_users = df[['user_id']].drop_duplicates().reset_index(drop=True).sample(1000)
df = df[df['user_id'].isin(sample_users['user_id'])]


In [20]:
test_session_assignment(df)

In [21]:
test_session_boundary(df)

In [None]:
!zip -r labelled_session_count_data.zip labelled_session_count_data -q

In [46]:
!aws s3 cp labelled_session_count_data.zip s3://dissertation-data-dmiller/labelled_session_count_data.zip

upload: ./frequency_encoded_data.zip to s3://dissertation-data-dmiller/labelled_session_count_data.zip
