In [2]:
# %load session_calculate.py
import pandas as pd
import numpy as np
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
import cudf
import dask_cudf as dd

import os
import glob

# User ids kept around for spot checks; not referenced by the code visible in this notebook.
TEST_U_ID = [2373355, 10, 4301]

# Columns (and their order) selected by SessionCalculate.write_inflections_parquet
# before the result is written out.
OUT_COLUMNS = [
    'user_id',
    'project_id',
    'date_time',
    'session_5_count',
    'session_30_count',
    'session_terminates_30_minutes',
    'country',
]

from pprint import pformat
# Global pandas display settings: show every column/row and widen the console output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Disable pandas' chained-assignment warning for the whole process.
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.width', 500)

class SessionCalculate:
    """Label every event row with per-user session counters.

    A row is an *inflection* (the last event of a session) when the gap to the
    same user's next event is not below the session threshold, or when the user
    has no later event.  Two thresholds are used: 5 minutes and 30 minutes.

    Attributes:
        df: Event frame with at least ``user_id`` and ``date_time`` columns
            (``project_id`` and ``country`` are additionally needed for
            writing).  A cudf DataFrame when ``use_gpu`` is true, otherwise a
            pandas DataFrame.  Rows are expected to arrive ordered by
            ``date_time`` (``main`` sorts before constructing this class).
        write_path: Destination handed to ``DataFrame.to_parquet``.
        use_gpu: Whether ``df`` lives on the GPU (cudf) and must be moved to
            pandas for the steps that are done with pandas here.
        test_env: Stored for callers; not read by any method of this class.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, df, write_path, use_gpu, test_env) -> None:
        self.df = df
        self.write_path = write_path
        self.use_gpu = use_gpu
        self.test_env = test_env

    def _number_inflections(self, flag_column, count_column):
        """Return the inflection rows for one threshold, numbered per user.

        Rows where ``flag_column`` is False are kept and numbered 1..k per
        ``user_id`` in ``row_count`` (i.e. chronological) order; the number is
        stored in ``count_column``.  The result is ordered by ``row_count`` as
        ``pd.merge_asof`` requires.
        """
        inflections = self.df[~self.df[flag_column]].sort_values(by=['row_count'])
        inflections[count_column] = inflections.groupby('user_id').cumcount() + 1
        return inflections[['user_id', 'date_time', 'row_count', count_column]]

    def calculate_inflections(self):
        """Compute session labels in place on ``self.df``.

        Adds, among intermediate columns, ``session_5_count`` and
        ``session_30_count`` (ordinal of the 5-/30-minute session the row
        belongs to, per user), ``session_end_time`` (timestamp of the row that
        closes the 30-minute session) and ``session_terminates_30_minutes``
        (whether that closing row is less than 30 minutes after this row).
        """
        self.logger.info('Calculating subsequent date time')
        # The shift runs before de-duplication on purpose: with rows ordered by
        # date_time, the last row of a duplicated (user_id, date_time) pair is
        # the one whose shifted value is the next distinct timestamp.
        self.df['next_date_time'] = self.df.groupby('user_id')['date_time'].shift(-1)
        # drop=True: the old index carries no information and would otherwise
        # be dragged along as an extra column.
        self.df = self.df.drop_duplicates(subset=['user_id', 'date_time'], keep='last').reset_index(drop=True)
        if self.use_gpu:
            self.logger.info('Bringing to CPU for second calculation')
            self.df = self.df.to_pandas()

        # Vectorised conversion; a missing next event (NaT) becomes NaN.
        self.df['diff_seconds'] = (self.df['next_date_time'] - self.df['date_time']).dt.total_seconds()

        self.logger.info('Diff seconds calculated')
        if self.use_gpu:
            self.logger.info('Bringing back to GPU for final calculations')
            self.df = cudf.from_pandas(self.df)

        self.df['diff_minutes'] = self.df['diff_seconds'] / 60
        # A missing gap (user's last event) must count as "session ends here".
        self.df['session_5'] = (self.df['diff_minutes'] < 5).fillna(False)
        self.df['session_30'] = (self.df['diff_minutes'] < 30).fillna(False)
        # NOTE: no rows are removed at this point; the only drop is the
        # drop_duplicates call above.
        self.logger.info(f'Labels calculated: removing rows with diff seconds > 0')

        self.logger.info(f'Number of rows following drop: {self.df.shape[0]}')
        self.logger.info(f'Sorting rows by date time and applying row count')
        self.df = self.df.sort_values(['date_time']).reset_index(drop=True)
        # row_count is the global chronological position, used as merge key.
        self.df['row_count'] = self.df.index.values
        self.logger.info(f'Sorted rows and applied row count on updated index')
        self.logger.info('Calculating inflection points')
        self.df['user_id'] = self.df['user_id'].astype('int32')

        self.logger.info('Calculating session 5 inflections')
        inflections_5_merge = self._number_inflections('session_5', 'session_5_count')
        # Only the 30-minute inflection keeps its timestamp (as session end).
        inflections_5_merge = inflections_5_merge.drop(columns=['date_time'])

        self.logger.info('Calculating session 30 inflections')
        inflections_30_merge = self._number_inflections('session_30', 'session_30_count')
        inflections_30_merge = inflections_30_merge.rename(columns={'date_time': 'session_end_time'})

        if self.use_gpu:
            # merge_asof is done with pandas, so everything moves to the CPU.
            self.logger.info('Bringing to CPU for labelling')
            labelled = self.df.to_pandas()
            inflections_5_merge = inflections_5_merge.to_pandas()
            inflections_30_merge = inflections_30_merge.to_pandas()
        else:
            self.logger.info('Labelling on CPU')
            labelled = self.df

        # Each row takes the counter of the next inflection (itself included)
        # of the same user.  Each inflection frame is merged exactly once so
        # the counter columns keep their un-suffixed names.
        labelled = labelled.sort_values(by=['row_count', 'user_id'])
        labelled = pd.merge_asof(labelled, inflections_5_merge, on='row_count', by='user_id', direction='forward')
        labelled = pd.merge_asof(labelled, inflections_30_merge, on='row_count', by='user_id', direction='forward')
        minutes_to_session_end = (labelled['session_end_time'] - labelled['date_time']).dt.total_seconds() / 60
        labelled['session_terminates_30_minutes'] = minutes_to_session_end < 30
        self.df = cudf.from_pandas(labelled) if self.use_gpu else labelled

        self.logger.info('Inflections calculated')

        # Share of rows whose 30-minute session ends within 30 minutes (a
        # fraction in [0, 1]); guarded against an empty frame.
        total_rows = self.df.shape[0]
        session_end_30_minutes = int(self.df['session_terminates_30_minutes'].sum())
        share = session_end_30_minutes / total_rows if total_rows else float('nan')
        self.logger.info(f'Percent sessions end in 30 minutes: {share}')
        self.logger.info(f'Columns for df')
        self.logger.info(pformat(self.df.columns))

    def write_inflections_parquet(self):
        """Write the ``OUT_COLUMNS`` of ``self.df`` to ``self.write_path``.

        Rows are ordered by ``date_time`` then ``user_id``.  ``self.df`` is
        left untouched so it can still be inspected after writing.
        """
        self.logger.info(f'Writing inflections to {self.write_path}')
        self.df[OUT_COLUMNS].sort_values(by=['date_time', 'user_id']).to_parquet(self.write_path)

In [3]:
# Columns requested from the input parquet files by main().
LOAD_COLUMNS = [
    "project_id",
    "user_id",
    "country",  
    "date_time",
]

def get_logger():
    """Return the logger registered under this module's ``__name__``."""
    return logging.getLogger(__name__)


def main(args):
    """Load event data, compute session labels and write them to parquet.

    Args:
        args: Object exposing ``read_path`` (directory of ``*.parquet`` input
            files), ``write_path`` (output directory), ``n_files`` (number of
            input files to use; values below 30 load only the first
            ``n_files`` files in sorted order, otherwise the whole directory is
            read), ``use_gpu`` (load and process with cudf instead of pandas)
            and ``test_env`` (forwarded to ``SessionCalculate``).

    Raises:
        ValueError: If a partial load was requested but no input file was
            selected.
    """
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.options.mode.chained_assignment = None  # default='warn'
    pd.set_option('display.width', 500)

    logger = get_logger()
    read_path, write_path, n_files = args.read_path, args.write_path, args.n_files
    logger.info(f'Read: {read_path}, Write: {write_path}, N Files: {n_files}')
    files_to_read = sorted(glob.iglob(f'{read_path}/*.parquet'))
    logger.info(f'Found {len(files_to_read)} files to read')

    # The frame library must match what SessionCalculate expects for the
    # chosen mode: cudf on the GPU path, pandas on the CPU path.
    frames = cudf if args.use_gpu else pd

    if n_files < 30:
        logger.info(f'Loading partial data for testing: {n_files}')
        selected_files = files_to_read[:n_files]
        if not selected_files:
            raise ValueError(f'No parquet files selected from {read_path} (n_files={n_files})')
        # `columns` is the read_parquet keyword that restricts the loaded columns.
        df = frames.concat([frames.read_parquet(path, columns=LOAD_COLUMNS) for path in selected_files])
    else:
        df = frames.read_parquet(read_path, columns=LOAD_COLUMNS)

    df['date_time'] = frames.to_datetime(df['date_time'])
    # SessionCalculate relies on chronological row order.
    df = df.sort_values(by=['date_time'])

    # Make sure the output directory exists before a file is written into it.
    if write_path:
        os.makedirs(write_path, exist_ok=True)
    write_path = os.path.join(write_path, f'files_used_{n_files}')
    session_calculator = SessionCalculate(df, write_path, args.use_gpu, args.test_env)
    session_calculator.calculate_inflections()

    session_calculator.write_inflections_parquet()




In [4]:
class Arguments:
    """Static settings object handed to ``main`` in place of parsed CLI arguments."""

    # Input directory and output directory.
    read_path = 'labelled_session_count_data'
    write_path = 'labelled_session_count_data_2'

    # Number of input files, GPU switch and test-environment flag.
    n_files, use_gpu, test_env = 30, True, True

In [5]:
# Run the pipeline; the Arguments class itself (not an instance) supplies the settings.
main(Arguments)

2023-03-28 14:00:04,208 - __main__ - INFO - Read: labelled_session_count_data, Write: labelled_session_count_data_2, N Files: 30
2023-03-28 14:00:04,210 - __main__ - INFO - Found 30 files to read
2023-03-28 14:00:07,323 - __main__ - INFO - Calculating subsequent date time
2023-03-28 14:00:07,484 - __main__ - INFO - Bringing to CPU for second calculation
2023-03-28 14:00:07,486 - numba.cuda.cudadrv.driver - INFO - init
2023-03-28 14:01:17,079 - __main__ - INFO - Diff seconds calculated
2023-03-28 14:01:17,079 - __main__ - INFO - Bringing back to GPU for final calculations
2023-03-28 14:01:19,336 - __main__ - INFO - Labels calculated: removing rows with diff seconds > 0
2023-03-28 14:01:19,337 - __main__ - INFO - Number of rows following drop: 38500990
2023-03-28 14:01:19,337 - __main__ - INFO - Sorting rows by date time and applying row count
2023-03-28 14:01:19,397 - __main__ - INFO - Sorted rows and applied row count on updated index
2023-03-28 14:01:19,397 - __main__ - INFO - Calcula

In [10]:
# Reload the written result for inspection; the path is write_path joined with
# 'files_used_<n_files>' as built in main().
df = cudf.read_parquet('labelled_session_count_data_2/files_used_30')

In [11]:
# Fraction of rows with a non-null session counter (count() skips nulls);
# 1.0 means every row received a label.
display(df.session_5_count.count() / df.shape[0])
display(df.session_30_count.count() / df.shape[0])

1.0

1.0

In [12]:
# Share of rows per value of session_terminates_30_minutes.
df.session_terminates_30_minutes.value_counts() / df.shape[0]

True     0.596643
False    0.403357
Name: session_terminates_30_minutes, dtype: float64

In [13]:
# (rows, columns) of the reloaded output.
df.shape

(38500990, 7)