In [1]:
# %load session_calculate.py
import pandas as pd
import numpy as np
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
import cudf
import dask_cudf as dd

import os
import glob

# Notebook display settings: never truncate columns or rows, and widen the
# text representation to 500 characters.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 500
# Turn off the chained-assignment check (pandas' default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [65]:
# %load session_calculate.py
import pandas as pd
import numpy as np
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
import cudf
import dask_cudf as dd

import os
import glob


# Columns, in output order, that SessionCalculate.write_inflections_parquet
# selects from the labelled frame before writing it to parquet.
OUT_COLUMNS = [
    'user_id', 'project_id', 'date_time',
    'session_5_count', 'session_count', 'session_terminates',
    'country',
]

from pprint import pformat
# Notebook display settings: never truncate columns or rows, and widen the
# text representation to 500 characters.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 500
# Turn off the chained-assignment check (pandas' default is 'warn').
pd.set_option('mode.chained_assignment', None)

class SessionCalculate:
    """Label user events with session counters and write them to parquet.

    A "session inflection" is an event whose gap to the same user's next
    event is not below a threshold (or that has no next event). Two
    thresholds are used: a fixed 5 minutes and ``data_window`` minutes.

    Attributes:
        df: Event frame with at least ``user_id`` and ``date_time`` columns.
            A cudf frame is expected when ``use_gpu`` is true, a pandas frame
            otherwise. ``calculate_inflections`` shifts ``date_time`` within
            each ``user_id`` group in the frame's current row order, so the
            frame is expected to arrive sorted by ``date_time``.
        write_path: Path prefix for the output parquet file.
        use_gpu: Whether ``df`` is a cudf frame that must be moved to pandas
            for the CPU-only steps and back afterwards.
        test_env: Stored as given; not read by the methods of this class.
        data_window: Session gap threshold in minutes.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, df, write_path, use_gpu, test_env, data_window) -> None:
        self.df = df
        self.write_path = write_path
        self.use_gpu = use_gpu
        self.test_env = test_env
        self.data_window = data_window

    def calculate_inflections(self):
        """Add session labels to ``self.df`` in place.

        Besides intermediate columns, the frame gains:

        * ``session_5_count`` / ``session_count``: 1-based per-user index of
          the next inflection at or after each row (5 minute and
          ``data_window`` minute thresholds respectively).
        * ``session_end_time``: ``date_time`` of that ``data_window``
          inflection.
        * ``session_terminates``: whether ``session_end_time`` is less than
          ``data_window`` minutes after the row's own ``date_time``.
        """
        self.logger.info('Calculating subsequent date time')
        self.df['next_date_time'] = self.df.groupby('user_id')['date_time'].shift(-1)
        # keep='last' retains, for duplicated (user, timestamp) pairs, the row
        # whose next_date_time points past the duplicates. drop=True avoids
        # leaking the old index into the frame as an 'index' column.
        self.df = self.df.drop_duplicates(
            subset=['user_id', 'date_time'], keep='last'
        ).reset_index(drop=True)

        if self.use_gpu:
            self.logger.info('Bringing to CPU for second calculation')
            self.df = self.df.to_pandas()

        # Vectorised equivalent of applying Timedelta.total_seconds row by
        # row; missing next events (NaT) become NaN in both forms.
        self.df['diff_seconds'] = (
            self.df['next_date_time'] - self.df['date_time']
        ).dt.total_seconds()
        self.logger.info('Diff seconds calculated')

        if self.use_gpu:
            self.logger.info('Bringing back to GPU for final calculations')
            self.df = cudf.from_pandas(self.df)

        self.df['diff_minutes'] = self.df['diff_seconds'] / 60
        self.df['session_5'] = self.df['diff_minutes'] < 5
        self.df['session'] = self.df['diff_minutes'] < self.data_window
        # A missing gap (last event of a user) counts as "session not continued".
        self.df['session'] = self.df['session'].fillna(False)
        self.df['session_5'] = self.df['session_5'].fillna(False)

        # NOTE(review): the next two messages mention a row removal, but no
        # rows are dropped at this point. Message text is kept unchanged.
        self.logger.info('Labels calculated: removing rows with diff seconds > 0')
        self.logger.info(f'Number of rows following drop: {self.df.shape[0]}')

        self.logger.info('Sorting rows by date time and applying row count')
        self.df = self.df.sort_values(['date_time']).reset_index(drop=True)
        # row_count is the global chronological position; it is the ordered
        # key used by merge_asof below.
        self.df['row_count'] = self.df.index.values
        self.logger.info('Sorted rows and applied row count on updated index')

        self.logger.info('Calculating inflection points')
        self.df['user_id'] = self.df['user_id'].astype('int32')

        inflections_5_merge = self.df[self.df['session_5'] == False].sort_values(by=['date_time'])
        inflections_window_merge = self.df[self.df['session'] == False].sort_values(by=['date_time'])

        self.logger.info('Calculating session 5 inflections')
        inflections_5_merge['session_5_count'] = inflections_5_merge.groupby('user_id').cumcount() + 1

        self.logger.info('Calculating session inflections')
        inflections_window_merge['session_count'] = inflections_window_merge.groupby('user_id').cumcount() + 1

        merge_order = ['row_count', 'user_id']
        inflections_5_merge = inflections_5_merge[
            ['user_id', 'row_count', 'session_5_count']
        ].sort_values(by=merge_order)
        inflections_window_merge = inflections_window_merge[
            ['user_id', 'date_time', 'row_count', 'session_count']
        ].sort_values(by=merge_order).rename(columns={'date_time': 'session_end_time'})

        # pd.merge_asof is a pandas function, so GPU frames are moved to the
        # CPU for labelling and moved back afterwards.
        if self.use_gpu:
            # NOTE(review): the message says "back to GPU" although the data
            # is moved to the CPU here. Message text is kept unchanged.
            self.logger.info('Bringing back to GPU for labelling')
            self.df = self.df.to_pandas()
            inflections_5_merge = inflections_5_merge.to_pandas()
            inflections_window_merge = inflections_window_merge.to_pandas()
        else:
            self.logger.info('Labelling on CPU')

        self._label_rows(inflections_5_merge, inflections_window_merge, merge_order)

        if self.use_gpu:
            self.df = cudf.from_pandas(self.df)

        self.logger.info('Inflections calculated')

        total_rows = self.df.shape[0]
        # Count the rows whose session does end inside the window, matching
        # what the log line below reports.
        session_end_window = self.df[self.df['session_terminates'] == True].shape[0]
        # Guard against an empty frame so the summary never divides by zero.
        share = session_end_window / total_rows if total_rows else 0.0
        self.logger.info(f'Percent sessions end in {self.data_window} minutes: {share}')
        self.logger.info('Columns for df')
        self.logger.info(pformat(self.df.columns))

    def _label_rows(self, inflections_5_merge, inflections_window_merge, merge_order):
        """Attach the next inflection of each kind to every row (pandas only)."""
        self.df = self.df.sort_values(by=merge_order)
        # Each inflection table is merged exactly once: merging the same table
        # twice makes pandas suffix the clashing column (_x/_y), which removes
        # the plain 'session_5_count' column.
        for inflections in (inflections_5_merge, inflections_window_merge):
            self.df = pd.merge_asof(
                self.df, inflections, on='row_count', by='user_id', direction='forward'
            )
        minutes_to_session_end = (
            self.df['session_end_time'] - self.df['date_time']
        ).dt.total_seconds() / 60
        self.df['session_terminates'] = minutes_to_session_end < self.data_window

    def write_inflections_parquet(self):
        """Write the ``OUT_COLUMNS`` of ``self.df`` to a parquet file.

        The target is ``write_path`` suffixed with ``_window_<data_window>.parquet``
        and rows are ordered by ``date_time`` then ``user_id``. ``self.df`` is
        left untouched.
        """
        write_path = self.write_path + f'_window_{self.data_window}.parquet'

        self.logger.info(f'Writing inflections to {write_path}')

        # The result of to_parquet is deliberately not assigned: it returns
        # None when given a path, which would wipe self.df.
        self.df[OUT_COLUMNS].sort_values(by=['date_time', 'user_id']).to_parquet(write_path)

In [66]:
# Subset of parquet columns that main() asks read_parquet to load.
LOAD_COLUMNS = ["project_id", "user_id", "country", "date_time"]

def get_logger():
    """Return the logger registered under this module's ``__name__``."""
    return logging.getLogger(__name__)


def main(args):
    """Load the event data, compute session labels and write them to parquet.

    Args:
        args: Object exposing ``read_path``, ``write_path``, ``n_files``,
            ``data_window``, ``use_gpu`` and ``test_env`` attributes. Input is
            read from ``<read_path>/files_used_<n_files>`` and the output
            prefix is ``<write_path>/files_used_<n_files>``.
    """
    logger = get_logger()
    read_path, write_path, n_files, data_window = args.read_path, args.write_path, args.n_files, args.data_window
    read_path = os.path.join(read_path, f'files_used_{n_files}')
    logger.info(f'Read: {read_path}, Write: {write_path}, N Files: {n_files}')

    # Load with the library that matches the requested backend: with
    # use_gpu=False, SessionCalculate runs pandas-only code (pd.merge_asof,
    # the .dt accessor) directly on the frame, so it must be a pandas frame.
    frame_lib = cudf if args.use_gpu else pd

    df = frame_lib.read_parquet(read_path, columns=LOAD_COLUMNS)
    logger.info(f'Loaded {df.shape[0]} rows')
    logger.info(
        f'Read path; {read_path}, Write path: {write_path}, N Files: {n_files}, Data Window: {data_window}'
    )

    df['date_time'] = frame_lib.to_datetime(df['date_time'])
    # SessionCalculate shifts date_time within each user in row order, so the
    # frame is handed over in chronological order.
    df = df.sort_values(by=['date_time'])

    write_path = os.path.join(write_path, f'files_used_{n_files}')
    session_calculator = SessionCalculate(df, write_path, args.use_gpu, args.test_env, data_window)
    session_calculator.calculate_inflections()
    session_calculator.write_inflections_parquet()




In [69]:
class Arguments:
    """Notebook stand-in for the argument object that ``main`` reads."""

    # Directories; main() appends 'files_used_<n_files>' to each of them.
    read_path = 'labelled_session_count_data'
    write_path = 'labelled_session_count_data_2'
    n_files = 30

    # Session gap threshold, compared against gaps expressed in minutes.
    data_window = 10

    # Backend / environment switches handed to SessionCalculate.
    use_gpu = True
    test_env = True

In [70]:
# Run the pipeline with the settings held on the Arguments class (the class
# itself is passed; main() only reads its attributes).
main(Arguments)

2023-07-10 10:16:35,697 - __main__ - INFO - Read: labelled_session_count_data/files_used_30, Write: labelled_session_count_data_2, N Files: 30
2023-07-10 10:16:36,185 - __main__ - INFO - Loaded 38500990 rows
2023-07-10 10:16:36,186 - __main__ - INFO - Read path; labelled_session_count_data/files_used_30, Write path: labelled_session_count_data_2, N Files: 30, Data Window: 10
2023-07-10 10:16:36,289 - __main__ - INFO - Calculating subsequent date time
2023-07-10 10:16:36,538 - __main__ - INFO - Bringing to CPU for second calculation
2023-07-10 10:17:39,244 - __main__ - INFO - Diff seconds calculated
2023-07-10 10:17:39,245 - __main__ - INFO - Bringing back to GPU for final calculations
2023-07-10 10:17:41,045 - __main__ - INFO - Labels calculated: removing rows with diff seconds > 0
2023-07-10 10:17:41,046 - __main__ - INFO - Number of rows following drop: 38500990
2023-07-10 10:17:41,046 - __main__ - INFO - Sorting rows by date time and applying row count
2023-07-10 10:17:41,180 - __ma