In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
import os
# import xgb


In [2]:
!pip install python-dotenv awscli --quiet

[0m

In [3]:
# Enable the IPython extension named `dotenv` (python-dotenv was installed in the
# previous cell) and invoke its %dotenv magic with the argument `env`.
# NOTE(review): presumably `env` is the path of a dotenv file in the working
# directory whose variables are exported into os.environ — confirm.
%load_ext dotenv
%dotenv env

In [4]:
from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.preprocessing import MinMaxScaler as cuMinMaxScaler
import cudf as gpu_pd
import cupy as gpu_np


In [5]:
# %load rl_constant.py
# Model input columns. Names containing "sin"/"cos" are treated separately from
# the others when features are min-max scaled in generate_baseline_model.
FEATURE_COLUMNS = [
    
    "user_count",
    "project_count",
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_count",
    "session_5_count",
    "cum_session_event_count",
    "cum_session_time",
    "expanding_click_average",
   
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    "delta_last_event",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]


# Non-feature columns that are loaded alongside the features; `date_time` is
# used to order rows before the train/eval/test split.
METADATA = [
    "date_time"
]

# NOTE(review): not referenced anywhere in the visible notebook — presumably
# consumed by the RL code this constants file was loaded from.
RL_STAT_COLS = [
    'session_size',
    'session_minutes',
    'size_cutoff',
    'time_cutoff',
    'reward'
]

# Target column(s) the classifier is trained to predict.
PREDICTION_COLS = [
    "label",
]

# De-duplicated union of every column to load. dict.fromkeys keeps first-seen
# order, so the column order is identical on every run (list(set(...)) yields
# an order that changes with string hash randomisation between sessions).
LOAD_COLS = list(dict.fromkeys(FEATURE_COLUMNS + METADATA + PREDICTION_COLS))

# Root directory holding the per-`n_files` parquet feature dumps.
CORE_PATH = 'calculated_features'
# Positional cut points on the time-sorted frame:
# rows [0, 70%) -> train, [70%, 85%) -> eval, [85%, 100%] -> test.
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.85
# Seed passed to the model constructor.
RANDOM_STATE = 42

In [6]:
import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas chained-assignment
# (SettingWithCopy) warnings and library deprecation notices; consider
# narrowing it to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')

In [7]:
def generate_eval_test_preds(eval_split, test_split, model, pred_window):
    """Score a fitted model on the eval and test splits.

    Args:
        eval_split: frame holding FEATURE_COLUMNS and a 'label' column. The
            label column (and the model's predictions) must expose
            ``.to_pandas()`` — presumably cuDF objects; confirm against caller.
        test_split: same layout as ``eval_split``.
        model: fitted estimator exposing ``predict(features)``.
        pred_window: identifier of the prediction window; only used to tag the
            metric names.

    Returns:
        list[dict]: one ``{"Metric Name", "Metric Value"}`` record per
        (split, metric) pair, in the order EVAL then TEST and
        ACC, PREC, REC, AUC within each split.
    """
    eval_features, eval_label = eval_split[FEATURE_COLUMNS], eval_split['label']
    test_features, test_label = test_split[FEATURE_COLUMNS], test_split['label']
    print('Collecting eval predictions')
    eval_preds = model.predict(eval_features)
    print('Collecting test predictions')
    test_preds = model.predict(test_features)
    print('Generating acc, prec, rec, auc metrics')
    # The metric functions come from scikit-learn, so move everything to pandas.
    eval_preds, eval_label, test_preds, test_label = (
        eval_preds.to_pandas(),
        eval_label.to_pandas(),
        test_preds.to_pandas(),
        test_label.to_pandas()
    )

    metric_fns = [
        ("ACC", accuracy_score),
        ("PREC", precision_score),
        ("REC", recall_score),
        ("AUC", roc_auc_score),
    ]
    # Each entry is (split name, ground-truth labels, model predictions).
    scored_splits = [
        ("EVAL", eval_label, eval_preds),
        ("TEST", test_label, test_preds),
    ]

    metric_container = []
    for window, labels, preds in scored_splits:
        for metric, fn in metric_fns:
            print(f"Generating {window} -> {metric}")
            metric_container.append(
                {
                    "Metric Name": f"{metric} {window} {pred_window}",
                    # scikit-learn metrics take the ground truth FIRST:
                    # fn(y_true, y_pred). Passing predictions first swaps
                    # precision with recall and inverts the roles for AUC.
                    # NOTE(review): AUC is computed from hard class
                    # predictions, not probabilities, so it is a single-threshold
                    # AUC; switch to predicted probabilities if a ranking AUC is wanted.
                    "Metric Value": fn(labels, preds)
                }
            )

    return metric_container
    
    


def generate_baseline_model(args):
    """Train and score a cuML random-forest baseline for each prediction window.

    For every window in (10, 20, 30) the function:
      1. reads ``calculated_features_window_{window}.parquet`` from
         ``CORE_PATH/files_used_{args.n_files}``;
      2. sorts rows by ``date_time``, drops rows containing NaN and cuts the
         frame positionally into train / eval / test at TRAIN_SPLIT and
         EVAL_SPLIT;
      3. min-max scales the non-cyclical features with a scaler fitted on the
         train split only (eval/test values may therefore fall outside
         [-1, 1]);
      4. converts the splits to cuDF, fits the forest on train and collects
         eval/test metrics via ``generate_eval_test_preds``.

    All metric records are written to ``baseline_metrics_full_dat.csv`` in the
    working directory once every window has been processed.

    Args:
        args: any object exposing an ``n_files`` attribute, which selects the
            ``files_used_{n_files}`` input directory.

    Returns:
        None. The result is the CSV file written as a side effect.

    Raises:
        AssertionError: if the parquet file for a window does not exist.
    """
    df_container = []
    n_files = args.n_files

    # Columns whose name contains neither 'sin' nor 'cos' get rescaled; the
    # cyclical sin/cos encodings are left untouched. (The condition must be
    # `and`: with `or` it is true for every column, because no name contains
    # both substrings, and nothing would be excluded.)
    feature_cols_scaling = [
        col for col in FEATURE_COLUMNS
        if 'sin' not in col and 'cos' not in col
    ]

    for pred_window in [10, 20, 30]:
        read_path = os.path.join(
            CORE_PATH,
            f'files_used_{n_files}',
            f'calculated_features_window_{pred_window}.parquet'
        )
        assert os.path.exists(read_path), f'Path {read_path} does not exist'
        print(f'Reading {read_path}')
        
        input_df = pd.read_parquet(read_path, columns=FEATURE_COLUMNS + METADATA + PREDICTION_COLS)

        input_df['date_time'] = pd.to_datetime(input_df['date_time'])
        print('Sorting by date_time')
        input_df = input_df.sort_values(by=['date_time'])
        input_df = input_df.dropna()

        # Positional cut points on the time-sorted frame, so eval and test rows
        # are never earlier than the training rows.
        n_rows = len(input_df)
        train_end = int(n_rows * TRAIN_SPLIT)
        eval_end = int(n_rows * EVAL_SPLIT)
        # Take explicit copies: the scaled columns are assigned back into the
        # splits below, and writing into a slice of `input_df` is chained
        # assignment (SettingWithCopyWarning / undefined propagation).
        train_split, eval_split, test_split = (
            input_df.iloc[:train_end].copy(),
            input_df.iloc[train_end:eval_end].copy(),
            input_df.iloc[eval_end:].copy()
        )
        # The three copies now hold all the data; release the source frame so
        # peak host memory does not stay doubled for the rest of the iteration.
        del input_df

        print(f'Train split: {train_split.shape[0]}, eval split: {eval_split.shape[0]}, test split: {test_split.shape[0]}')
        print('Creating scalar range -1, 1')
        min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
        print('Scaling features and fitting scalar')
        train_split[feature_cols_scaling] = min_max_scaler.fit_transform(train_split[feature_cols_scaling])
        print('Scaling evaluation features on fitted scalar')
        eval_split[feature_cols_scaling] = min_max_scaler.transform(eval_split[feature_cols_scaling])
        print('Scaling test features on fitted scalar')
        test_split[feature_cols_scaling] = min_max_scaler.transform(test_split[feature_cols_scaling])
        # Second NaN pass, in case scaling introduced any missing values.
        train_split, eval_split, test_split = (
            train_split.dropna(),
            eval_split.dropna(),
            test_split.dropna()
        )
        
        # Hand the pandas frames over to cuDF for the cuML estimator.
        train_split, eval_split, test_split = (
            gpu_pd.from_pandas(train_split),
            gpu_pd.from_pandas(eval_split),
            gpu_pd.from_pandas(test_split)
        )

        print(f'Data preprocessed and nan dropped: {train_split.shape[0]}, eval split: {eval_split.shape[0]}, test split: {test_split.shape[0]}')
        model = cuRFC(
            n_estimators=100,
            max_depth=10,
            random_state=RANDOM_STATE)
        
        print('Fitting model')
        model.fit(train_split[FEATURE_COLUMNS], train_split['label'])
        print('Model fitted: generating eval preds')
        
        metrics = generate_eval_test_preds(eval_split, test_split, model, pred_window)
        df_container.extend(metrics)
    
    # One row per (metric, split, window) record collected above.
    df_metrics = pd.DataFrame(df_container)
    df_metrics.to_csv("baseline_metrics_full_dat.csv", index=False)


In [8]:
# Lightweight stand-in for parsed command-line arguments:
# generate_baseline_model only reads the `n_files` attribute.
class Args:
    n_files = 30  # selects the calculated_features/files_used_30 input directory

# The class object itself is passed (no instance needed) since only a class
# attribute is read. Runs the full pipeline for every prediction window.
generate_baseline_model(Args)

Reading calculated_features/files_used_30/calculated_features_window_10.parquet
Sorting by date_time
Train split: 26950693, eval split: 5775148, test split: 5775149
Creating scalar range -1, 1
Scaling features and fitting scalar
Scaling evaluation features on fitted scalar
Scaling test features on fitted scalar
Data preprocessed and nan dropped: 26950693, eval split: 5775148, test split: 5775149
Fitting model
Model fitted: generating eval preds
Collecting eval predictions
Collecting test predictions
Generating acc, prec, rec, auc metrics
Generating EVAL -> ACC
Generating EVAL -> PREC
Generating EVAL -> REC
Generating EVAL -> AUC
Generating TEST -> ACC
Generating TEST -> PREC
Generating TEST -> REC
Generating TEST -> AUC
Reading calculated_features/files_used_30/calculated_features_window_20.parquet
Sorting by date_time
Train split: 26950693, eval split: 5775148, test split: 5775149
Creating scalar range -1, 1
Scaling features and fitting scalar
Scaling evaluation features on fitted sca