# Imports
This notebook uses the dataset preprocessed by the following [notebook](12_PowerConverter_dataset_preprocessing.ipynb).

**notes**
* CPU monitoring in terminal:  
```bash
top
```
* GPU monitoring in terminal:  
```bash
pip install gpustat
watch -c gpustat -cp --color
```

In [1]:
import sys
import pandas as pd

# Put the parent directory on the module search path (at position 1) unless it
# is already listed there.
module_path = ".."
if module_path not in sys.path:
    sys.path.insert(1, module_path)

# Raise the pandas limits on how many columns / rows are rendered in the notebook.
for display_option, display_limit in (
    ("display.max_columns", 200),
    ("display.max_rows", 300),
):
    pd.set_option(display_option, display_limit)

In [12]:
import re
import tracemalloc
from copy import copy
from datetime import datetime
from time import time
from typing import Union

import dill
import lightgbm as lgbm
import lime
import ltv.common as common
import ltv.data_postprocessing as data_postprocessing
import ltv.data_preprocessing as data_preprocessing
import ltv.lightgbm_optimizer as lgbmo
import matplotlib.pyplot as plt
import numpy as np
import shap
import sklearn
from eli5 import explain_prediction_df, explain_weights, explain_weights_df
from eli5.sklearn import PermutationImportance
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from lime.lime_tabular import LimeTabularExplainer
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.utils import LabelEncoder
from sklearn.metrics import classification_report, log_loss, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import PowerTransformer

tracemalloc.start()

import tracemalloc

import ray
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.wandb import WandbLogger
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.schedulers import AsyncHyperBandScheduler

tracemalloc.start()

# temporarily remove deprecation warnings
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Dataset

**identifiers**

In [3]:
# Column-role metadata read from JSON; only the keys listed below are used here.
column_types = common.json_load("../data/column_types_im.json")

(
    identifier,
    cat_cols,
    target,
    target_ltv,
    n_payments,
    n_payments_ltv,
) = (
    column_types[role]
    for role in (
        "identifier",
        "categorical",
        "target",
        "target_ltv",
        "n_payments",
        "n_payments_ltv",
    )
)

# Names of the derived class-label columns (built from the *_ltv column names).
target_ltv_cls = target_ltv + "_cls"
n_payments_cls = n_payments_ltv + "_cls"

# Experiment constants: seed, task switch ("cls" or "reg") and the two
# test_size values passed to the successive train_test_split calls.
RANDOM_STATE = 1
TASK = "cls"
TEST_SIZE_TRAIN = 0.2
TEST_SIZE_VALID = 0.5

# Dataset
* **[PREV]Oversampling of the spenders**
```
data_reg02_train['temp_cls'] = pd.qcut(data_reg02_train[target_ltv], 15,
                                       labels=False, duplicates='drop',
                                       retbins=False)
#
# SCALING OF THE DATA SO IT IS NOT SCALED ON OVERSAMPLED TRAIN
#
classes = data_reg02_train_scaled['temp_cls'].unique()
major_cls_cnt = data_reg02_train_scaled['temp_cls'].value_counts().max()
cls_cnts = dict(zip(classes, (np.ones(len(classes))*major_cls_cnt).astype(int)))
over = RandomOverSampler(sampling_strategy=cls_cnts, random_state=random_state)
data_reg02_train_scaled_over, data_reg02_train_over_labels = over.fit_resample(data_reg02_train_scaled.drop(columns=['temp_cls']),data_reg02_train_scaled['temp_cls'])
```

In [4]:
# NOTE(review): unpickling can execute arbitrary code; only read this file when
# it comes from a trusted source.
data_raw = pd.read_pickle(
    "../scripts/im_pltv_dataset_v1-1_3d_from_2021-02-01_to_2021-08-20_raw.pkl"
)
# Fill NA - 0 for numerical and 'NA' for categorical
# categorical: the filled frame has to be assigned back. Calling
# fillna(..., inplace=True) on data_raw[cat_cols] only changes a temporary copy,
# so missing values would survive and be turned into the string "nan" by
# astype(str).
# NOTE(review): artifacts fitted on frames produced by the previous version of
# this cell may have seen "nan" instead of "NA" - confirm / regenerate them.
data_raw[cat_cols] = data_raw[cat_cols].fillna("NA").astype(str)
# non-categorical: every column except the categorical ones and the identifier
non_cat_cols = data_raw.drop(columns=cat_cols + [identifier]).columns.tolist()
data_raw[non_cat_cols] = data_raw[non_cat_cols].fillna(0)
# drop columns we are not using in evaluation
data_raw.drop(columns=[n_payments_ltv, identifier], inplace=True)

In [5]:
# Some categorical column names contain "."; per the error log below this breaks
# pytorch-widedeep when the feature is encoded:
# KeyError: 'module name can\'t contain ".", got: emb_layer_battlepass_8008.0'
# Dots are therefore replaced by underscores, both in the column list (updated
# in place) and in the frame itself.
dotted_renames = {col: col.replace(".", "_") for col in cat_cols if "." in col}
cat_cols[:] = [dotted_renames.get(col, col) for col in cat_cols]
data_raw.rename(columns=dotted_renames, inplace=True)

In [6]:
# Switch for the optional power transform of the regression target.
PowerTran = False

if TASK == "cls":
    # Classification uses the rows whose `target` equals zero and labels them by
    # whether `target_ltv` is positive.
    data = data_raw.loc[data_raw[target] == 0].reset_index(drop=True).copy()
    data[target_ltv_cls] = data[target_ltv].gt(0).astype(int)
    class_sizes = data[target_ltv_cls].value_counts()
    print(
        "Size of dataset classes before tutorial players drop:\n{}".format(
            class_sizes
        )
    )
    target_col = target_ltv_cls
elif TASK == "reg":
    # regressor02 - data: rows whose `target` is positive
    # (regressor01 would instead use rows with target == 0 and target_ltv > 0;
    # that variant is not active here)
    data = data_raw.loc[data_raw[target] > 0].reset_index(drop=True).copy()
    # the regression target is the difference target_ltv - target
    data[target_ltv] = data[target_ltv].sub(data[target])
    print(
        "Size of dataset\n\u2022 fraction of all data: {}\n\u2022 number of samples: {}".format(
            len(data) / len(data_raw), len(data)
        )
    )
    target_col = target_ltv

# Project helper; returns the processed frame plus the lists of categorical and
# continuous feature columns that are used from here on.
data, cat_cols_f, cont_cols_f = data_preprocessing.cols_preprocess(
    data, cat_cols, target_col, task=TASK, verbose=True
)

# train / valid / test: hold out TEST_SIZE_TRAIN of the rows first, then divide
# that hold-out into valid and test with TEST_SIZE_VALID. Only the
# classification task stratifies on the class label.
if TASK in ("cls", "reg"):
    stratified = TASK == "cls"
    data_train, holdout = train_test_split(
        data,
        test_size=TEST_SIZE_TRAIN,
        stratify=data[target_ltv_cls] if stratified else None,
        random_state=RANDOM_STATE,
    )
    data_valid, data_test = train_test_split(
        holdout,
        test_size=TEST_SIZE_VALID,
        stratify=holdout[target_ltv_cls] if stratified else None,
        random_state=RANDOM_STATE,
    )

# Give every split a fresh 0..n-1 index.
data_train = data_train.reset_index(drop=True)
data_valid = data_valid.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

# data scale
# The first call passes the scaler name "Standard" and gets back the scaled train
# frame together with a scaler object; the two following calls hand that returned
# object back in for the valid and test frames.
# NOTE(review): presumably `scale` fits when given a name and only transforms
# when given an existing scaler - confirm in ltv.data_preprocessing.
# The second argument lists the categorical columns plus the target column;
# presumably these are the columns left unscaled - TODO confirm.
data_train_scaled, Scaler = data_preprocessing.scale(
    data_train, cat_cols_f + [target_col], scaler_sk="Standard"
)
data_valid_scaled, Scaler = data_preprocessing.scale(
    data_valid, cat_cols_f + [target_col], scaler_sk=Scaler
)
data_test_scaled, Scaler = data_preprocessing.scale(
    data_test, cat_cols_f + [target_col], scaler_sk=Scaler
)

# Regressor only: power-transform the strictly positive target_ltv values.
# The transformer is fitted on the train split and reused for the valid split.
if TASK == "reg" and PowerTran == True:
    Ptran = PowerTransformer(standardize=False)

    train_positive = data_train_scaled[target_ltv] > 0
    train_positive_ltv = data_train_scaled.loc[
        train_positive, target_ltv
    ].values.reshape(-1, 1)
    Ptran.fit(train_positive_ltv)
    data_train_scaled.loc[train_positive, target_ltv] = Ptran.transform(
        train_positive_ltv
    ).flatten()

    valid_positive = data_valid_scaled[target_ltv] > 0
    valid_positive_ltv = data_valid_scaled.loc[
        valid_positive, target_ltv
    ].values.reshape(-1, 1)
    data_valid_scaled.loc[valid_positive, target_ltv] = Ptran.transform(
        valid_positive_ltv
    ).flatten()
    # no need to power-transform test target_ltv

# Drop tutorial players in the train dataset + [CHECK] Dataset imbalance
if TASK == "cls":
    feature_to_drop = "n_ads_watched"
    # NOTE(review): this compares the *scaled* column with 0. If
    # data_preprocessing.scale standardises n_ads_watched, rows that were 0
    # before scaling no longer equal 0 and the filter removes (almost) nothing -
    # confirm, and filter on the unscaled values if so.
    data_train_scaled = data_train_scaled[
        data_train_scaled[feature_to_drop] != 0
    ].reset_index(drop=True)
    # Report the class sizes of the frame that was actually filtered; the
    # unfiltered `data_train` would show the sizes from before the drop.
    print(
        "Size of training dataset classes after tutorial players drop:\n{}".format(
            data_train_scaled[target_ltv_cls].value_counts()
        )
    )
    # The raw LTV column is not a feature of the classifier: remove it from all
    # splits and from the list of continuous feature columns.
    data_train_scaled.drop(columns=[target_ltv], inplace=True)
    data_valid_scaled.drop(columns=[target_ltv], inplace=True)
    data_test_scaled.drop(columns=[target_ltv], inplace=True)
    cont_cols_f.remove(target_ltv)

Size of dataset classes before tutorial players drop:
0    2343739
1      32080
Name: sum_payments_package_key_ltv_cls, dtype: int64
Dropped constant columns:
['key1_get_iap', 'key2_get_iap', 'key3_get_iap', 'diamond_get_iap', 'n_payments_package_key', 'sum_payments_package_key', 'most_frequent_package_key_bought', 'nunique_package_keys_bought', 'battlepass_0_0', 'battlepass_22_0', 'battlepass_23_0', 'battlepass_8004_0', 'battlepass_8005_0', 'battlepass_8008_0', 'time_to_first_purchase', 'time_to_last_purchase', 'time_between_last_purchase_last_login']
Fraction of dropped unique categorical feature values:
most_frequent_network_type      0.500000
most_frequent_iap_bought        0.947368
first_time_zone                 0.589744
first_login_weekday             0.000000
first_device_manufacturer       0.989431
first_device_model              0.873106
most_frequent_country           0.740260
first_login_day_time            0.000000
first_login_country             0.739130
form_factor      

**[PLACEHOLDER]Create classes ZERO-REPEAT vs NON-ZERO-REPEAT vs NON-SPENDER**
* 0 - non-spenders
* 1 - zero-repeat spender
* 2 - non-zero-repeat spender

In [7]:
# data_clsZNZ = data_raw[data_raw[n_payments] > 1].reset_index(drop=True).copy()
# data_clsZNZ[n_payments_cls] = 0
# data_clsZNZ.loc[data_clsZNZ[n_payments_ltv] == 1, n_payments_cls] = 1
# data_clsZNZ.loc[data_clsZNZ[n_payments_ltv] > 1, n_payments_cls] = 2
# data_clsZNZ.drop(columns=[target_ltv, n_payments_ltv, identifier], inplace=True)
# data_clsZNZ[n_payments_cls].value_counts()

## Categorical features transformation

In [8]:
# Chooses how the categorical features are prepared for LightGBM:
# "Label Encoding" or "Entity Embedding".
CAT_FEATURE_TRANSFORMATION = "Entity Embedding"

if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
    # The encoder is fitted on the categorical columns of the full `data` frame.
    label_encoder = LabelEncoder(cat_cols_f)
    label_encoder.fit(data[cat_cols_f])

    def _label_encoded_copy(frame):
        """Return a copy of `frame` with its categorical columns label encoded."""
        encoded = frame.copy()
        encoded[cat_cols_f] = label_encoder.transform(encoded[cat_cols_f])
        return encoded

    data_train_scaled_enc = _label_encoded_copy(data_train_scaled)
    data_valid_scaled_enc = _label_encoded_copy(data_valid_scaled)
    data_test_scaled_enc = _label_encoded_copy(data_test_scaled)
    # Bare expression nested in the `if`: evaluated, but not rendered as output.
    data_test_scaled_enc[cat_cols_f].head()

if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
    # using pretrained embedding from pytorch-widedeep model and its tab_preprocessor
    # NOTE(review): dill.load can execute arbitrary code from the file; only load
    # artifacts produced by a trusted run.
    with open("dl_entity_emb_model_" + TASK + ".dill", "rb") as f:
        model = dill.load(f)
    with open("dl_entity_emb_model_tab_preprocessor_" + TASK + ".dill", "rb") as f:
        tab_preprocessor = dill.load(f)

    # Tab2Vec returns the transformed features as a dataframe plus the target
    # values, which are attached again as the `target_col` column below.
    t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor, return_dataframe=True)
    data_train_scaled_enc, data_train_y = t2v.transform(
        data_train_scaled, target_col=target_col
    )
    data_valid_scaled_enc, data_valid_y = t2v.transform(
        data_valid_scaled, target_col=target_col
    )
    data_test_scaled_enc, data_test_y = t2v.transform(
        data_test_scaled, target_col=target_col
    )
    data_train_scaled_enc[target_col] = data_train_y
    data_valid_scaled_enc[target_col] = data_valid_y
    data_test_scaled_enc[target_col] = data_test_y

    # Collect, per categorical feature, the output columns whose name starts with
    # that feature name. A literal prefix test is used on purpose: the pattern
    # `cat_col + "*"` made the last character of the name optional/repeatable and
    # treated any regex metacharacter in the name as syntax.
    # NOTE(review): a prefix test still over-matches when one column name is a
    # prefix of another - tighten to the exact embedding suffix if needed.
    cols_list = list(data_test_scaled_enc.columns)
    cat_cols_f_emb = [
        col for cat_col in cat_cols_f for col in cols_list if col.startswith(cat_col)
    ]
# data_test_scaled_enc[cat_cols_f_emb].head()

# LightGBM

In [9]:
# Work on a random subset of the train / valid frames (presumably to keep the
# hyper-parameter search short - the full frames are not used below).
# * seeded with RANDOM_STATE so a fresh "Restart & Run All" draws the same rows
# * capped at the frame length, because DataFrame.sample raises when more rows
#   are requested than exist (sampling without replacement)
data_train_scaled_enc = data_train_scaled_enc.sample(
    n=min(100000, len(data_train_scaled_enc)), random_state=RANDOM_STATE
)
data_valid_scaled_enc = data_valid_scaled_enc.sample(
    n=min(30000, len(data_valid_scaled_enc)), random_state=RANDOM_STATE
)

In [10]:
# Select objective, evaluation function and the metric key Ray Tune optimises.
# The branch must test TASK: comparing the function `tracemalloc.take_snapshot`
# with "cls" is always False, which left `fobj`, `feval` and `ray_metric`
# undefined for the classification task.
# NOTE(review): `config` is assigned in a later cell, which replaces the whole
# dict; the keys written here are lost on a top-to-bottom run and this cell
# fails on a fresh kernel - move these settings next to the config definition.
if TASK == "cls":
    # config["objective"] = "binary"
    # config["metric"] = "binary_logloss"
    # focal_loss https://github.com/jrzaurin/LightGBM-with-Focal-Loss
    config["is_unbalance"] = True
    # NOTE(review): focal_loss_lgb / focal_loss_lgb_eval_error are not defined or
    # imported in the visible part of the notebook - confirm where they come from.
    # Both wrappers fix the two trailing arguments to 0.25 and 1.0.
    fobj = focal_loss = lambda x, y: focal_loss_lgb(x, y, 0.25, 1.0)
    feval = eval_error = lambda x, y: focal_loss_lgb_eval_error(x, y, 0.25, 1.0)
    ray_metric = "-" + "focal_loss"

if TASK == "reg":
    config["objective"] = "regression"
    config["metric"] = "rmse"
    fobj = None
    feval = None
    ray_metric = "-" + config["metric"]

# LightGBM only receives categorical column names for label encoding; with
# entity embeddings the list of categorical features is empty.
if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
    lgb_cat_cols = cat_cols_f
if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
    lgb_cat_cols = []

def _features_and_label(frame):
    """Split `frame` into (all columns except `target_col`, the `target_col` column)."""
    return frame.drop(columns=[target_col]), frame[target_col]


# Datasets for the hyper-parameter search: train, and valid referencing train.
train_features, train_label = _features_and_label(data_train_scaled_enc)
lgbtrain = lgbm.Dataset(
    train_features,
    train_label,
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
valid_features, valid_label = _features_and_label(data_valid_scaled_enc)
lgbvalid = lgbm.Dataset(
    valid_features,
    valid_label,
    reference=lgbtrain,
    free_raw_data=False,
)

# Final TRAIN/TEST: the final model is fitted on train + valid stacked together
# and monitored on the test split.
ftrain = pd.concat([data_train_scaled_enc, data_valid_scaled_enc]).reset_index(
    drop=True
)
final_features, final_label = _features_and_label(ftrain)
flgbtrain = lgbm.Dataset(
    final_features,
    final_label,
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
test_features, test_label = _features_and_label(data_test_scaled_enc)
lgbtest = lgbm.Dataset(
    test_features,
    test_label,
    categorical_feature=lgb_cat_cols,
    reference=flgbtrain,
    free_raw_data=False,
)

In [47]:
# Wall-clock reference for the optimisation timing reported further down.
start = time()

# Settings stored under the "wandb" key of the search configuration.
wandb_settings = {
    "project": "GBM_classifier",
    "api_key_file": "../data/wandb_api.key",
    "log_config": True,
}

# Ray Tune search space for the three tuned parameters, plus the wandb settings.
config = {
    "eta": tune.loguniform(1e-4, 1e-1),
    "subsample": tune.uniform(0.5, 1.0),
    "max_depth": tune.randint(1, 9),
    "wandb": wandb_settings,
}


def training_function(config, train, valid):
    """Train one LightGBM model for a single Ray Tune trial.

    Args:
        config: Trial configuration; must contain a "wandb" entry, which is
            removed from a copy before the remaining keys are passed to LightGBM.
        train: Training dataset passed to ``lgbm.train``.
        valid: Validation dataset, registered under the empty name ``""``.

    Uses the module-level ``fobj`` and ``ray_metric``. Nothing is returned; the
    metrics listed below are handed to ``TuneReportCheckpointCallback``.
    """
    booster_params = config.copy()
    del booster_params["wandb"]

    eval_functions = [
        lgbmo.feval,
        lgbmo.lgb_focal_f1_1,
        lgbmo.lgb_focal_f1_0,
        lgbmo.lgb_focal_recall_0,
        lgbmo.lgb_focal_recall_1,
        lgbmo.lgb_focal_precision_0,
        lgbmo.lgb_focal_precision_1,
        lgbmo.lgb_focal_accuracy,
    ]

    # Each metric is reported to Tune under the same key it has in the
    # evaluation results.
    # NOTE(review): the leading "-" presumably stems from the empty validation
    # set name ("<valid name>-<metric>") - confirm against the callback docs.
    reported_metrics = [
        ray_metric,
        "-f1_0",
        "-f1_1",
        "-recall_0",
        "-recall_1",
        "-precision_0",
        "-precision_1",
        "-accuracy",
    ]
    report_callback = TuneReportCheckpointCallback(
        {metric_key: metric_key for metric_key in reported_metrics}
    )

    lgbm.train(
        booster_params,
        train,
        valid_sets=[valid],
        valid_names=[""],
        verbose_eval=False,
        feval=eval_functions,
        fobj=fobj,
        callbacks=[report_callback],
    )


# Scheduler for the search: minimises `ray_metric`, measured per
# "training_iteration", with the grace period / cap / reduction settings below.
asha_scheduler = AsyncHyperBandScheduler(
    metric=ray_metric,
    mode="min",
    time_attr="training_iteration",
    grace_period=10,
    max_t=100,
    reduction_factor=3,
    brackets=1,
)

# Bind the LightGBM datasets to the trainable so every trial receives them as
# the `train` / `valid` arguments.
trainable = tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid)

analysis = tune.run(
    trainable,
    config=config,
    scheduler=asha_scheduler,
    num_samples=2,
    # resources_per_trial={"cpu": 4, "gpu": 0},
    progress_reporter=JupyterNotebookReporter(overwrite=True),
    loggers=DEFAULT_LOGGERS + (WandbLogger,),
)

# `datetime` in this notebook is the class (from datetime import datetime), which
# has no `timedelta` attribute, so timedelta is imported explicitly here.
from datetime import timedelta

runtime = time() - start
# The format string needs a "{}" placeholder, otherwise the runtime is silently
# dropped; formatted like the final-training time below.
print("Optimization time:\n{}".format(str(timedelta(seconds=runtime))))

# Best trial configuration according to the optimised metric, without the
# logging-only "wandb" entry.
params = copy(analysis.get_best_config(ray_metric, "min"))
params.pop("wandb")
# params["n_estimators"] = 1000

# NOTE(review): unlike training_function, this call passes neither `fobj` nor an
# "objective" entry (params only holds the tuned keys), so the final model is
# trained with LightGBM's default objective - confirm that this is intended.
start = time()
model = lgbm.train(
    params,
    flgbtrain,
    valid_sets=[lgbtest],
    callbacks=[lgbm.log_evaluation(show_stdv=False)],
)
runtime = time() - start
print("Final model training time:\n{}".format(str(timedelta(seconds=runtime))))

Trial name,status,loc,eta,max_depth,subsample,iter,total time (s),-focal_loss,-f1_0,-f1_1
training_function_8361f_00000,TERMINATED,,0.00186841,3,0.572318,100,36.2122,0.201523,0.993289,0
training_function_8361f_00001,TERMINATED,,0.00128785,5,0.828295,10,8.01187,0.253181,0.993289,0


2021-10-25 13:58:19,895	INFO tune.py:617 -- Total run time: 41.70 seconds (41.51 seconds for the tuning loop).


Optimization time:



In [40]:
# Display the per-trial dataframes held by the Tune analysis object.
analysis.trial_dataframes

## Prediction & evaluation

In [41]:
# The booster returns scores rather than labels, so they are rounded to the
# nearest integer to obtain labels. Rounding is restricted to the classification
# task: regression predictions must stay continuous.
predictions = model.predict(lgbtest.data)
if TASK == "cls":
    predictions = np.rint(predictions)
# NOTE(review): assumes the rows of lgbtest.data are in the same order as
# data_test - confirm that the encoding step preserves row order.
result = pd.DataFrame({"PLTV": predictions, "LTV": data_test[target_ltv].values})

if TASK == "cls":
    print(
        "Classification report:\n{}".format(
            classification_report((result["LTV"] > 0).astype(int), result["PLTV"])
        )
    )
    # Share of the total LTV that belongs to rows with positive LTV which were
    # predicted as class 0.
    print(
        "Missed summed LTV fraction in target_ltv==0 dataset as of incorrect classification:\n{}".format(
            result[(result["LTV"] > 0) & (result["PLTV"] == 0)]["LTV"].sum()
            / result["LTV"].sum()
        )
    )
    print("Summed LTV in target_ltv==0 dataset:\n{}".format(result["LTV"].sum()))
if TASK == "reg":
    if PowerTran:
        # inverse_transform returns an (n, 1) array; flatten it before assigning
        # it to a single column.
        result["PLTV"] = Ptran.inverse_transform(
            result["PLTV"].to_numpy().reshape(-1, 1)
        ).ravel()
    # Square root of the MSE instead of `squared=False`, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(result["LTV"], result["PLTV"]))
    print("RMSE score:\n{}".format(str(rmse)))

Classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    234374
           1       0.33      0.01      0.02      3208

    accuracy                           0.99    237582
   macro avg       0.66      0.51      0.51    237582
weighted avg       0.98      0.99      0.98    237582

Missed summed LTV fraction in target_ltv==0 dataset as of incorrect classification:
0.9845011019458371
Summed LTV in target_ltv==0 dataset:
87082.31999999999


# APPENDIX - Tensorboard

In [None]:
# Notebook helper module shipped with TensorBoard.
from tensorboard import notebook

# Presumably lists the TensorBoard instances currently known on this machine -
# see the tensorboard.notebook documentation.
notebook.list()

In [None]:
%load_ext tensorboard
%tensorboard --logdir ~/ray_results