In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before executing code.
%load_ext autoreload
%autoreload 2

# Install LightAutoML

Uncomment the cell below if you have not cloned the repository with git (e.g. when running on Colab or Kaggle).

In [None]:
# Uncomment the next line to install LightAutoML into the environment of the running kernel:
# %pip install -U lightautoml

# Import necessary libraries

In [None]:
# Standard python libraries
from copy import deepcopy
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

from lightautoml.addons.uplift.base import AutoML, AutoUplift, AutoUpliftTX, BaseLearnerWrapper, MetaLearnerWrapper, Wrapper
from lightautoml.addons.uplift import meta_learners
from lightautoml.addons.uplift.metrics import (_available_uplift_modes,
                                               TUpliftMetric,
                                               calculate_graphic_uplift_curve,
                                               calculate_min_max_uplift_auc,
                                               calculate_uplift_at_top,
                                               calculate_uplift_auc,
                                               perfect_uplift_curve)
from lightautoml.addons.uplift.utils import create_linear_automl
from lightautoml.report.report_deco import ReportDecoUplift


%matplotlib inline

# Parameters

## Setting

In [None]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 300 # Time in seconds for automl run
TARGET_NAME = 'TARGET' # Target column name
TREATMENT_NAME = 'CODE_GENDER' # Treatment column name (binarized in the feature preparation cell)

## Fix torch number of threads and numpy seed

In [None]:
# Seed numpy's global random generator for reproducibility.
np.random.seed(RANDOM_STATE)
# Limit the number of threads torch uses to the configured value.
torch.set_num_threads(N_THREADS)

# Example data load

Download the dataset from the repository if it is not available locally (i.e. the repository was not cloned with git).

In [None]:
# Local directory and file name of the example dataset.
DATASET_DIR = './example_data/test_data_files'
DATASET_NAME = 'sampled_app_train.csv'
# Full local path of the dataset file.
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)
# Remote location the dataset is downloaded from when the local file is missing.
DATASET_URL = 'https://raw.githubusercontent.com/sberbank-ai-lab/LightAutoML/master/example_data/test_data_files/sampled_app_train.csv'

In [None]:
%%time

if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    dataset = requests.get(DATASET_URL).text
    with open(DATASET_FULLNAME, 'w') as output:
        output.write(dataset)

In [None]:
%%time

data = pd.read_csv('./example_data/test_data_files/sampled_app_train.csv')
data.head()

# (Optional) Some user feature preparation

In [None]:
%%time 

# NOTE: this cell modifies `data` in place and drops the columns it reads from,
# so it cannot be re-run without reloading `data` first.

# Turn the day offsets into date strings, counted from 2018-01-01.
data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
# Positive DAYS_EMPLOYED values are clipped to 0 before the conversion.
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                    ).astype(str)

# Same date for every row; it is referenced in the roles definition.
data['report_dt'] = np.datetime64('2018-01-01')

# A constant column and an all-NaN column.
data['constant'] = 1
data['allnan'] = np.nan

# The raw day-offset columns are replaced by the date columns created above.
data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)


# Binarize the treatment column: 1 where the value is 'M', 0 otherwise.
data['CODE_GENDER'] = (data['CODE_GENDER'] == 'M').astype(int)

# Data splitting for train-test

In [None]:
%%time


stratify_value = data[TARGET_NAME] + 10 * data[TREATMENT_NAME]

train, test = train_test_split(data, test_size=3000, stratify=stratify_value, random_state=42)

test_target, test_treatment = test[TARGET_NAME].values.ravel(), test[TREATMENT_NAME].values.ravel()

# Setup columns roles

In [None]:
%%time

# Column roles passed to every `fit` call in this notebook.
roles = {
    'target': TARGET_NAME,
    'treatment': TREATMENT_NAME,
    # 'report_dt' gets a datetime role flagged as the base date, with no seasonality and base_feats disabled.
    DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt'
}

# Uplift modeling

## AutoUplift (use predefined uplift methods)

### Fit autouplift

In [None]:
%%time

task = Task('binary')

autouplift = AutoUplift(task,
                        add_dd_candidates=True,
                        metric='adj_qini', 
                        normed_metric=True, 
                        test_size=0.2, 
                        threshold_imbalance_treatment=0.0,
                        timeout=100)

autouplift.fit(train, roles)

### Show rating of uplift methods (meta-learners)

In [None]:
%%time

# Fetch the meta-learner rating from the fitted AutoUplift and display it as the cell output.
rating_table = autouplift.get_metalearners_ranting()
rating_table

### Get the best metalearner with report functionality (it must be refit on the train data to generate the report)

In [None]:
%%time 

best_metalearner_repo = autouplift.create_best_metalearner(need_report=True, update_metalearner_params={'timeout': 100})
best_metalearner_repo.fit(train, roles)
best_metalearner_repo.predict(test)

# Path to report: PATH_TO_CURRENT_NOTEBOOK/lama_report/lama_interactive_report.html

### Predict to test data and check metrics

In [None]:
%%time

uplift_pred, treatment_pred, control_pred = autouplift.predict(test)
uplift_pred = uplift_pred.ravel()

roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

## AutoUplift (custom uplift methods)

### Fit autouplift

In [None]:
%%time

# Set uplift candidate for choosing best of them
# !!!ATTENTION!!!
#    This is a demonstration of the possibilities, 
#    You may use default set of candidates 

task = Task('binary')

uplift_candidates = [
    MetaLearnerWrapper(
        name='TLearner__Default', 
        klass=meta_learners.TLearner, 
        params={'base_task': task}
    ),  
    MetaLearnerWrapper(
        name='TLearner__Custom', 
        klass=meta_learners.TLearner, 
        params={
            'treatment_learner': BaseLearnerWrapper(
                name='__TabularAutoML__',
                klass=TabularAutoML, 
                params={'task': task, 'timeout': 10}),
            'control_learner': BaseLearnerWrapper(
                name='__Linear__',
                klass=create_linear_automl,
                params={'task': Task('binary')})
        }
    ),
    MetaLearnerWrapper(
        name='XLearner__Custom',
        klass=meta_learners.XLearner,
        params={
            'outcome_learners': [
                TabularAutoML(task=task, timeout=10), # [sec] , Only speed up example, don't change it!
                create_linear_automl(task=Task('binary'))
            ],
            'effect_learners': [BaseLearnerWrapper(
                name='__TabularAutoML__',
                klass=TabularAutoML, 
                params={'task': Task('reg'), 'timeout': 5})],
            'propensity_learner': create_linear_automl(task=Task('binary')),
        }    
    )
]

autouplift = AutoUplift(task,
                        uplift_candidates=uplift_candidates, 
                        add_dd_candidates=True,
                        metric='adj_qini', 
                        normed_metric=True, 
                        test_size=0.2, 
                        threshold_imbalance_treatment=0.0,    # Doesn't affect, see warnings
                        timeout=600)                          # Doesn't affect, see warnings

autouplift.fit(train, roles)

### Show rating of uplift methods (meta-learners)

In [None]:
%%time

# Fetch the meta-learner rating from the fitted AutoUplift and display it as the cell output.
rating_table = autouplift.get_metalearners_ranting()
rating_table

### Predict to test data and check metrics

In [None]:
%%time

uplift_pred, treatment_pred, control_pred = autouplift.predict(test)
uplift_pred = uplift_pred.ravel()

roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

### Get the best metalearner with report functionality (it must be refit on the train data to generate the report)

In [None]:
%%time 

best_metalearner_repo = autouplift.create_best_metalearner(need_report=False, update_metalearner_params={'timeout': 60})
best_metalearner_repo.fit(train, roles)
best_metalearner_repo.predict(test)

## AutoUplift with custom metric

### Fit autouplift

In [None]:
%%time

# Using a custom metric
# How to determine custom metric, see below

task = Task('binary')


class CustomUpliftMetric(TUpliftMetric):
    def __call__(self, target: np.ndarray, uplift_pred: np.ndarray, treatment: np.ndarray) -> float:
        up_10 = calculate_uplift_at_top(target, uplift_pred, treatment, 10)
        up_20 = calculate_uplift_at_top(target, uplift_pred, treatment, 20)
    
        return 0.5 * (up_10 + up_20)

autouplift = AutoUplift(task,
                        add_dd_candidates=True,
                        metric=CustomUpliftMetric(), 
                        normed_metric=True, 
                        test_size=0.2, 
                        threshold_imbalance_treatment=0.0,
                        cpu_limit=10,
                        timeout=100)

autouplift.fit(train, roles)

### Show rating of uplift methods (meta-learners)

In [None]:
%%time

# Fetch the meta-learner rating from the fitted AutoUplift and display it as the cell output.
rating_table = autouplift.get_metalearners_ranting()
rating_table

### Get the best metalearner with report functionality (it must be refit on the train data to generate the report)

In [None]:
%%time 

# Warning: can't create best metalearner with report functionaly when custom metric is defined.
# Return just best metalearner

best_metalearner_repo = autouplift.create_best_metalearner(need_report=True, update_metalearner_params={'timeout': 60})
best_metalearner_repo.fit(train, roles)
best_metalearner_repo.predict(test)

## AutoUpliftTX

### Fit autouplift

In [None]:
%%time

# AutoUpliftTX for a binary task with timeout=180, fitted on the train data.
autouplift = AutoUpliftTX(Task('binary'), timeout=180)
autouplift.fit(train, roles)

### Show rating of uplift methods (meta-learners)

In [None]:
%%time

# Fetch the meta-learner rating from the fitted AutoUpliftTX and display it as the cell output.
rating_table = autouplift.get_metalearners_ranting()
rating_table

### Predict to test data and check metrics

In [None]:
%%time

uplift_pred, treatment_pred, control_pred = autouplift.predict(test)
uplift_pred = uplift_pred.ravel()

roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

### Get the best metalearner with report functionality (it must be refit on the train data to generate the report)

In [None]:
%%time 

best_metalearner_repo = autouplift.create_best_metalearner(need_report=True, update_metalearner_params={'timeout': 100})
best_metalearner_repo.fit(train, roles)
best_metalearner_repo.predict(test)

# Path to report: PATH_TO_CURRENT_NOTEBOOK/lama_report/lama_interactive_report.html

### Customization of training

#### The list of baselearners

In [None]:
%%time
# The list of baselearners will be used for each stage.
# Names of baselearnerwrapper should be unique, but if not is will be renamed.

autouplift = AutoUpliftTX(
    Task('binary'),
    baselearners = [
        BaseLearnerWrapper(
            name='__Linear__',
            klass=create_linear_automl,
            params={}
        )
    ],
    metalearners=['XLearner'],
    timeout=120,
    timeout_single_learner=15
)
autouplift.fit(train, roles)

#### Several stages

In [None]:
%%time
# Specified stage with baselearners
# For remain stages will be used default baselearners

blw_lin = BaseLearnerWrapper(
    name='__Linear__',
    klass=create_linear_automl,
    params={'task': Task('binary')}
)

blw_tab = BaseLearnerWrapper(
    name='__TabularAutoML__',
    klass=TabularAutoML,
    params={'task': Task('binary'), 'timeout': 60}
)

autouplift = AutoUpliftTX(
    Task('binary'),
    baselearners = {
        ('propensity',): [deepcopy(blw_lin)],
        ('outcome_control',): [deepcopy(blw_lin)],
        ('outcome_treatment',): [deepcopy(blw_tab)],
    },
    timeout=None,
    timeout_single_learner=30
)
autouplift.fit(train, roles)

## MetaLearner

### TLearner

#### Fit on train data

In [None]:
%%time

# Default setting: only the base task and a CPU limit are passed to the T-learner.
tlearner = meta_learners.TLearner(base_task=Task('binary'), cpu_limit=5)
tlearner.fit(train, roles)

#### Predict to test data and check metrics

In [None]:
%%time

uplift_pred, treatment_pred, control_pred = tlearner.predict(test)
uplift_pred = uplift_pred.ravel()

roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

### XLearner

#### Fit on train data

In [None]:
%%time

# Custom base algorithm
xlearner = meta_learners.XLearner(
    propensity_learner=TabularAutoML(task=Task('binary'), timeout=10),
    outcome_learners=[
        TabularAutoML(task=Task('binary'), timeout=10),
        TabularAutoML(task=Task('binary'), timeout=10)
    ],
    effect_learners=[
        TabularAutoML(task=Task('reg'), timeout=10),
        TabularAutoML(task=Task('reg'), timeout=10)
    ]
)
xlearner.fit(train, roles)

#### Predict to test data and check metrics

In [None]:
%%time

uplift_pred, treatment_pred, control_pred = xlearner.predict(test)
uplift_pred = uplift_pred.ravel()

roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

# Uplift metrics and graphics (using xlearner predictions)

In [None]:
%%time 

# Metric mode used for the uplift curves drawn in the following cells.
UPLIFT_METRIC = 'adj_qini'

# Log the metric modes exposed by the uplift metrics module.
logging.info("All available uplift metrics: %s", _available_uplift_modes)

## Algorithm uplift curve 

In [None]:
%%time

# Algorithm curve: x/y points of the uplift curve built from the most recent `uplift_pred`
# (the X-learner predictions when the notebook is run top to bottom).
xs_xlearner, ys_xlearner = calculate_graphic_uplift_curve(
    test_target, uplift_pred, test_treatment, mode=UPLIFT_METRIC
)

## Baseline, perfect curve

In [None]:
# Baseline curve: a straight line from the origin to the last point of the algorithm curve
xs_base = [0, 1]
ys_base = [0, ys_xlearner[-1]]

# Perfect curve
perfect_uplift = perfect_uplift_curve(test_target, test_treatment)
xs_perfect, ys_perfect = calculate_graphic_uplift_curve(
    test_target,
    perfect_uplift,
    test_treatment,
    mode=UPLIFT_METRIC,
)

In [None]:
# Draw the baseline, algorithm and perfect uplift curves on one figure.
plt.figure(figsize=(10, 7))

plt.plot(xs_base, ys_base, 'black')
plt.plot(xs_xlearner, ys_xlearner, 'red')
plt.plot(xs_perfect, ys_perfect, 'green')

# Shade the area under the algorithm curve.
plt.fill_between(xs_xlearner, ys_xlearner, alpha=0.5, color='orange')

plt.xlabel('Cumulative percentage of people in T/C groups')
# str.format needs a '{}' placeholder; the former '%s' was never substituted and was shown literally.
plt.ylabel('Uplift metric ({})'.format(UPLIFT_METRIC))
plt.grid()
plt.legend(['Baseline', 'XLearner', 'Perfect']);

## Uplift TOP-K

In [None]:
# Cutoffs from 5 to 100 in steps of 5
tops = np.arange(5, 101, 5)

# Uplift at every cutoff
uplift_at_tops = [
    calculate_uplift_at_top(test_target, uplift_pred, test_treatment, top=cutoff)
    for cutoff in tops
]

plt.figure(figsize=(10, 7))
plt.plot(tops, uplift_at_tops, marker='.')
plt.legend(['Uplift_At_K'])
plt.xticks(np.arange(0, 101, 10))
plt.grid()

## Custom metric

In [None]:
# A custom metric can be used in AutoUplift (AutoUpliftTX).
# It must be callable with the following signature:
# def custom_metric(target, uplift_pred, treatment) -> float:


class CustomUpliftMetric(TUpliftMetric):
    """Uplift metric equal to the mean of the uplift at top 10 and at top 20."""

    def __call__(self, target: np.ndarray, uplift_pred: np.ndarray, treatment: np.ndarray) -> float:
        top_scores = [
            calculate_uplift_at_top(target, uplift_pred, treatment, top)
            for top in (10, 20)
        ]
        return 0.5 * (top_scores[0] + top_scores[1])


# Evaluate the metric on the test data with the current uplift predictions.
metric = CustomUpliftMetric()
metric_value = metric(test_target, uplift_pred, test_treatment)

print("Metric = {}".format(metric_value))

# Report

In [None]:
%%time

# Wrap a T-learner with the uplift report decorator, then fit it and predict through the wrapper.
RDU = ReportDecoUplift()
tlearner_deco = RDU(meta_learners.TLearner(base_task=Task('binary')))
tlearner_deco.fit(train, roles)
# The trailing semicolon suppresses the display of the returned predictions.
tlearner_deco.predict(test);

# Path to report: PATH_TO_CURRENT_NOTEBOOK/lama_report/lama_interactive_report.html