In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from fastprogress.fastprogress import progress_bar
import time
from IPython.display import Image

import catboost
from catboost import CatBoostClassifier, Pool

from sklift.models import SoloModel, TwoModels, ClassTransformation
from sklift.viz import plot_uplift_preds, plot_qini_curve
from sklift.metrics.metrics import qini_auc_score

from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, ClassifierMixin

from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot

import os

In [None]:
# from google.colab import drive 
# drive.mount('/content/gdrive')

In [2]:
# Seed passed to the train/validation splits and the CatBoost models in this notebook.
RANDOM_SEED = 42
# Prefix prepended to the relative data paths ('data/train.csv', 'data/test.csv').
# Swap in the commented-out value when running on Google Colaboratory.
#INPUT_DIR_PATH = 'gdrive/My Drive/Colab Notebooks/Uplift modeling/Megafon_competition/' # For Google Colaboratory
INPUT_DIR_PATH = ''

In [3]:
def optimize_memory(df):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes.

    Float columns (float64/32/16) and integer columns (int64/32/16) are passed
    through ``pd.to_numeric`` with the matching ``downcast`` option. Object
    columns whose ratio of distinct values to rows is below 0.5 are converted
    to the ``category`` dtype. Columns are reassigned on ``df`` itself, so the
    caller's frame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. For any other type a message is printed and the
        argument is returned untouched.

    Returns
    -------
    The same object that was passed in.
    """
    if not isinstance(df, pd.DataFrame):
        print('df must be a pandas dataframe')
        return df

    float_dtypes = [np.float64, np.float32, np.float16]
    int_dtypes = [np.int64, np.int32, np.int16]

    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype in float_dtypes:
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif col_dtype in int_dtypes:
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif col_dtype == object:
            num_total_values = len(df[col])
            # An empty column has no distinct-value ratio; skip it instead of
            # dividing by zero.
            if num_total_values == 0:
                continue
            num_unique_values = len(df[col].unique())
            if num_unique_values / num_total_values < 0.5:
                df[col] = df[col].astype('category')

    return df

In [4]:
# Build the path with os.path.join so INPUT_DIR_PATH works with or without a
# trailing separator (plain concatenation silently produced a wrong path).
train = pd.read_csv(os.path.join(INPUT_DIR_PATH, 'data', 'train.csv'))
train = optimize_memory(train)
# Seeded preview: the same five rows are shown on every run.
train.sample(5, random_state=RANDOM_SEED)

Unnamed: 0,id,treatment_group,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_42,X_43,X_44,X_45,X_46,X_47,X_48,X_49,X_50,conversion
524609,524609,control,-15.39236,-0.044622,15.340264,37.339787,49.351273,-165.682755,6.102479,87.186852,...,2.827943,215.697815,1.224194,-56.060402,1.210694,-581.038574,-199.578156,27.799335,-5.655712,0
340398,340398,control,-72.174934,0.376704,-13.622648,81.12336,62.589439,245.587067,71.51664,-79.088028,...,225.627457,108.121986,1.46163,-32.267235,0.012962,-51.207626,-72.360222,-173.552383,8.662238,0
487567,487567,control,-34.888592,-0.423316,-35.415543,-33.493195,38.709469,161.724243,-14.061209,-1.698628,...,82.747849,220.934448,0.845487,111.540237,0.848956,-154.850342,-114.972214,-115.840637,13.322409,0
58026,58026,control,-77.667435,0.945189,-14.853078,29.938492,39.199524,64.230095,60.291851,-11.811647,...,-181.741913,115.334801,-0.555093,53.831001,0.196731,74.083679,28.751091,218.4254,-0.452371,0
346479,346479,control,113.499191,1.927675,3.599983,-30.140182,39.914288,-107.276871,-8.437339,26.796019,...,-48.684643,310.204559,-1.155913,-27.113625,0.840909,888.547791,44.420414,27.020794,-34.82439,0


In [5]:
# Build the path with os.path.join so INPUT_DIR_PATH works with or without a
# trailing separator (plain concatenation silently produced a wrong path).
test = pd.read_csv(os.path.join(INPUT_DIR_PATH, 'data', 'test.csv'))
test = optimize_memory(test)
# Seeded preview: the same five rows are shown on every run.
test.sample(5, random_state=RANDOM_SEED)

Unnamed: 0,id,treatment_group,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_41,X_42,X_43,X_44,X_45,X_46,X_47,X_48,X_49,X_50
577449,577449,treatment,49.45491,1.251801,-7.604509,-3.150386,-138.734497,-299.37674,-47.561665,-130.316208,...,-100.580971,30.225328,-88.34919,0.233047,1.293828,0.348233,-339.235168,-47.733768,-34.817074,-29.048889
803078,803078,treatment,-2.045335,0.48741,-66.664825,83.180092,87.065498,135.514954,-49.086739,-5.832792,...,-162.70311,230.958359,-327.595551,0.894962,123.168236,0.159431,-196.421783,86.066147,-177.537354,-18.018614
655399,655399,control,-56.902828,-1.061202,-19.417646,50.56641,40.477215,72.796486,29.19997,-123.445976,...,20.055613,-28.163145,57.292763,-0.286499,28.529348,-1.245791,421.766785,152.385239,71.74778,7.383751
104401,104401,control,-50.609207,-0.266064,30.78994,-3.01849,21.674282,-79.963036,32.190964,-11.215665,...,28.152878,-111.198776,-68.237076,-0.424404,37.585541,0.289776,-514.079651,11.60954,116.036316,17.911335
5682,5682,treatment,-8.02509,1.236707,25.92201,59.943859,-42.591663,-219.223312,-41.96294,-61.102264,...,49.46859,-92.299911,366.210236,0.59911,52.793106,-0.691105,14.962437,-160.089127,147.849915,-10.524097


# Preprocessing

In [6]:
# Feature frame: every training column except the row id and the target.
X = train.drop(columns=['id', 'conversion'])

# Encode the group label numerically (treatment -> 1.0, control -> 0.0).
group_codes = {'treatment': 1, 'control': 0}
X['treatment_group'] = X['treatment_group'].map(group_codes).astype(float)

# Independent copies of the treatment flag and the target.
treatment = X['treatment_group'].copy()
y = train['conversion'].copy()

In [7]:
# Remove the treatment flag from the feature frame by rebinding X rather than
# dropping in place; errors='ignore' keeps the cell re-runnable once the
# column is already gone.
X = X.drop(columns=['treatment_group'], errors='ignore')

# Split features, target and treatment flag in one call so the three stay
# row-aligned. Nothing is mutated afterwards, so pandas no longer emits
# SettingWithCopyWarning. The partition depends only on the row count and
# random_state.
X_train, X_val, y_train, y_val, treatment_train, treatment_val = train_test_split(
    X, y, treatment, test_size=0.3, random_state=RANDOM_SEED
)

# Test features: same columns as the training features.
X_test = test.drop(columns=['id', 'treatment_group'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Stacking
## First strategy

In [8]:
class MyStackingFirstStrategy():
    """Ensemble that averages the predictions of several models.

    Every model in ``models`` must expose ``fit(X, y, treatment)`` and
    ``predict(X)``.
    """

    def __init__(self, models):
        # Kept by reference: fitting the ensemble fits these very objects.
        self.models = models

    def fit(self, X, y, treatment):
        """Fit each model on the same ``X``, ``y`` and ``treatment``."""
        for base_model in self.models:
            base_model.fit(X, y, treatment)

    def predict(self, X):
        """Return the element-wise mean of the models' predictions for ``X``."""
        per_model = [base_model.predict(X) for base_model in self.models]
        stacked = np.array(per_model)
        # Axis 0 runs over models, so the mean collapses it.
        return np.mean(stacked, axis=0)

In [11]:
def show_qini_score(prediction, y, treatment):
    """Return the Qini AUC score formatted as a display string.

    ``y``, ``prediction`` and ``treatment`` are handed to ``qini_auc_score``
    in that order; the stringified result is prefixed with ``'qini score '``.
    """
    score = qini_auc_score(y, prediction, treatment)
    return f'qini score {score!s}'

In [10]:
%%time

# Settings shared by every CatBoost estimator below: fixed seed, silent training.
catboost_kwargs = {'random_seed': RANDOM_SEED, 'verbose': 0}

# Three sklift uplift wrappers, each around its own fresh CatBoost estimator(s).
model1 = SoloModel(CatBoostClassifier(**catboost_kwargs))
model2 = TwoModels(
    CatBoostClassifier(**catboost_kwargs),
    CatBoostClassifier(**catboost_kwargs),
)
model3 = ClassTransformation(CatBoostClassifier(**catboost_kwargs))

models_list = [model1, model2, model3]

# First strategy: fit all three on the training split.
stacking1 = MyStackingFirstStrategy(models_list)
stacking1.fit(X_train, y_train, treatment_train)

It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.


Wall time: 5min 12s


In [15]:
# Score the first strategy on the validation split.
val_uplift_first = stacking1.predict(X_val)
print(show_qini_score(val_uplift_first, y_val, treatment_val))

qini score 0.24082877060388466


## Second strategy

In [8]:
class MyStackingSecondStrategy():
    """Two-level ensemble: base models feed their predictions to a main model.

    ``fit`` holds out 30% of the rows it receives, trains every base model on
    the remaining 70%, and trains ``main_model`` on the base models'
    predictions for the held-out rows. Base models are not refitted on the
    full data afterwards.

    Parameters
    ----------
    models : list
        Base models exposing ``fit(X, y, treatment)`` and ``predict(X)``.
    main_model
        Model exposing ``fit(X, y, treatment)`` and ``predict(X)``; its input
        has one column per base model.
    """

    def __init__(self, models, main_model):
        self.models = models
        self.main_model = main_model

    def fit(self, X, y, treatment):
        """Fit the base models and then the main model on a held-out slice.

        ``X``, ``y`` and ``treatment`` must be row-aligned by position.
        ``treatment`` may be any array-like; its name (if any) is irrelevant.
        """
        # Split all three inputs in one call. The partition depends only on
        # the row count and random_state. This removes the earlier round-trip
        # of gluing `treatment` onto X and fetching it back via a hard-coded
        # 'treatment_group' column, which failed for other series names and
        # mutated the split frames in place.
        (X_base, X_meta,
         y_base, y_meta,
         treatment_base, treatment_meta) = train_test_split(
            X, y, treatment, test_size=0.3, random_state=RANDOM_SEED
        )

        for model in self.models:
            model.fit(X_base, y_base, treatment_base)

        # The main model only ever sees predictions for rows the base models
        # were not trained on.
        self.main_model.fit(self._base_predictions(X_meta), y_meta, treatment_meta)

    def predict(self, X):
        """Return the main model's prediction from the base models' outputs for ``X``."""
        return self.main_model.predict(self._base_predictions(X))

    def _base_predictions(self, X):
        """Stack each base model's prediction for ``X`` as one column per model."""
        return np.array([model.predict(X) for model in self.models]).T

In [9]:
%%time

# Settings shared by every CatBoost estimator below: fixed seed, silent training.
catboost_kwargs = {'random_seed': RANDOM_SEED, 'verbose': 0}

# Base level: three sklift uplift wrappers, each around fresh CatBoost estimator(s).
model1 = SoloModel(CatBoostClassifier(**catboost_kwargs))
model2 = TwoModels(
    CatBoostClassifier(**catboost_kwargs),
    CatBoostClassifier(**catboost_kwargs),
)
model3 = ClassTransformation(CatBoostClassifier(**catboost_kwargs))

models_list = [model1, model2, model3]

# Main model that is trained on the base models' predictions.
main_model = TwoModels(
    CatBoostClassifier(**catboost_kwargs),
    CatBoostClassifier(**catboost_kwargs),
)

stacking2 = MyStackingSecondStrategy(models_list, main_model)
stacking2.fit(X_train, y_train, treatment_train)

It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.


Wall time: 4min 13s


In [12]:
# Score the second strategy on the validation split.
val_uplift_second = stacking2.predict(X_val)
print(show_qini_score(val_uplift_second, y_val, treatment_val))

qini score 0.23373224369085852
