# R: A/B Testing with DoubleML

<img src="figures/ab_testing.jpg" alt="An illustration of A/B testing." style="width: 400px;"/>

Image source: Freepik (http://www.freepik.com), designed by macrovector


In [1]:
# Load required packages for this tutorial
library(DoubleML)
library(mlr3)
library(mlr3learners)
library(data.table)
library(ggplot2)

# Suppress messages during fitting: fetch the "mlr3" logger via lgr and set
# its threshold to "warn"
lgr::get_logger("mlr3")$set_threshold("warn")


"package 'mlr3' was built under R version 4.1.0"
"package 'data.table' was built under R version 3.6.3"


In [None]:
# fread() below is provided by data.table
library(data.table)

# Read the data set directly from its URL (internet connection required)
url = "https://raw.githubusercontent.com/DoubleML/doubleml-docs/master/doc/examples/data/high42.CSV"
df = fread(input = url)

# Number of rows and columns
dim(df)

# TODO: Add info on types of variables
# Compact overview of the columns of the data
str(df)

In [None]:
# Show the first six rows of the data
head(df, n = 6L)

In [None]:
# Bar chart with the number of observations for each value of A
hist_A = ggplot(data = df, mapping = aes(x = A, fill = factor(A))) +
  geom_bar() +
  labs(title = "Treatment", x = "A") +
  theme_minimal() +
  theme(
    text = element_text(size = 20),
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

# Display the plot
hist_A

In [None]:
# Unconditional ATE
# Mean of Y within each level of A
df[, mean(Y), by = A]

# Difference between the mean of Y for A == 1 and the mean of Y for A == 0
ATE_uncond = df[A == 1, mean(Y)] - df[A == 0, mean(Y)]
ATE_uncond

### `DoubleMLData`


In [None]:
# Use every column whose name contains "V" as a covariate
features_base = grep("V", colnames(df), value = TRUE)

# Data backend with outcome Y, treatment A and the covariates selected above
data_dml = DoubleMLData$new(
  data = df,
  y_col = "Y",
  d_cols = "A",
  x_cols = features_base
)

In [None]:
# Print the DoubleMLData object `data_dml`
print(data_dml)

## 1. Interactive Regression Model (IRM)


In [None]:
# TODO: helper function for prediction of nuisance

# Placeholder: takes no arguments and returns NULL until it is implemented
pred_acc_irm = function() {
  NULL
}

### Benchmark: Model based on Linear and Logistic Regression

In [None]:
# Benchmark learners: linear regression (ml_g) and logistic regression (ml_m)
linreg = lrn("regr.lm")
logreg_class = lrn("classif.log_reg")

# Fix the seed before the model object is constructed
set.seed(1234)
dml_irm_regression = DoubleMLIRM$new(
  data = data_dml,
  ml_g = linreg,
  ml_m = logreg_class,
  n_folds = 3,
  n_rep = 3,
  trimming_threshold = 0.025
)

# Fit the model, keeping the predictions, and show the summary
dml_irm_regression$fit(store_predictions = TRUE)
dml_irm_regression$summary()

In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

### Double Machine Learning based on Lasso


In [None]:
# Cross-validated glmnet learners (5 inner folds, s = "lambda.min"):
# regression for ml_g, classification for ml_m
lasso = lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min")
lasso_class = lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min")

# Fix the seed before the model object is constructed
set.seed(1234)
dml_irm_lasso = DoubleMLIRM$new(
  data = data_dml,
  ml_g = lasso,
  ml_m = lasso_class,
  n_folds = 3,
  n_rep = 3,
  trimming_threshold = 0.025
)

# Fit the model, keeping the predictions, and show the summary
dml_irm_lasso$fit(store_predictions = TRUE)

dml_irm_lasso$summary()

In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

### Double Machine Learning based on Random Forest


In [None]:
# ranger learners: regression for ml_g, classification for ml_m
randomForest = lrn("regr.ranger")
randomForest_class = lrn("classif.ranger")

# Fix the seed before the model object is constructed
set.seed(1234)
dml_irm_forest = DoubleMLIRM$new(
  data = data_dml,
  ml_g = randomForest,
  ml_m = randomForest_class,
  n_folds = 3,
  n_rep = 3,
  trimming_threshold = 0.025
)

# Set nuisance-part specific parameters; ml_g0, ml_g1 and ml_m all get the
# same values for treatment variable "A"
dml_irm_forest$set_ml_nuisance_params(
  learner = "ml_g0",
  treat_var = "A",
  params = list("mtry" = 200, "num.trees" = 250)
)
dml_irm_forest$set_ml_nuisance_params(
  learner = "ml_g1",
  treat_var = "A",
  params = list("mtry" = 200, "num.trees" = 250)
)
dml_irm_forest$set_ml_nuisance_params(
  learner = "ml_m",
  treat_var = "A",
  params = list("mtry" = 200, "num.trees" = 250)
)

# Fit the model, keeping the predictions, and show the summary
dml_irm_forest$fit(store_predictions = TRUE)
dml_irm_forest$summary()

In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

### Double Machine Learning based on Extreme Gradient Boosting

In [None]:
# xgboost learners: squared-error regression for ml_g, binary logistic
# classification (evaluated with logloss) for ml_m
boost = lrn("regr.xgboost", objective = "reg:squarederror")
boost_class = lrn(
  "classif.xgboost",
  objective = "binary:logistic",
  eval_metric = "logloss"
)

# Fix the seed before the model object is constructed
set.seed(1234)
dml_irm_boost = DoubleMLIRM$new(
  data = data_dml,
  ml_g = boost,
  ml_m = boost_class,
  n_folds = 3,
  n_rep = 3,
  trimming_threshold = 0.025
)

# Nuisance-part specific parameters for treatment variable "A":
# ml_g0 and ml_g1 use 30 rounds, ml_m uses 15 rounds, all with eta = 0.2
dml_irm_boost$set_ml_nuisance_params(
  learner = "ml_g0",
  treat_var = "A",
  params = list("nrounds" = 30, "eta" = 0.2)
)
dml_irm_boost$set_ml_nuisance_params(
  learner = "ml_g1",
  treat_var = "A",
  params = list("nrounds" = 30, "eta" = 0.2)
)
dml_irm_boost$set_ml_nuisance_params(
  learner = "ml_m",
  treat_var = "A",
  params = list("nrounds" = 15, "eta" = 0.2)
)

# Fit the model, keeping the predictions, and show the summary
dml_irm_boost$fit(store_predictions = TRUE)
dml_irm_boost$summary()

In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

#### IRM - Summary of coefficient estimates



In [None]:
# TODO: Add summary of results

In [None]:
# TODO: Add visualization of summary

#### IRM - Summary of the mean values of the prediction assessment metrics

In [None]:
# TODO: Add comparison/summary for predictive accuracy (nuisance)

In [None]:
# TODO: Add visualization of summary

### Inspection of the benchmark model


In [None]:
# TODO: Function for propensity score visualization

# Placeholder: takes no arguments and returns NULL until it is implemented
rep_propscore_plot = function() {
  NULL
}

## 2. Partially Linear Regression Model (PLR)


In [None]:
# TODO: Function for calculation of prediction accuracy PLR

# Placeholder: takes no arguments and returns NULL until it is implemented
pred_acc_plr = function() {
  NULL
}

### Benchmark: Model based on Linear and Logistic Regression

In [None]:
# Benchmark learners: linear regression (ml_g) and logistic regression (ml_m)
linreg = lrn("regr.lm")
logreg_class = lrn("classif.log_reg")

# Fix the seed before the model object is constructed
set.seed(1234)

# NOTE: `trimming_threshold` is not an argument of DoubleMLPLR$new() (it is
# specific to models such as DoubleMLIRM); passing it stops the call with an
# "unused argument" error. It is therefore omitted here, as in the other PLR
# models in this notebook.
dml_plr_regression = DoubleMLPLR$new(data_dml,
                                    ml_g = linreg,
                                    ml_m = logreg_class,
                                    n_folds = 3,
                                    n_rep = 3)

# Fit the model, keeping the predictions, and show the summary
dml_plr_regression$fit(store_predictions = TRUE)
dml_plr_regression$summary()

In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

### Double Machine Learning based on Lasso

In [None]:
# Cross-validated glmnet learners (5 inner folds, s = "lambda.min"):
# regression for ml_g, classification for ml_m
lasso = lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min")
lasso_class = lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min")

# Fix the seed before the model object is constructed
set.seed(1234)
dml_plr_lasso = DoubleMLPLR$new(
  data = data_dml,
  ml_g = lasso,
  ml_m = lasso_class,
  n_folds = 3,
  n_rep = 3
)

# Fit the model, keeping the predictions, and show the summary
dml_plr_lasso$fit(store_predictions = TRUE)

dml_plr_lasso$summary()


In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

### Double Machine Learning based on Random Forests

In [None]:
# ranger learners: regression for ml_g, classification for ml_m
randomForest = lrn("regr.ranger")
randomForest_class = lrn("classif.ranger")

# Fix the seed before the model object is constructed
set.seed(1234)
dml_plr_forest = DoubleMLPLR$new(
  data = data_dml,
  ml_g = randomForest,
  ml_m = randomForest_class,
  n_folds = 3,
  n_rep = 3
)

# Fit the model, keeping the predictions, and show the summary
dml_plr_forest$fit(store_predictions = TRUE)
dml_plr_forest$summary()

In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

### Double Machine Learning based on Extreme Gradient Boosting

In [None]:
# xgboost regression learner for ml_g (squared-error objective)
boost = lrn(
  "regr.xgboost",
  objective = "reg:squarederror",
  eta = 0.1,
  nrounds = 57,
  max_depth = 2
)

# xgboost classification learner for ml_m (binary logistic objective,
# evaluated with logloss)
boost_class = lrn(
  "classif.xgboost",
  objective = "binary:logistic",
  eval_metric = "logloss",
  eta = 0.1,
  nrounds = 10,
  max_depth = 3
)

# Fix the seed before the model object is constructed
set.seed(1234)
dml_plr_boost = DoubleMLPLR$new(
  data = data_dml,
  ml_g = boost,
  ml_m = boost_class,
  n_folds = 3,
  n_rep = 3
)

# Fit the model, keeping the predictions, and show the summary
dml_plr_boost$fit(store_predictions = TRUE)
dml_plr_boost$summary()

In [None]:
# TODO: Calculate Bias
# TODO: Summary for predictive accuracy
# TODO: Print RMSE mean and sd for ml_g and log_loss for ml_m

#### PLR - Summary of coefficient estimates

In [None]:
# TODO: Add summary of results


In [None]:
# TODO: Add visualization of summary


#### PLR - Summary of the mean values of the prediction assessment metrics


In [None]:
# TODO: Add comparison/summary for predictive accuracy (nuisance)


In [None]:
# TODO: Add visualization of summary
