In [1]:
import numpy as np
import pandas as pd
import sys
import copy
import abc

from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from pandas.api.types import is_integer_dtype
from numpy.random import default_rng
from scipy.special import expit
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib widget

In [None]:
bts = pd.read_csv('/Users/tejomay/Dropbox/nyu/3001_mlt/open_bandit_dataset/bts/all/all.csv')

SeaVan1 :
X is drawn uniformly from {0,1,2}
Y|X = x ~ N(x, 1)
R|X = expit(4-4x) where expit(x) = 1.(1+e^-x)

In [3]:
np.random.seed(2021)
sv1 = pd.DataFrame({"X" : np.random.randint(0,3,size=50000)})
sv1["Y"] = sv1["X"].apply(lambda x : np.random.normal(x))
sv1["R"] = sv1["X"].apply(lambda x : 0 if 1/(1+np.exp(4*x-4)) < 0.5 else 1)
sv1.tail(5)

Unnamed: 0,X,Y,R
49995,2,3.670323,0
49996,0,1.520434,1
49997,0,0.43179,1
49998,1,1.246051,1
49999,0,1.029791,1


SeaVan2: X is drawn uniformly from {0,1,2}
Y|X = x ~ N(1[x>=1], 1)
R|X = expit(4-4x) where expit(x) = 1.(1+e^-x)

In [4]:
np.random.seed(2022)
sv2 = pd.DataFrame({"X" : np.random.randint(0,3,size=50000)})
sv2["Y"] = sv2["X"].apply(lambda x : np.random.normal(1 if x>=1 else 0))
sv2["R"] = sv2["X"].apply(lambda x : 0 if 1/(1+np.exp(4*x-4)) < 0.5 else 1)
sv2.tail(5)

Unnamed: 0,X,Y,R
49995,2,0.99153,0
49996,1,2.674972,1
49997,2,1.871297,0
49998,0,-0.256136,1
49999,0,0.549157,1


### Applying Baseline Models

Direct Methods : Linear Regression / Non-linear Regression / Regression Tree
MAR estimates : IPW , SN-IPW, IW, SN-IW, lin-impute, NL impute

SeaVan1

In [5]:
train, test = train_test_split(sv1, test_size=0.2, random_state=2021)

In [6]:
lr = LinearRegression()
obs_train_x = train.loc[train.R==1][["X"]]
obs_train_y = train.loc[train.R==1]["Y"]

lr.fit(obs_train_x, obs_train_y)

LinearRegression()

In [7]:
test_x = test[["X"]]
test_y = test["Y"]
pred_y = lr.predict(test_x)

def calc_rmse(pred, label):
    return np.sum(np.abs(pred-label))**0.5
rmse = calc_rmse(pred_y, test_y)

rmse

89.57631143879969

SeaVan2

### Load UCI datasets

In [8]:
yeast = pd.read_csv("yeast.csv")
yeast_data = yeast.drop("Sequence Name", axis=1)

bean = pd.read_csv("dry_bean.csv")
bean_data = bean.rename(columns={"Class": "label"})

letter_data = pd.read_csv('letter-recognition.data', names = ['label']+[f'x{i}' for i in range(16)])

In [9]:
def create_rewards(df, y_col, drop_context=False):
    
    classes = df[y_col].unique()
    contexts = df.drop(y_col, axis=1).columns
    df = pd.concat([df, pd.DataFrame(columns=classes)], axis=1).fillna(0)
    
    def fill_in_rewards(row, classes):
        label = row[y_col]
        row.loc[label] = 1
        return row
    
    df = df.apply(fill_in_rewards, axis=1, args=[classes])
    if drop_context:
        df = df.drop(contexts, axis=1)
    
    return df

### Setting up Target Policy

In [10]:
class Policy:
    def __init__(self, num_actions=2):
        self.num_actions = num_actions

    @abc.abstractmethod
    def get_action_distribution(self, X):
        raise NotImplementedError("Must override method")

    def get_action_propensities(self, X, actions):
        distrib = self.get_action_distribution(X)
        distrib["action"] = actions
        return distrib.apply(lambda x : x[x["action"]], axis=1)
        
    def select_actions(self, X, rng=default_rng(1)):
        df = self.get_action_distribution(X)
        action_list = df.columns
        
        df["actions"] = df.apply(lambda x : np.random.choice(action_list, p=x), axis=1)
        actions = df["actions"]
        propensities = self.get_action_propensities(X, actions)
        
        return actions, propensities
        
    def get_value_estimate(self, X, full_rewards):
        actions, propensities = self.select_actions(X)
        df = pd.DataFrame(full_rewards.reset_index(drop=True)).assign(act = actions)
        action_rewards = df.apply(lambda x : x[x["act"]], axis=1)
        
        return action_rewards.mean()

In [67]:
class SKLearnPolicy(Policy):
    """ 
    An SKLearnPolicy uses a scikit learn model to generate an action distribution.  If the SKLearnPolicy is built with is_deterministic=False, 
    then the predict distribution for a context x should be whatever predict_proba for the model returns.  If is_deterministic=True, then all the probability mass 
    should be concentrated on whatever predict of the model returns.
    """
    def __init__(self, model, num_actions=2, is_deterministic=False):
        self.is_deterministic = is_deterministic
        self.num_actions = num_actions
        self.model = model

    def get_action_distribution(self, X):
        prob = pd.DataFrame(self.model.predict_proba(X), columns=self.model.classes_)
        action = self.model.predict(X)
        
        def deterministic(row):
            pred = row["act"]
            row[pred] = 1
            return row
        
        if (self.is_deterministic):
            df = pd.DataFrame(np.zeros(prob.shape), columns=self.model.classes_).assign(act=action)
            df = df.apply(deterministic, axis=1).drop("act", axis=1)
        else:
            df = copy.deepcopy(prob)

        return df

    def select_actions(self, X, rng=default_rng(1)):
        if (self.is_deterministic):
            actions = pd.DataFrame(self.model.predict(X))
            propensities = pd.Series([1 for i in range(len(actions))])
        else:
            df = self.get_action_distribution(X)
            action_list = df.columns
            df["actions"] = df.apply(lambda x : np.random.choice(action_list, p=x), axis=1)
            actions = df["actions"]
            propensities = self.get_action_propensities(X, actions)
            
        return actions, propensities

class BanditLoggingPolicy(Policy):
    """
    This policy derives from another deterministic policy following the recipe described in the Vlassis et al paper, on the top of the second column in section 5.3.
    For any context x, if the deterministic policy selects action a, then this policy selects action a with probability eps (a supplied parameter), and spreads the
    rest of the probability mass uniformly over the other actions.
    """
    def __init__(self, num_actions=2, eps=0.3, actions=None, classes=None):
        self.num_actions = num_actions
        self.eps = eps
        self.actions = actions.reset_index(drop=True)
        self.classes = classes
        
    def get_action_distribution(self, X):
        
        def bandit_sampling(row, classes):
            pred = row["act"]
            s_a = np.random.uniform(0.1, 1, len(self.classes))
            row[row.index!="act"] +=  s_a/s_a.sum()*(1-self.eps) # take care of actions
            row[pred] += self.eps
            return row
        
        df = pd.DataFrame(np.zeros([X.shape[0], self.num_actions]), columns=self.classes).assign(act=self.actions)
        df = df.apply(bandit_sampling, axis=1, args=[self.classes]).drop("act", axis=1)
        return df

In [12]:
def generate_bandit_feedback(contexts, full_rewards, policy, rng=default_rng(1)):
    """   
    Args:
        contexts (np.array): contexts, rows correspond to entries of rewards
        full_rewards (np.array): 2-dim numpy array with the same number of rows as X and number of columns corresponding to the number actions
            each row gives the reward that would be received for each action for the context in the corresponding row of X. 

    Returns:
        new_contexts (np.array): new_n rows and same number of columns as in contexts
        actions (np.array): vector with new_n entries giving actions selected by the provided policy for the contexts in new_contexts
        observed_rewards (np.array): vector with new_n entries giving actions selected by the provided policy for the contexts in new_contexts 
    """   
    
    n, k = full_rewards.shape
    new_contexts = contexts
    actions, propensities = policy.select_actions(X=new_contexts, rng=rng)
    obs_rewards = full_rewards.reset_index(drop=True).assign(act=actions)
    obs_rewards["obs_r"] = obs_rewards.apply(lambda x : x[x["act"]], axis=1)
    observed_rewards = obs_rewards["obs_r"]
    return new_contexts, actions, observed_rewards, propensities

### Creating value estimators

In [13]:
def get_value_estimators(policy, contexts, actions, rewards, propensities, skip_slow_stuff=False):
    """   
    Args:
        policy (Policy): the policy we want to get a value estimate for
        contexts (np.array): contexts from bandit feedback
        actions (np.array): actions chosen for bandit feedback
        rewards (np.array): rewards received in bandit feedback
        propensities (np.array): the propensity for each action selected under the logging policy (which is not provided to this function)
        skip_slow_stuff (boolean): boolean flag which allows you to turn on/off some slow estimators (ignore this if you like)
    Returns:
        est (dict): keys are string describing the value estimator, values are the corresponding value estimates 
    """   

    est = {}
    est["mean"] = np.mean(rewards)
    new_propensities = policy.get_action_propensities(contexts, actions)
    imp_wgt = new_propensities / propensities
    
    est["iw"] = np.mean(rewards*imp_wgt)
    est["sn-iw"] = np.sum(rewards*imp_wgt) / np.sum(imp_wgt)

    merged = pd.DataFrame(contexts.reset_index(drop=True)).assign(
        act=actions.reset_index(drop=True)).assign(r=rewards.reset_index(drop=True)).assign(wgt=imp_wgt)
    
    rewards_linreg, rewards_linreg_iw = pd.DataFrame(), pd.DataFrame()
    rewards_rf, rewards_rf_iw = pd.DataFrame(), pd.DataFrame()
    
    for act in sorted(list(set(actions))):
        df = merged.loc[merged["act"]==act]
        X, R, wgt = df.drop(columns=["act","r", "wgt"]), df["r"], df["wgt"]
        
        # Direct method with linear ridge regression
        rewards_linreg[act] = RidgeCV([1e-3, 1e-2, 1e-1]).fit(X,R).predict(contexts)
        rewards_linreg_iw[act] = RidgeCV([1e-3, 1e-2, 1e-1]).fit(X,R, sample_weight=wgt).predict(contexts)
        
        # Direct method with a non-linear reward predictor
        rf = RandomForestRegressor()
        params = {'n_estimators': [50, 100], 
                  'max_depth': [5, 10, 20], 
                  'min_samples_split': [2, 5, 10]}

        rewards_rf[act] = GridSearchCV(rf, params, cv=3).fit(X,R).predict(contexts)
        rewards_rf_iw[act] = GridSearchCV(rf, params, cv=3).fit(X,R, sample_weight=wgt).predict(contexts)
        
    act_dist = policy.get_action_distribution(contexts)
    
    est["dr-lin"] = (rewards_linreg * act_dist).sum().sum() / act_dist.shape[0]
    est["dr-iw-lin"] = (rewards_linreg_iw * act_dist).sum().sum() / act_dist.shape[0]
    est["dr-rf"] = (rewards_rf * act_dist).sum().sum() / act_dist.shape[0]
    est["dr-iw-rf"] = (rewards_rf_iw * act_dist).sum().sum() / act_dist.shape[0]
    
    return est

In [14]:
def get_estimator_stats(estimates, true_parameter_value=None):
    est_stat = []
    for est in estimates.columns:
        pred_means = estimates[est]
        stat = {}
        stat['stat'] = est
        stat['mean'] = np.mean(pred_means)
        stat['SD'] = np.std(pred_means)
        stat['SE'] = np.std(pred_means) / np.sqrt(len(pred_means))
        if true_parameter_value:
            stat['bias'] = stat['mean'] - true_parameter_value
            stat['RMSE'] = np.sqrt(np.mean((pred_means - true_parameter_value) ** 2))
        est_stat.append(stat)

    return pd.DataFrame(est_stat)

#### picked stochastic policy here (deterministic policy has risk of getting 0 weights in the propensities)

In [66]:
def value_est_output(data, model, trials=5, rng=default_rng(7), eps=0.5):
    
    n = data.shape[0]
    train_frac = 0.7
    train_size = round(train_frac * n)
    train_idx = rng.choice(n, size = train_size, replace = False)
    test_idx = np.setdiff1d(np.arange(n), train_idx, assume_unique=True)

    data_context, data_label = data.drop("label", axis=1), data["label"]

    X_train, y_train = data_context.iloc[train_idx], data_label.iloc[train_idx]
    X_test, y_test = data_context.iloc[test_idx], data_label.iloc[test_idx]

    full_rewards = create_rewards(data, "label", True)
    full_rewards_test = full_rewards.iloc[test_idx].drop("label", axis=1)

    model.fit(X_train, y_train)
    policy_stochastic = SKLearnPolicy(model=model, num_actions=len(data.label.unique()), is_deterministic=False)
    policy_true_value = policy_stochastic.get_value_estimate(X_test, full_rewards_test)
    
    classes, k = model.classes_, len(model.classes_)
    
    logging_policy = BanditLoggingPolicy(num_actions=k, eps=eps, actions=y_test, classes=classes)
    logging_policy_value = logging_policy.get_value_estimate(X=X_test, full_rewards=full_rewards_test)   
    print(f"Logging policy value est: {logging_policy_value:.6f}")
    print(f"Target policy true value: {policy_true_value:.6f}")
    
    val_ests = []    
    for i in range(trials):
        contexts, actions, rewards, propensities = generate_bandit_feedback(X_test, full_rewards_test, logging_policy, rng=rng)
        est = get_value_estimators(policy_stochastic, contexts, actions, rewards, propensities)
        val_ests.append(est)
    df = pd.DataFrame(val_ests)
    
    return get_estimator_stats(df, true_parameter_value=policy_true_value) 

### Yeast data results

- \# of classes = 10
- Sample size = 1,484


In [16]:
from itertools import product
lr = LogisticRegression(multi_class='multinomial')
gb = GradientBoostingClassifier()

params = {'model_list': [lr, gb], 
          'eps': [0.1, 0.5, 0.9]}
keys = params.keys()
values = (params[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in product(*values)]

for c in combinations:
    print(f"Model: {c['model_list']} | eps: {c['eps']}") 
    display(value_est_output(yeast_data, model=c['model_list'], eps=c['eps']))

Model: LogisticRegression(multi_class='multinomial') | eps: 0.1
Logging policy value est: 0.182022
Target policy true value: 0.314607


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.183371,0.019621,0.008775,-0.131236,0.132695
1,iw,0.359811,0.052608,0.023527,0.045205,0.069362
2,sn-iw,0.281804,0.048928,0.021881,-0.032803,0.058907
3,dr-lin,0.434664,0.02551,0.011408,0.120057,0.122737
4,dr-iw-lin,0.308597,0.036915,0.016509,-0.00601,0.037401
5,dr-rf,0.422124,0.029878,0.013362,0.107517,0.111591
6,dr-iw-rf,0.4112,0.034285,0.015333,0.096593,0.102497


Model: LogisticRegression(multi_class='multinomial') | eps: 0.5
Logging policy value est: 0.550562
Target policy true value: 0.361798


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.55191,0.024723,0.011057,0.190112,0.191713
1,iw,0.355856,0.019397,0.008675,-0.005942,0.020287
2,sn-iw,0.315899,0.044554,0.019925,-0.045899,0.063967
3,dr-lin,0.734176,0.01951,0.008725,0.372379,0.372889
4,dr-iw-lin,0.432517,0.040483,0.018105,0.070719,0.081487
5,dr-rf,0.715733,0.01927,0.008618,0.353935,0.354459
6,dr-iw-rf,0.716485,0.018989,0.008492,0.354687,0.355195


Model: LogisticRegression(multi_class='multinomial') | eps: 0.9
Logging policy value est: 0.914607
Target policy true value: 0.359551


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.908764,0.005791,0.00259,0.549213,0.549244
1,iw,0.346951,0.004314,0.001929,-0.012599,0.013317
2,sn-iw,0.2855,0.052518,0.023487,-0.074051,0.090783
3,dr-lin,0.906149,0.008933,0.003995,0.546598,0.546671
4,dr-iw-lin,0.608221,0.106562,0.047656,0.24867,0.270541
5,dr-rf,0.887338,0.015948,0.007132,0.527787,0.528028
6,dr-iw-rf,0.882178,0.009552,0.004272,0.522628,0.522715


Model: GradientBoostingClassifier() | eps: 0.1
Logging policy value est: 0.161798
Target policy true value: 0.510112


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.186067,0.012176,0.005445,-0.324045,0.324274
1,iw,0.532118,0.037454,0.01675,0.022006,0.043441
2,sn-iw,0.469913,0.065753,0.029406,-0.040199,0.077068
3,dr-lin,0.536807,0.018326,0.008196,0.026694,0.03238
4,dr-iw-lin,0.530347,0.06462,0.028899,0.020235,0.067714
5,dr-rf,0.574177,0.032069,0.014342,0.064065,0.071643
6,dr-iw-rf,0.559924,0.040722,0.018211,0.049812,0.064339


Model: GradientBoostingClassifier() | eps: 0.5
Logging policy value est: 0.523596
Target policy true value: 0.512360


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.538876,0.024674,0.011035,0.026517,0.036221
1,iw,0.494404,0.024298,0.010867,-0.017956,0.030213
2,sn-iw,0.422873,0.077384,0.034607,-0.089487,0.118305
3,dr-lin,0.79287,0.020189,0.009029,0.28051,0.281236
4,dr-iw-lin,0.549436,0.075101,0.033586,0.037077,0.083755
5,dr-rf,0.802705,0.027678,0.012378,0.290345,0.291662
6,dr-iw-rf,0.799855,0.022797,0.010195,0.287496,0.288398


Model: GradientBoostingClassifier() | eps: 0.9
Logging policy value est: 0.903371
Target policy true value: 0.474157


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.90382,0.008921,0.00399,0.429663,0.429756
1,iw,0.497225,0.005629,0.002518,0.023068,0.023745
2,sn-iw,0.471239,0.20701,0.092578,-0.002919,0.207031
3,dr-lin,0.952097,0.007206,0.003223,0.477939,0.477994
4,dr-iw-lin,0.736233,0.097041,0.043398,0.262076,0.279465
5,dr-rf,0.954539,0.006846,0.003062,0.480381,0.48043
6,dr-iw-rf,0.95123,0.010405,0.004653,0.477073,0.477186


### Bean data results

- \# of classes = 17
- Sample size = 13,611

In [17]:
for c in combinations:
    print(f"Model: {c['model_list']} | eps: {c['eps']}") 
    display(value_est_output(bean_data, model=c['model_list'], eps=c['eps']))

Model: LogisticRegression(multi_class='multinomial') | eps: 0.1
Logging policy value est: 0.225569
Target policy true value: 0.580700


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.229929,0.004175,0.001867,-0.350771,0.350796
1,iw,0.640121,0.006887,0.00308,0.059421,0.059819
2,sn-iw,0.543671,0.010281,0.004598,-0.037029,0.03843
3,dr-lin,0.549412,0.022239,0.009946,-0.031288,0.038387
4,dr-iw-lin,0.583055,0.003671,0.001642,0.002354,0.004361
5,dr-rf,0.60811,0.005671,0.002536,0.02741,0.02799
6,dr-iw-rf,0.604327,0.004803,0.002148,0.023626,0.024109


Model: LogisticRegression(multi_class='multinomial') | eps: 0.5
Logging policy value est: 0.565516
Target policy true value: 0.578251


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.567965,0.003404,0.001522,-0.010287,0.010835
1,iw,0.585414,0.006646,0.002972,0.007163,0.009772
2,sn-iw,0.496102,0.012039,0.005384,-0.082149,0.083027
3,dr-lin,0.6572,0.018019,0.008058,0.078949,0.080979
4,dr-iw-lin,0.57085,0.006695,0.002994,-0.007401,0.00998
5,dr-rf,0.673148,0.003517,0.001573,0.094897,0.094962
6,dr-iw-rf,0.653492,0.007392,0.003306,0.075241,0.075603


Model: LogisticRegression(multi_class='multinomial') | eps: 0.9
Logging policy value est: 0.911585
Target policy true value: 0.573108


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.912074,0.004757,0.002127,0.338966,0.339
1,iw,0.583228,0.003715,0.001661,0.01012,0.01078
2,sn-iw,0.499489,0.023346,0.010441,-0.073619,0.077232
3,dr-lin,0.811447,0.00686,0.003068,0.238339,0.238438
4,dr-iw-lin,0.593254,0.010619,0.004749,0.020146,0.022773
5,dr-rf,0.787598,0.002558,0.001144,0.21449,0.214505
6,dr-iw-rf,0.774978,0.009946,0.004448,0.20187,0.202115


Model: GradientBoostingClassifier() | eps: 0.1
Logging policy value est: 0.213813
Target policy true value: 0.903257


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.227676,0.001837,0.000821,-0.675582,0.675584
1,iw,0.952355,0.014716,0.006581,0.049098,0.051256
2,sn-iw,0.869228,0.010427,0.004663,-0.03403,0.035591
3,dr-lin,0.729797,0.022937,0.010258,-0.17346,0.17497
4,dr-iw-lin,0.882316,0.009345,0.004179,-0.020942,0.022932
5,dr-rf,0.884999,0.004194,0.001875,-0.018259,0.018734
6,dr-iw-rf,0.89102,0.004145,0.001854,-0.012238,0.012921


Model: GradientBoostingClassifier() | eps: 0.5
Logging policy value est: 0.583150
Target policy true value: 0.900808


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.575508,0.002533,0.001133,-0.3253,0.32531
1,iw,0.9134,0.006495,0.002905,0.012592,0.014168
2,sn-iw,0.873238,0.016446,0.007355,-0.02757,0.032102
3,dr-lin,0.899159,0.013049,0.005836,-0.001649,0.013153
4,dr-iw-lin,0.895954,0.010352,0.004629,-0.004854,0.011433
5,dr-rf,0.954307,0.001681,0.000752,0.053499,0.053525
6,dr-iw-rf,0.951007,0.002921,0.001307,0.050199,0.050284


Model: GradientBoostingClassifier() | eps: 0.9
Logging policy value est: 0.924810
Target policy true value: 0.893461


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.913495,0.003654,0.001634,0.020034,0.020365
1,iw,0.894388,0.002881,0.001289,0.000927,0.003027
2,sn-iw,0.868659,0.053392,0.023878,-0.024802,0.058872
3,dr-lin,0.972187,0.004183,0.001871,0.078726,0.078838
4,dr-iw-lin,0.922304,0.023487,0.010504,0.028843,0.037196
5,dr-rf,0.983329,0.001597,0.000714,0.089868,0.089882
6,dr-iw-rf,0.97959,0.003517,0.001573,0.086129,0.086201


Direct methods performs well under more random logging policy (small epsilon; less bias towards true label) 

### ZOZO data results

- \# of classes = 80
- Sample size = 13,611

In [69]:
sample = bts.sample(100000).reset_index(drop = True)
sample = sample.drop(columns = ['Unnamed: 0', 'timestamp', 'position', 'propensity_score', 'click'])
sample = sample.rename(columns = {'item_id': 'label'})
label = sample['label'].astype(str)

sample = pd.get_dummies(sample.drop(columns = ['label']))
sample['label'] = label

# sample = pd.get_dummies(sample)

In [70]:
for c in combinations:
    print(f"Model: {c['model_list']} | eps: {c['eps']}") 
    display(value_est_output(sample, model=c['model_list'], eps=c['eps']))

Model: LogisticRegression(multi_class='multinomial') | eps: 0.1
80
Logging policy value est: 0.106333
Target policy true value: 0.052333


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.110533,0.003588,0.001604,0.0582,0.05831
1,iw,0.0439,0.00283,0.001266,-0.008434,0.008896
2,sn-iw,0.031431,0.002977,0.001331,-0.020902,0.021113
3,dr-lin,0.279656,0.01714,0.007665,0.227323,0.227968
4,dr-iw-lin,0.088761,0.015622,0.006986,0.036427,0.039636
5,dr-rf,0.279862,0.014384,0.006433,0.227529,0.227983
6,dr-iw-rf,0.13747,0.012097,0.00541,0.085137,0.085992


Model: LogisticRegression(multi_class='multinomial') | eps: 0.5
80
Logging policy value est: 0.506000
Target policy true value: 0.048000


Unnamed: 0,stat,mean,SD,SE,bias,RMSE
0,mean,0.506933,0.0096,0.004293,0.458933,0.459034
1,iw,0.044973,0.000768,0.000344,-0.003027,0.003123
2,sn-iw,0.032541,0.000979,0.000438,-0.015459,0.01549
3,dr-lin,0.676724,0.009064,0.004054,0.628724,0.628789
4,dr-iw-lin,0.183348,0.015796,0.007064,0.135348,0.136267
5,dr-rf,0.675341,0.006764,0.003025,0.627341,0.627378
6,dr-iw-rf,0.382449,0.009266,0.004144,0.334449,0.334577


Model: LogisticRegression(multi_class='multinomial') | eps: 0.9
80
Logging policy value est: 0.899000
Target policy true value: 0.045667


ValueError: Cannot have number of splits n_splits=3 greater than the number of samples: n_samples=1.

Direct methods performs well under more random logging policy (small epsilon; less bias towards true label) 