SeaVan1 :
X is drawn uniformly from {0,1,2}
Y|X = x ~ N(x, 1)
R|X = expit(4-4x) where expit(x) = 1.(1+e^-x)

In [9]:
import numpy as np
import pandas as pd
np.random.seed(2021)
sv1 = pd.DataFrame({"X" : np.random.randint(0,3,size=50000)})
sv1["Y"] = sv1["X"].apply(lambda x : np.random.normal(x))
sv1["R"] = sv1["X"].apply(lambda x : 0 if 1/(1+np.exp(4*x-4)) < 0.5 else 1)

In [10]:
sv1.tail(5)

Unnamed: 0,X,Y,R
49995,2,3.670323,0
49996,0,1.520434,1
49997,0,0.43179,1
49998,1,1.246051,1
49999,0,1.029791,1


SeaVan2: X is drawn uniformly from {0,1,2}
Y|X = x ~ N(1[x>=1], 1)
R|X = expit(4-4x) where expit(x) = 1.(1+e^-x)

In [11]:
np.random.seed(2022)
sv2 = pd.DataFrame({"X" : np.random.randint(0,3,size=50000)})
sv2["Y"] = sv2["X"].apply(lambda x : np.random.normal(1 if x>=1 else 0))
sv2["R"] = sv2["X"].apply(lambda x : 0 if 1/(1+np.exp(4*x-4)) < 0.5 else 1)

In [12]:
sv2.tail(5)

Unnamed: 0,X,Y,R
49995,2,0.99153,0
49996,1,2.674972,1
49997,2,1.871297,0
49998,0,-0.256136,1
49999,0,0.549157,1


### Applying Baseline Models

Direct Methods : Linear Regression / Non-linear Regression / Regression Tree
MAR estimates : IPW , SN-IPW, IW, SN-IW, lin-impute, NL impute

In [30]:
import seaborn as sb
from copy import copy
from sklearn.model_selection import *
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

SeaVan1

In [17]:
train, test = train_test_split(sv1, test_size=0.2, random_state=2021)

In [23]:
lr = LinearRegression()
obs_train_x = train.loc[train.R==1][["X"]]
obs_train_y = train.loc[train.R==1]["Y"]

lr.fit(obs_train_x, obs_train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
test_x = test[["X"]]
test_y = test["Y"]
pred_y = lr.predict(test_x)

def calc_rmse(pred, label):
    return np.sum(np.abs(pred-label))**0.5
rmse = calc_rmse(pred_y, test_y)


In [26]:
rmse

89.57631143879968

SeaVan2

Yeast


In [1]:
import pandas as pd

In [6]:
yeast = pd.read_csv("yeast.csv")

In [7]:
yeast.head()

Unnamed: 0,Sequence Name,mcg,gvh,alm,mit,erl,pox,vac,nuc,label
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [43]:
def create_rewards(df, x_col, y_col):
    """
    This function assigns rewards probabiliteis from Logistic Regression
    
    Args:
    df (dataframe) : dataframe with dataset
    k (array-like) : list of classes in the dataset 
    x_col (array-like) : list of columns that corresponds to covariate in the dataset
    y_col (str) : name of the label column in the dataset
    
    Returns:
    df (dataframe) : dataframe with rewards assigned
    """
    lg = LogisticRegression()
    x = df[x_col]
    y = df[y_col]
    lg.fit(x, y)
    r = pd.DataFrame(lg.predict_proba(x), columns=lg.classes_)
    
    df = pd.concat([df, r], axis=1)
    
    return df

In [67]:
modified_yeast = create_rewards(yeast, x_col=["mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc"], y_col="label")



In [68]:
def create_missing_data_sample(df, y_col):
    """
    This function creates missing data sample with probably of 1/K.
    Randomly selects a class k with prob. of 1/K and only log the rewards for that
    
    Args:
    df (dataframe) : dataframe with dataset
    x_col (array-like) : list of columns that corresponds to covariate in the dataset
    y_col (str) : name of the label column in the dataset
    
    Returns:
    df (dataframe) : dataframe with partially missing data sample
    """
    classes = list(df[y_col].unique())
   
    def fill_in_nan(row, classes):
        na = copy(classes)
        na.remove(np.random.choice(classes))
        row.loc[na] = np.NaN
        return row
    df = df.apply(fill_in_nan, axis=1, args=[classes])
    
    return df

In [70]:
missing_data_yeast = create_missing_data_sample(modified_yeast, "label")