In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

In [3]:
df = pd.read_csv(data_path)
df.drop(columns=['user_id','label'], inplace=True)
# log transformation
df[["status_count", "followers_count", "friend_count"]] = df[
        ["status_count", "followers_count", "friend_count"]].applymap(lambda x: x + 1)
df[["status_count", "followers_count", "friend_count"]] = df[
        ["status_count", "followers_count", "friend_count"]].apply(np.log)

In [4]:
col_means = [df[col].mean() for col in df.columns]
def generate_y(df, noise=0.7, seed=32):
    """
    Generate y  
    """
    np.random.seed(seed)
    beta = np.random.uniform(-1, 1, size=df.shape[1]) /1000
    
    df_y = df.copy()
    for i, col in enumerate(df.columns):
        df_y[col] = df_y[col] * beta[i]
        df_y[col] = np.max((df_y[col].mean() - df_y[col]), 0)
    y = np.sum(df_y, axis=1) + np.random.normal(0, noise, size=df.shape[0])
    #  
    y = 1 / (1 + np.exp(-y)) # sigmoid
    y = y.rename('y', inplace=True)
    return y

def test_counterfactuals(df, model, cf_per_obs=20, seed=32):
    """
    Generate counterfactuals for a given model
    """
    np.random.seed(seed)
    df = df.copy()
    if 'y' in df.columns:
        df.drop(columns=['y'], inplace=True)
    names_treatments = ['verified', 'register_time', 'status_count', 'followers_count',
       'friend_count']
    df_treatments = df[names_treatments]
    df.drop(columns=names_treatments, inplace=True)
    names_confounders = df.columns.to_list()
    # repeat each observation cf_per_obs times
    df = df.loc[df.index.repeat(cf_per_obs)].reset_index(drop=True)
    # add random treatment values to dataset in front of each observation

    # samples from marginal columnwise joint distribution:
    df_treatments = df_treatments.sample(n=df.shape[0], replace=True).reset_index(drop=True)
    # concat with confounders
    df = pd.concat([df_treatments, df[names_confounders]], axis=1)
    
    # generate y
    y = generate_y(df, seed=seed)
    y_hat = model.predict(df)
    # calculate rmse
    rmse = mean_squared_error(y, y_hat, squared=False)
    return rmse

def test_factual(df_real, model, seed=32):
    y = generate_y(df_real, seed=seed)
    y_hat = model.predict(df_real)
    # calculate rmse
    rmse = mean_squared_error(y, y_hat, squared=False)
    return rmse

In [5]:
seed = 24

df_train = df.sample(frac=0.8, random_state=seed) 
print(df_train.shape)
df_test = df.drop(df_train.index).reset_index(drop=True)
print(df_test.shape)
df_train.reset_index(drop=True, inplace=True)

y_train = generate_y(df_train, seed=seed)
results = {}

(15960, 69)
(3990, 69)


In [None]:
## EBM
from interpret.glassbox import ExplainableBoostingRegressor
print(seed)
# train
f_types = ["nominal"]
f_types.extend(["continuous"] * 68)
ebm = ExplainableBoostingRegressor(feature_types=f_types, learning_rate=0.01, max_bins= 512, min_samples_leaf= 3, n_jobs=1)
ebm.fit(df_train, y_train)
# test
rmse = test_counterfactuals(df_test, ebm, cf_per_obs=20, seed=seed)
df_real = df_test.copy()
rmse_real = test_factual(df_real, ebm, seed=seed)
print("Counterfactual", rmse, "Real", rmse_real)
results["EBM_counterfactual"] = rmse
results["EBM_real"] = rmse_real

In [None]:
# Linear Regression
from sklearn import linear_model
print(seed)
# train
lr = linear_model.LinearRegression()
lr.fit(df_train, y_train)
# test
rmse = test_counterfactuals(df_test, lr, cf_per_obs=20, seed=seed)
df_real = df_test.copy()
rmse_real = test_factual(df_real, lr, seed=seed)
print("Counterfactual", rmse, "Real", rmse_real)
results["LR_counterfactual"] = rmse
results["LR_real"] = rmse_real

In [None]:
# ExNN 
from exnn import ExNN
import tensorflow as tf
print(seed)
# prepare meta info
f_types = ["nominal"]
f_types.extend(["continuous"] * 68)
meta_info = {f_name: {"type": f_type} for f_name, f_type in zip (df_train.columns, f_types)}
meta_info["y"] = {"type": "target"}

# fit model
exnn = ExNN(meta_info=meta_info,
               subnet_num=10,
               subnet_arch=[10, 6],
               task_type="Regression",
               activation_func=tf.tanh,
               batch_size=min(1000, int(df_train.to_numpy().shape[0] * 0.2)),
               training_epochs=10000,
               lr_bp=0.001,
               lr_cl=0.1,
               beta_threshold=0.05,
               tuning_epochs=100,
               l1_proj=0.0001,
               l1_subnet=0.00316,
               l2_smooth=10**(-6),
               verbose=True,
               val_ratio=0.2,
               early_stop_thres=500)

exnn.fit(df_train.to_numpy(), y_train.to_numpy())
# test
rmse = test_counterfactuals(df_test, exnn, cf_per_obs=20, seed=seed)
df_real = df_test.copy()
rmse_real = test_factual(df_real, exnn, seed=seed)
print("Counterfactual", rmse, "Real", rmse_real)    
results["ExNN_counterfactual"] = rmse
results["ExNN_real"] = rmse_real

In [None]:
# NAM
from nam.wrapper import NAMRegressor
print(seed)
random_state = 0
nam = NAMRegressor(
            num_epochs=10,
            num_learners=1,
            metric='mse',
            early_stop_mode='min',
            monitor_loss=False,
            n_jobs=1,
            random_state=random_state
        )
nam.fit(df_train, y_train)
# test
rmse = test_counterfactuals(df_test, nam, cf_per_obs=20, seed=seed)
df_real = df_test.copy()
rmse_real = test_factual(df_real, nam, seed=seed)
print("Counterfactual", rmse, "Real", rmse_real)
results["NAM_counterfactual"] = rmse
results["NAM_real"] = rmse_real

In [15]:
results

{'EBM_counterfactual': 0.11233528790841289,
 'EBM_real': 0.11117099244009716,
 'LR_counterfactual': 0.11252664487127256,
 'LR_real': 0.11156787661982835,
 'ExNN_counterfactual': 0.11235420006229978,
 'ExNN_real': 0.11125867399460311,
 'NAM_counterfactual': 0.12403880478786627,
 'NAM_real': 0.1198183649542339}