In [76]:
import sensemakr as smkr
import statsmodels.formula.api as smf
import pandas as pd
import pickle
import numpy as np
from functools import reduce
from sklearn import linear_model
from sklearn.decomposition import PCA

In [2]:
# Tag for the model variant under study. Presumably the recommender whose
# embeddings are analysed (cf. `formula_bprmf` further down) — TODO confirm;
# no cell visible in this notebook reads it.
model_type: str = "bprmf_t"

In [15]:
# Load the regression table and prepare it for modelling.
path = "../../Data/regression/"
df = pd.read_csv(path + 'regression_data.csv')

# log(1 + x) transform of the count features.
# np.log1p is vectorised over the whole sub-frame; it replaces the former
# element-wise `applymap(lambda x: x + 1)` (DataFrame.applymap is deprecated
# since pandas 2.1) followed by `apply(np.log)`.
count_cols = ["status_count", "followers_count", "friend_count"]
df[count_cols] = np.log1p(df[count_cols])

# The user identifier is dropped before the columns are renamed below.
# Re-binding instead of `inplace=True` keeps the step explicit.
df = df.drop(columns="user_id")

# Keep the first six column names and rename everything after them to
# X0 .. X63. The assignment to `df.columns` raises if the frame does not
# have exactly 6 + 64 columns, which doubles as a schema check.
temp = df.columns[:6].tolist()
temp.extend(['X%s' % i for i in range(0, 64)])
df.columns = temp

In [None]:
# PCA: compress the X* columns into `num_components` principal components.
# NOTE(review): the PCA is fitted on every row of `df`, i.e. before the
# train/test split made later in the notebook. If the component scores must
# not carry test-set information, fit on the training rows only — confirm.
num_components = 1
confounder = df.loc[:, df.columns.str.startswith('X')]
print(confounder.shape)
# Perform PCA. `random_state` pins the result for the cases where
# scikit-learn's default `svd_solver='auto'` selects the randomized solver.
pca = PCA(n_components=num_components, random_state=200)
principal_components = pca.fit_transform(confounder)
# One score column per component (PC1, PC2, ...). Deriving the names from
# `num_components` keeps the frame constructor valid if the count is changed;
# with the current value of 1 the single column is still called 'PC1'.
pc_names = ['PC%d' % (i + 1) for i in range(num_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names, index=confounder.index)
print(principal_df.shape)
# Display the explained variance ratio
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

In [None]:
# Swap the raw X* columns for the principal-component score column(s):
# drop every column selected into `confounder`, then append `principal_df`
# side by side (both frames share the same row index).
df_confounder = pd.concat(
    [df.drop(columns=confounder.columns), principal_df],
    axis=1,
)
df_confounder

In [None]:
# train-test split: 80 % of the rows of every `label` group go to `train`,
# the remaining rows go to `test`.
SPLIT_SEED = 200

# GroupBy.sample (pandas >= 1.1) draws within each group and returns the rows
# with all columns and their original index. It replaces
# `groupby('label').apply(lambda x: x.sample(...))`, which operates on the
# grouping column — deprecated in pandas 2.2 and removed in 3.0, where the
# `label` column would be missing from `train`.
train = df_confounder.groupby('label').sample(frac=0.8, random_state=SPLIT_SEED)
# Shuffle row order; seeded so that the exported CSVs are identical across runs.
train = train.sample(frac=1, random_state=SPLIT_SEED)
print(len(train))
# Everything not drawn into `train` (matched by index label) is the test set.
test = df_confounder.drop(train.index).sample(frac=1, random_state=SPLIT_SEED)
print(len(test))

# The two splits must partition the data; this fails if the index is not unique.
assert len(train) + len(test) == len(df_confounder), "train/test do not partition df_confounder"

In [142]:
# Write both splits to the current working directory, without the index column.
for split_frame, csv_name in ((train, 'train.csv'), (test, 'test.csv')):
    split_frame.to_csv(csv_name, index=False)

In [143]:
def evaluate_performance(pred, test):
    """Print and return the root-mean-squared error of the predictions.

    Args:
        pred: predicted values, aligned with ``test['label']``.
        test: mapping/DataFrame whose ``'label'`` entry holds the true values.

    Returns:
        sqrt(mean((pred - test['label']) ** 2)).
    """
    residuals = pred - test['label']
    # Square before averaging: a plain mean of the signed residuals lets
    # positive and negative errors cancel and reports bias, not error size.
    RMSE = np.sqrt((residuals ** 2).mean())
    print("RMSE: ", RMSE)
    return RMSE

def run_regression(formula, train, test):
    """Fit an OLS model on `train`, score it on `test` and print the fit report.

    Args:
        formula: model formula string handed to ``smf.ols``.
        train: data the model is fitted on.
        test: data passed to ``predict`` and to ``evaluate_performance``.

    Returns:
        A 4-tuple ``(fitted, score, coefficients, intervals)``: the fitted
        results object, the value returned by ``evaluate_performance``, the
        model parameters formatted as strings with five decimals, and the
        table returned by ``conf_int(alpha=0.05)``.
    """
    fitted = smf.ols(formula=formula, data=train).fit()

    # Score the held-out predictions (evaluate_performance prints its result).
    score = evaluate_performance(fitted.predict(test), test)

    coefficients = ['%.5f' % value for value in fitted.params]
    print(fitted.summary())

    intervals = fitted.conf_int(alpha=0.05, cols=None)
    print(intervals)

    return fitted, score, coefficients, intervals

In [144]:
# Build the regression formula: `label` on the left-hand side, every other
# column of `df_confounder` as an additive term on the right-hand side.
df_without_label = df_confounder.drop('label', axis=1)
predictor_terms = " + ".join(list(df_without_label.columns))
formula_bprmf = 'label ~ ' + predictor_terms
formula_bprmf

'label ~ verified + register_time + status_count + followers_count + friend_count + PC1'

In [None]:
# Fit the OLS specification on the training split and print the regression table.
reg_model = smf.ols(data=train, formula=formula_bprmf)
df_model = reg_model.fit()
print(df_model.summary())

# partial r2 of PC1 in the fitted model (shown as the cell's output)
smkr.partial_r2(df_model, "PC1")

In [None]:
# Sensitivity analysis of every regressor of interest, benchmarked against PC1.
# The same Sensemakr call used to be copy-pasted once per treatment; the loop
# keeps the arguments in one place so they cannot drift apart.
SENSITIVITY_TREATMENTS = [
    "status_count",
    "followers_count",
    "verified",
    "friend_count",
    "register_time",
]
BENCHMARK_COVARIATES = ["PC1"]
KD_VALUES = [0.2, 0.5, 0.8, 1.0]  # `kd` values passed to Sensemakr

# Every analysis is kept, keyed by treatment, instead of being overwritten.
sensitivity_results = {}
for treatment in SENSITIVITY_TREATMENTS:
    print('\n=== Sensitivity analysis, treatment: %s ===' % treatment)
    baseline_sense = smkr.Sensemakr(model=df_model,
                                    treatment=treatment,
                                    benchmark_covariates=BENCHMARK_COVARIATES,
                                    kd=KD_VALUES)
    baseline_sense.summary()
    sensitivity_results[treatment] = baseline_sense
# After the loop `baseline_sense` refers to the last treatment
# ("register_time"), as it did after the original sequence of cells.