# Assignment 1 - Clustering

#### Checking that the extracted dummy data can be read

In [7]:
import pandas as pd

# Load the dummy dataset and confirm it parsed as expected.
df = pd.read_csv('data_censored.csv')

print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

   id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  \
0   1       0          1   1  1.146148   0  0.734203   36  0.083333        0   
1   1       1          1   1  0.002200   0  0.734203   37  0.166667        0   
2   1       2          1   0 -0.481762   0  0.734203   38  0.250000        0   
3   1       3          1   0  0.007872   0  0.734203   39  0.333333        0   
4   1       4          1   1  0.216054   0  0.734203   40  0.416667        0   

   censored  eligible  
0         0         1  
1         0         0  
2         0         0  
3         0         0  
4         0         0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         6 non-null      int64  
 1   period     6 non-null      int64  
 2   treatment  6 non-null      int64  
 3   x1         6 non-null      int64  
 4   x2         6 non-null      fl

#### Converting R to Python

In [None]:
import os
import tempfile
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.generalized_linear_model import GLM
import matplotlib.pyplot as plt

# Helper function to create directories
def create_directory(path):
    """Create the directory ``path``, including any missing parents.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Filesystem path of the directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between the check and
    # the creation.
    os.makedirs(path, exist_ok=True)

# Helper function to load data
def load_data(data_name):
    """Read a CSV source into a pandas DataFrame.

    Args:
        data_name: Path (or any other source ``pd.read_csv`` accepts) of the
            CSV to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(data_name)
    return frame

# Helper function to set data
def set_data(trial, data, id, period, treatment, outcome, eligible):
    """Attach a dataset and its role descriptors to ``trial``.

    Each argument is stored on ``trial`` under the key of the same name, in
    signature order. ``trial`` is modified in place.

    Args:
        trial: Mapping that collects the trial configuration.
        data: The dataset; stored as-is (no copy is made).
        id: Value stored under ``'id'``.
        period: Value stored under ``'period'``.
        treatment: Value stored under ``'treatment'``.
        outcome: Value stored under ``'outcome'``.
        eligible: Value stored under ``'eligible'``.

    Returns:
        The same ``trial`` object that was passed in.
    """
    entries = (
        ('data', data),
        ('id', id),
        ('period', period),
        ('treatment', treatment),
        ('outcome', outcome),
        ('eligible', eligible),
    )
    for key, value in entries:
        trial[key] = value
    return trial

# Helper function to set switch weight model
def set_switch_weight_model(trial, numerator, denominator, model_fitter):
    """Run ``model_fitter`` and store its result as the switch-weight model.

    Args:
        trial: Mapping that collects the trial configuration; updated in place.
        numerator: First argument handed to ``model_fitter``.
        denominator: Second argument handed to ``model_fitter``.
        model_fitter: Callable invoked as ``model_fitter(numerator, denominator)``.

    Returns:
        The same ``trial`` object, with the fitter's return value stored under
        ``'switch_weights'``.
    """
    fitted = model_fitter(numerator, denominator)
    trial['switch_weights'] = fitted
    return trial

# Helper function to set censor weight model
def set_censor_weight_model(trial, censor_event, numerator, denominator, pool_models, model_fitter):
    """Run ``model_fitter`` and store its result as the censor-weight model.

    Args:
        trial: Mapping that collects the trial configuration; updated in place.
        censor_event: Accepted but not used by this implementation.
        numerator: First argument handed to ``model_fitter``.
        denominator: Second argument handed to ``model_fitter``.
        pool_models: Accepted but not used by this implementation.
        model_fitter: Callable invoked as ``model_fitter(numerator, denominator)``.

    Returns:
        The same ``trial`` object, with the fitter's return value stored under
        ``'censor_weights'``.
    """
    fitted = model_fitter(numerator, denominator)
    trial['censor_weights'] = fitted
    return trial

# Helper function to calculate weights
def calculate_weights(trial):
    """Store placeholder weights on ``trial``.

    No real weight calculation happens yet: every entry of ``trial['data']``
    simply receives a weight of 1.0.

    Args:
        trial: Mapping holding the dataset under ``'data'``; updated in place.

    Returns:
        The same ``trial`` object, with a NumPy array of ones (one per
        ``len(trial['data'])``) stored under ``'weights'``.
    """
    n_rows = len(trial['data'])
    # Placeholder: unit weights until the real calculation is implemented.
    unit_weights = np.ones(n_rows)
    trial['weights'] = unit_weights
    return trial

# Helper function to show weight models
def show_weight_models(trial):
    """Print the weight models stored on ``trial``.

    Prints ``trial['switch_weights']`` followed by ``trial['censor_weights']``.
    A model that has not been set on the trial is skipped rather than raising
    ``KeyError``, so a trial that only has a censor-weight model can still be
    displayed.

    Args:
        trial: Mapping that may hold ``'switch_weights'`` and/or
            ``'censor_weights'`` entries.
    """
    for key in ('switch_weights', 'censor_weights'):
        if key in trial:
            print(trial[key])

# Helper function to set outcome model
def set_outcome_model(trial, adjustment_terms=None):
    """Record the outcome-model specification on ``trial``.

    Args:
        trial: Mapping that collects the trial configuration; updated in place.
        adjustment_terms: Value stored under ``'outcome_model'``; defaults to
            ``None``.

    Returns:
        The same ``trial`` object.
    """
    spec = adjustment_terms
    trial['outcome_model'] = spec
    return trial

# Helper function to set expansion options
def set_expansion_options(trial, output, chunk_size):
    """Record the trial-expansion settings on ``trial``.

    Args:
        trial: Mapping that collects the trial configuration; updated in place.
        output: Value stored under ``'output'``.
        chunk_size: Value stored under ``'chunk_size'``.

    Returns:
        The same ``trial`` object.
    """
    for key, value in (('output', output), ('chunk_size', chunk_size)):
        trial[key] = value
    return trial

# Helper function to expand trials
def expand_trials(trial):
    """Store a placeholder "expanded" dataset on ``trial``.

    No real expansion happens yet: the result is simply ``trial['data'].copy()``.

    Args:
        trial: Mapping holding the dataset under ``'data'``; updated in place.

    Returns:
        The same ``trial`` object, with the copy stored under ``'expansion'``.
    """
    # Placeholder: the expansion is just a copy of the input data for now.
    expanded = trial['data'].copy()
    trial['expansion'] = expanded
    return trial

# Helper function to load expanded data
def load_expanded_data(trial, seed, p_control):
    """Add a randomly drawn 0/1 ``'control'`` column to ``trial['data']``.

    One Bernoulli(``p_control``) value is drawn per ``len(trial['data'])`` and
    written into ``trial['data']`` in place under the key ``'control'``.

    Args:
        trial: Mapping holding the dataset under ``'data'``.
        seed: Seed for the random draws; the same seed gives the same column.
        p_control: Probability that an entry is drawn as 1.

    Returns:
        The same ``trial`` object.
    """
    # A private RandomState produces the same draws that np.random.seed(seed)
    # followed by np.random.binomial(...) would, but leaves NumPy's global
    # random state untouched for the rest of the program.
    rng = np.random.RandomState(seed)
    n_rows = len(trial['data'])
    trial['data']['control'] = rng.binomial(1, p_control, size=n_rows)
    return trial

# Helper function to fit marginal structural model
def fit_msm(trial, weight_cols, modify_weights):
    """Store a placeholder "fitted" marginal structural model on ``trial``.

    Any existing ``trial['outcome_model']`` is replaced by a nested dict whose
    ``['fitted']['model']`` entry holds:

    * ``'model'``: the result of ``modify_weights(trial['weights'])``
    * ``'vcov'``: an identity matrix with one row per ``len(trial['weights'])``

    Args:
        trial: Mapping holding ``'weights'``; updated in place.
        weight_cols: Accepted but not used by this implementation.
        modify_weights: Callable applied to ``trial['weights']``.

    Returns:
        The same ``trial`` object.
    """
    fitted_model = {}
    # Install the (still empty) structure first so the trial's state matches
    # step-by-step construction even if a later step raises.
    trial['outcome_model'] = {'fitted': {'model': fitted_model}}
    fitted_model['model'] = modify_weights(trial['weights'])
    n_weights = len(trial['weights'])
    fitted_model['vcov'] = np.identity(n_weights)
    return trial

# Helper function to predict
def predict(trial, newdata, predict_times, type):
    """Return placeholder survival-difference predictions.

    The values are not derived from any model: one uniform random number is
    drawn from NumPy's global generator (``np.random.rand``) per prediction
    time, and the interval bounds are that value minus/plus 0.05.

    Args:
        trial: Accepted but not used by this implementation.
        newdata: Accepted but not used by this implementation.
        predict_times: Sized collection of follow-up times; stored as-is.
        type: Accepted but not used by this implementation.

    Returns:
        ``{'difference': {...}}`` where the inner dict has the keys
        ``'followup_time'``, ``'survival_diff'``, ``'2.5%'`` and ``'97.5%'``.
    """
    n_times = len(predict_times)
    survival_diff = np.random.rand(n_times)
    difference = {
        'followup_time': predict_times,
        'survival_diff': survival_diff,
        '2.5%': survival_diff - 0.05,
        '97.5%': survival_diff + 0.05,
    }
    return {'difference': difference}

# Helper function for stats_glm_logit
def stats_glm_logit(save_path):
    """Build a fitter that fits a binomial-family GLM.

    Args:
        save_path: Accepted but not used by this implementation.

    Returns:
        A callable ``model_fitter(numerator, denominator)`` that constructs
        ``GLM(numerator, denominator, family=Binomial())`` and returns the
        result of its ``fit()``.
    """
    def model_fitter(numerator, denominator):
        # Placeholder fitting logic: both arguments are forwarded positionally
        # to GLM without any preprocessing.
        # NOTE(review): callers in this file pass formula-like strings here --
        # confirm GLM accepts its first two arguments in that form.
        return GLM(numerator, denominator, family=Binomial()).fit()

    return model_fitter

# Helper function for save_to_datatable
def save_to_datatable():
    """Return the marker string that selects "datatable" output.

    Returns:
        The literal string ``"datatable"``.
    """
    output_kind = "datatable"
    return output_kind

# Helper function for outcome_data
def outcome_data(trial):
    """Return the dataset stored on ``trial``.

    Args:
        trial: Mapping holding the dataset under ``'data'``.

    Returns:
        ``trial['data']`` itself (not a copy).
    """
    dataset = trial['data']
    return dataset

# Main code translation
# Each trial is a plain dict that the helper functions above fill in step by
# step; "estimand" records which analysis the dict is for.
trial_pp = {"estimand": "PP"}  # Per-protocol
trial_itt = {"estimand": "ITT"}  # Intention-to-treat
# One working directory per estimand, created under the system temp dir.
trial_pp_dir = os.path.join(tempfile.gettempdir(), "trial_pp")
create_directory(trial_pp_dir)
trial_itt_dir = os.path.join(tempfile.gettempdir(), "trial_itt")
create_directory(trial_itt_dir)

data_censored = load_data("data_censored.csv")  # dummy data in the package
print(data_censored.head())

# NOTE(review): both trials below receive the very same data_censored object
# (set_data stores it without copying), so an in-place change made through one
# trial's 'data' is visible through the other -- confirm this is intended.

# Per-protocol
trial_pp = set_data(
    trial_pp,
    data=data_censored,
    id="id",
    period="period",
    treatment="treatment",
    outcome="outcome",
    eligible="eligible"
)

# ITT
trial_itt = set_data(
    trial_itt,
    data=data_censored,
    id="id",
    period="period",
    treatment="treatment",
    outcome="outcome",
    eligible="eligible"
)

# Switch-weight model; only set on the per-protocol trial.
# NOTE(review): numerator/denominator are formula-like strings that the fitter
# returned by stats_glm_logit forwards unchanged to GLM(...) -- verify that GLM
# accepts them in this form.
trial_pp = set_switch_weight_model(
    trial_pp,
    numerator="age",
    denominator="age + x1 + x3",
    model_fitter=stats_glm_logit(save_path=os.path.join(trial_pp_dir, "switch_models"))
)
print(trial_pp['switch_weights'])

# Censor-weight models; set on both trials, differing only in pool_models and
# in the directory used for save_path.
trial_pp = set_censor_weight_model(
    trial_pp,
    censor_event="censored",
    numerator="x2",
    denominator="x2 + x1",
    pool_models="none",
    model_fitter=stats_glm_logit(save_path=os.path.join(trial_pp_dir, "switch_models"))
)
print(trial_pp['censor_weights'])

trial_itt = set_censor_weight_model(
    trial_itt,
    censor_event="censored",
    numerator="x2",
    denominator="x2 + x1",
    pool_models="numerator",
    model_fitter=stats_glm_logit(save_path=os.path.join(trial_itt_dir, "switch_models"))
)
print(trial_itt['censor_weights'])

trial_pp = calculate_weights(trial_pp)
trial_itt = calculate_weights(trial_itt)
# NOTE(review): no switch-weight model is ever set on trial_itt in this script,
# so it has no 'switch_weights' entry at this point -- make sure
# show_weight_models tolerates that.
show_weight_models(trial_itt)
show_weight_models(trial_pp)

# Outcome model: no adjustment terms for per-protocol, "x2" for ITT.
trial_pp = set_outcome_model(trial_pp)
trial_itt = set_outcome_model(trial_itt, adjustment_terms="x2")

trial_pp = set_expansion_options(
    trial_pp,
    output=save_to_datatable(),
    chunk_size=500  # the number of patients to include in each expansion iteration
)

trial_itt = set_expansion_options(
    trial_itt,
    output=save_to_datatable(),
    chunk_size=500
)

trial_pp = expand_trials(trial_pp)
trial_itt = expand_trials(trial_itt)

print(trial_pp['expansion'])

# Adds a random 0/1 'control' column to trial_itt['data'] (seeded for
# reproducibility).
trial_itt = load_expanded_data(trial_itt, seed=1234, p_control=0.5)

# Replaces trial_itt['outcome_model'] (previously "x2") with the fitted-model
# structure built by fit_msm.
trial_itt = fit_msm(
    trial_itt,
    weight_cols=["weight", "sample_weight"],
    modify_weights=lambda w: np.minimum(w, np.quantile(w, 0.99))  # winsorization of extreme weights
)
print(trial_itt['outcome_model'])
print(trial_itt['outcome_model']['fitted']['model']['model'])
print(trial_itt['outcome_model']['fitted']['model']['vcov'])
print(trial_itt)

# NOTE(review): newdata is filtered on a 'trial_period' column; nothing in this
# script adds such a column to trial_itt['data'] and the data preview earlier in
# the notebook does not show one -- confirm it exists, otherwise this lookup
# raises KeyError.
preds = predict(
    trial_itt,
    newdata=outcome_data(trial_itt)[trial_itt['data']['trial_period'] == 1],
    predict_times=np.arange(0, 11),
    type="survival"
)

# Plot the survival difference over follow-up time, with the 2.5% and 97.5%
# bounds drawn as dashed red lines.
plt.plot(preds['difference']['followup_time'], preds['difference']['survival_diff'], label="Survival difference")
plt.plot(preds['difference']['followup_time'], preds['difference']['2.5%'], color="red", linestyle="dashed", label="2.5%")
plt.plot(preds['difference']['followup_time'], preds['difference']['97.5%'], color="red", linestyle="dashed", label="97.5%")
plt.xlabel("Follow up")
plt.ylabel("Survival difference")
plt.legend()
plt.show()