# Lights model with PBC2 dataset

In [1]:
# Library setup
%reset -f
%matplotlib inline
import os
# R_HOME must be defined before rpy2 is imported. Keep a value that is already
# exported by the environment and only fall back to this hard-coded framework
# path when nothing is set, so the notebook is not tied to a single machine.
os.environ.setdefault('R_HOME', "/Library/Frameworks/R.framework/Versions/4.0/Resources")
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from tick.plot import plot_point_process
from lights.simulation import SimuJointLongitudinalSurvival
from lights.base.utils import heatmap, annotate_heatmap, gompertz_pdf, \
                              gompertz_survival, visualize_vect_learning
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import LabelEncoder
from lifelines.utils import concordance_index as c_index_score
from IPython.display import Markdown, display
from scipy.stats import beta
from matplotlib import rc
rc('text', usetex=True)
from lights.inference import QNMCEM

def printmd(string):
    """Render `string` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

In [2]:
import rpy2.robjects as robjects
import rpy2.robjects as robjects
from rpy2.robjects import Formula, Environment
from rpy2.robjects.vectors import IntVector, FloatVector, StrVector
from rpy2.robjects.lib import grid
from rpy2.robjects.packages import importr, data
import rpy2.robjects.packages as rpackages
import warnings
# Python handles on R's `base` and `utils` packages.
base = importr('base')
utils = importr('utils')
# Pick CRAN mirror number 1 before any installation is attempted.
utils.chooseCRANmirror(ind=1)
# R packages this notebook relies on; only the ones not yet installed are fetched.
packnames = ('ggplot2', 'tidyverse', 'gridExtra', 'mice')
names_to_install = [pkg for pkg in packnames if not rpackages.isinstalled(pkg)]
if names_to_install:
    utils.install_packages(StrVector(names_to_install))
%load_ext rpy2.ipython

In [3]:
%%R
# Attach the JMbayes package, which ships the pbc2 dataset used in this notebook.
library("JMbayes")
# Load the pbc2 data frame from JMbayes into the R session.
data(pbc2, package = "JMbayes")
# Keep the column names in their own variable so they can be read back
# alongside the data from the R global environment.
colnames_pbc2 <- names(pbc2)

R[write to console]: Le chargement a nécessité le package : nlme

R[write to console]: Le chargement a nécessité le package : survival

R[write to console]: Le chargement a nécessité le package : doParallel

R[write to console]: Le chargement a nécessité le package : foreach

R[write to console]: Le chargement a nécessité le package : iterators

R[write to console]: Le chargement a nécessité le package : parallel

R[write to console]: Le chargement a nécessité le package : rstan

R[write to console]: Le chargement a nécessité le package : StanHeaders

R[write to console]: Le chargement a nécessité le package : ggplot2

R[write to console]: rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)

R[write to console]: For execution on a local, multicore CPU with excess RAM we recommend calling
options(mc.cores = parallel::detectCores()).
To avoid recompilation of unchanged Stan programs, we recommend calling
rstan_options(auto_write = TRUE)



In [10]:
# Dataset setup: bring the PBC2 table over from the embedded R session.
colnames_pbc2 = robjects.globalenv['colnames_pbc2']
# Label the R object's entries with the column names, then transpose so that
# those names end up as the DataFrame columns.
pbc2 = pd.DataFrame(robjects.globalenv['pbc2'], index=colnames_pbc2).T
# Number of distinct subject identifiers.
n_samples = len(pbc2["id"].drop_duplicates())
# Longitudinal markers kept for the model, and how many there are.
long_features_list = ["serBilir", "albumin", "SGOT"]
n_long_features = len(long_features_list)

In [11]:
# data preprocessing
# survival data: distinct (id, years, status2, drug, age, sex) combinations
# (presumably one row per subject -- TODO confirm against the dataset)
survival_data = pbc2[["id", "years", "status2", "drug", "age", "sex"]].drop_duplicates()
# Take an explicit copy before writing the encoded columns: assigning into a
# bare column slice of survival_data triggers pandas' SettingWithCopyWarning
# (hidden here because warnings are globally filtered).
X = survival_data[["drug", "age", "sex"]].copy()
labelencoder = LabelEncoder()
# Encode the categorical covariates as integer labels.
X["drug"] = labelencoder.fit_transform(X["drug"])
X["sex"] = labelencoder.fit_transform(X["sex"])
X = X.values
# Follow-up times and event indicators as flat 1-D arrays.
T = survival_data["years"].values.flatten()
delta = survival_data["status2"].values.flatten()

# longitudinal data
Y_ = pbc2[["id", "year", "serBilir", "albumin", "SGOT"]]
# One row per subject, one column per longitudinal feature; every cell holds a
# pd.Series of that feature's measurements indexed by measurement time.
Y = pd.DataFrame(columns=long_features_list)
for i in range(n_samples):
    # Select the subject's rows once instead of re-evaluating the boolean mask
    # for every feature. Subject ids are matched as i + 1, i.e. they are
    # assumed to be numbered 1..n_samples.
    visits_i = Y_[Y_["id"] == i + 1]
    times_i = visits_i["year"].values
    Y.loc[i] = [pd.Series(visits_i[feature].values, index=times_i)
                for feature in long_features_list]

In [None]:
## Choose parameters ##
# NOTE(review): none of the settings in this first group are passed to the
# learner built below; they are only defined in this cell.
# tolerance for the convergence stopping criterion
tol = 1e-6
# trade-off between l1 and l2 regularization in the elasticNet
eta = 0.3
# way to select l_elasticNet_chosen: '1se' or 'min'
gamma_chosen = '1se'
# at each L-BFGS-B iteration, reset beta to 0 or take the previous value
warm_start = True
# grid size for the cross validation procedure
grid_size = 30
# cross-validation metric: 'log_lik' or 'C-index'
metric = 'C-index'

# Build the learner
fixed_effect_time_order = 1
qnmcem = QNMCEM(
    fixed_effect_time_order=fixed_effect_time_order,
    max_iter=10,
    initialize=True,
    print_every=1,
    compute_obj=True,
)
# Fitted on the full dataset; the alternative call on a train split was:
# qnmcem.fit(X_train, Y_train, T_train, delta_train)
qnmcem.fit(X, Y, T, delta)

# Plot the "obj" history recorded by the learner
visualize_vect_learning(qnmcem, "obj")

## Cross-validation ##

## Run selected model with l_elasticNet_chosen ##

# run final fit here

Launching the solver QNMCEM...
Launching the solver MLMM...
Launching the solver ULMM...




Done solving using ULMM in 2.15e+00 seconds
 n_iter  |   obj    | rel_obj 
       0 |  19009.1 |      inf
       1 |  18977.3 | 1.68e-03
       2 |  19015.8 | 2.03e-03
       3 |    19006 | 5.14e-04
       4 |  19001.3 | 2.50e-04
       5 |  18999.7 | 8.27e-05
       6 |  19000.1 | 2.30e-05
       7 |  19001.9 | 9.50e-05
       8 |  19004.7 | 1.46e-04
       9 |  19008.2 | 1.85e-04
      10 |  19012.3 | 2.14e-04
Done solving using MLMM in 3.85e+00 seconds
 n_iter  |   obj    | rel_obj 
       0 |  64.0245 |      inf
       1 |  60.0704 | 6.18e-02
       2 |      nan |      nan
       3 |      nan |      nan
