<a href="https://colab.research.google.com/github/pryplotsky/C5.-Deep-Survival-Analysis-and-Time-Varying-Covariates/blob/main/CPH_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Install  packages and define global variables
#Basic:
import pandas as pd
import numpy as np
# 1. Read, save and load a data:
import os
import pickle
# 2. Pre-process the data 
from sklearn import preprocessing
# 3. Split the data into 3 sets
import sklearn
from sklearn.model_selection import train_test_split
# 5. Create Cox time ( CPH for TVC) model
!pip install lifelines
import matplotlib.pyplot as plt
from lifelines import CoxTimeVaryingFitter
from lifelines import KaplanMeierFitter
from lifelines.utils import concordance_index
from sklearn.metrics import brier_score_loss

In [6]:
# Global configuration and result containers shared by the cells below.

# Metric accumulators: one entry is appended per (event, evaluation time) pair.
result_c_index = list()      # concordance index values
result_brier_score = list()  # Brier score values

# Evaluation times; compared against the 'times' column of the test data.
pred_time = [21, 23, 40, 50]

# Names of the event indicator columns that are modelled one at a time.
k_event = ['default_time', 'payoff_time']

# Column names of interest; the modelling loop appends the event names to it.
var_list = (
    ['id', 'tte', 'times']
    + ['balance_time', 'LTV_time', 'interest_rate_time', 'rate_time']
    + ['hpi_time', 'gdp_time', 'uer_time']
    + ['avg_balance_time', 'avg_interest_rate_time', 'avg_LTV_time', 'avg_rate_time']
    + ['avg_hpi_time', 'avg_gdp_time', 'avg_uer_time']
    + ['FICO_orig_time', 'REtype_CO_orig_time', 'REtype_PU_orig_time', 'REtype_SF_orig_time']
)

**Main part of the code**

In [3]:
### Class Preparation with 6 methods: __init__, readdf, cleaning, save, load and splitdata
class Preparation:
  """Load, clean, persist and split the data set used by the survival models.

  Attributes:
    df: Most recently read or cleaned DataFrame (None until readdf/cleaning ran).
    X_train, X_test, y_train, y_test: Results of the last splitdata call.
    X_val, y_val: Placeholders for a validation split; no method of this class
      populates them.
    cols_standardize: List of column names; not read by any method of this class.
  """

  def __init__(self):
    """Create an empty helper; every data attribute starts as None."""
    self.df = None
    self.X_train = None
    self.X_val = None
    self.X_test = None
    self.y_train = None
    self.y_val = None
    self.y_test = None
    self.cols_standardize = ['id', 'tte', 'times','label',
       'balance_time', 'LTV_time', 'interest_rate_time', 'rate_time',
       'hpi_time', 'gdp_time', 'uer_time','avg_balance_time', 'avg_interest_rate_time',
       'avg_LTV_time', 'avg_rate_time', 'avg_hpi_time', 'avg_gdp_time',
       'avg_uer_time','FICO_orig_time', 'REtype_CO_orig_time',
       'REtype_PU_orig_time', 'REtype_SF_orig_time', 'default_time', 'payoff_time']

### Pre-processing:
  def readdf(self, sep=",", filename="dcr_cleaned.csv", cwd=None):
    """Read a delimited text file into self.df and return it.

    Args:
      sep: Field separator passed to pandas.read_csv.
      filename: Name of the file inside `cwd`.
      cwd: Directory that contains the file. None (default) means the working
        directory at call time, so a preceding os.chdir is honoured.

    Returns:
      The loaded DataFrame (also stored on self.df).
    """
    directory = os.getcwd() if cwd is None else cwd
    self.df = pd.read_csv(os.path.join(directory, filename), sep=sep)
    return self.df

  def cleaning(self, data, longformat=True, individual="id", stop="times", stopname="start"):
    """Optionally add the interval-start column needed for (start, stop] long format.

    When `longformat` is true, column `stopname` is added to `data` IN PLACE:
    within each `individual` group it holds the previous row's `stop` value,
    and 0 for the first row of the group.
    NOTE(review): this presumes rows are already ordered by time within each
    individual - confirm against the input file.

    Args:
      data: DataFrame to prepare.
      longformat: Whether to add the interval-start column.
      individual: Name of the subject identifier column.
      stop: Name of the column holding the interval end.
      stopname: Name of the interval-start column to create.

    Returns:
      `data` itself (also stored on self.df), whether or not `longformat` is set.
    """
    if longformat:
      previous_stop = data.groupby(individual)[stop].shift(1)
      data[stopname] = previous_stop.fillna(0)
    # Always record and return the frame so longformat=False no longer yields None.
    self.df = data
    return self.df

### Saving and loading objects in binary mode
  def save(self, dataname, dataframe, cwd=None):
    """Pickle `dataframe` to '<cwd>/<dataname>.pkl'.

    Args:
      dataname: File name without the '.pkl' extension.
      dataframe: Object to serialise.
      cwd: Target directory; None (default) means the working directory at call time.
    """
    directory = os.getcwd() if cwd is None else cwd
    with open(os.path.join(directory, dataname + '.pkl'), 'wb') as handle:
      pickle.dump(dataframe, handle)

  def load(self, dataname, cwd=None):
    """Unpickle and return the object stored in '<cwd>/<dataname>.pkl'.

    Args:
      dataname: File name without the '.pkl' extension.
      cwd: Source directory; None (default) means the working directory at call time.

    Returns:
      The unpickled object.
    """
    directory = os.getcwd() if cwd is None else cwd
    with open(os.path.join(directory, dataname + '.pkl'), 'rb') as handle:
      # SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
      return pickle.load(handle)

### Split the data into a train and a test set (80% / 20% with the default perc_test)
  def splitdata(self, Xvar, yvar, perc_test=0.2, perc_val=1/5, shuffle=False):
    """Split `Xvar`/`yvar` into train and test parts and store them on the instance.

    The result is written to self.X_train, self.X_test, self.y_train and
    self.y_test, and the two set sizes are printed. No validation set is
    produced, so self.X_val and self.y_val stay untouched.
    NOTE(review): the split is done per row; for long-format data the rows of
    one subject may end up in both sets - confirm this is intended.

    Args:
      Xvar: Independent variables.
      yvar: Dependent variable.
      perc_test: Share of rows assigned to the test set.
      perc_val: Currently unused; kept so existing calls keep working.
      shuffle: Forwarded to train_test_split; False keeps the existing row order.
    """
    self.X_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split(
        Xvar, yvar, test_size=perc_test, random_state=1234, shuffle=shuffle)
    print(len(self.X_train),  len(self.X_test)) # sizes of the train and test sets

In [4]:
def brier_score(Prediction, Time_survival, Death, Time):
    """Mean squared difference between predictions and the event status at `Time`.

    The target for each observation is 1.0 when its time in `Time_survival` is
    at most `Time` and its `Death` flag is set, otherwise 0.0.

    Args:
        Prediction: Predicted values, one per observation (list, numpy array or
            pandas object). Values are compared by position, never by index label.
        Time_survival: Observed times, one per observation.
        Death: Event indicators (1/True = event observed), one per observation.
        Time: Evaluation time.

    Returns:
        The mean squared error as a float (nan for empty input).
    """
    # Coerce everything to flat numpy arrays so lists/arrays work as well as pandas
    # objects and no index alignment can take place.
    predicted = np.ravel(np.asarray(Prediction, dtype=float))
    observed_time = np.ravel(np.asarray(Time_survival))
    event_flag = np.ravel(np.asarray(Death))
    y_true = ((observed_time <= Time) * event_flag).astype(float)
    return np.mean((predicted - y_true) ** 2)

**Cox proportional hazards model with time-varying covariates (lifelines `CoxTimeVaryingFitter`)**

In [None]:
# Fit one Cox time-varying model per event and evaluate it at every time in pred_time.
# Results are appended event by event: all evaluation times of the first event,
# then all evaluation times of the second event, and so on.

# Columns handed to the model: 'id', 'start' and 'times' define the intervals,
# every other column listed here enters the model as a covariate.
# NOTE(review): 'tte' is therefore used as a covariate - if it encodes the
# time to event it leaks the outcome; confirm that this is intended.
cph_columns = ['id', 'tte', 'times',
       'balance_time', 'LTV_time', 'interest_rate_time', 'rate_time',
       'hpi_time', 'gdp_time', 'uer_time','FICO_orig_time',  'start']

# Start from empty result lists so that re-running this cell does not append duplicates.
result_c_index.clear()
result_brier_score.clear()

# Reading and cleaning do not depend on the event, so do them once.
temp=Preparation()#define an object
newdf = temp.readdf()#read the dataframe
df_to_use=temp.cleaning(temp.df)#add the interval-start column needed by the model

for j in k_event:#loop over the events
  if j not in var_list:#keep var_list free of duplicates when the cell is re-run
    var_list.append(j)
  temp.splitdata(df_to_use.loc[:,df_to_use.columns != j], yvar=df_to_use[j])# split data into train and test set
  X_train=temp.X_train#train df with covariates
  y_train=temp.y_train#train df with dependent variable
  df_full=X_train[cph_columns]
  df2=pd.concat([df_full,  y_train], axis=1)#merge df with dependent and independent variables
  df3=df2.dropna()#drop missing values

  X_test=temp.X_test[cph_columns]#test df with covariates
  y_test=temp.y_test#test df with dependent variable
  df2_test=pd.concat([X_test,  y_test], axis=1)#merge df with dependent and independent variables

  # The fit does not depend on the evaluation time, so fit once per event.
  ctv = CoxTimeVaryingFitter(penalizer=0.1)#Cox time-varying proportional hazards model
  ctv.fit(df3, id_col="id", event_col=j, start_col="start", stop_col="times", show_progress=True )
  ctv.print_summary() #summary of the fitted model
  plt.figure()#fresh figure so the plots of different events are not drawn on top of each other
  ctv.plot() #visualisation of the estimated coefficients
  plt.title(f"Cox time-varying model coefficients: {j}")
  plt.show()
  baseline_cum_hazard=ctv.baseline_cumulative_hazard_['baseline hazard'].sort_index()

  for i in pred_time:#loop over the evaluation times
    df2_testb=df2_test.loc[df2_test['times']<=i]#keep test rows observed up to the evaluation time
    df3_test=df2_testb.dropna()#drop missing values
    # Baseline cumulative hazard at time i: take the last value at or before i,
    # so an evaluation time that is not itself an index entry does not raise KeyError.
    reached=baseline_cum_hazard[baseline_cum_hazard.index<=i]
    h0_at_i=reached.iloc[-1] if len(reached) else 0.0
    cum_hazard=h0_at_i * ctv.predict_partial_hazard(df3_test)#predicted cumulative hazard at time i
    pred=1 - np.exp(-cum_hazard)#event probability by time i: F(t) = 1 - exp(-H(t))
    res1=concordance_index(df3_test['times'], -cum_hazard, df3_test[j])# c-index (higher hazard = earlier event)
    res2=brier_score(pred, df3_test['times']  ,df3_test[j] , i)# Brier score on probabilities
    result_c_index.append(res1)
    result_brier_score.append(res2)

In [None]:
# Build the results table. The metrics were appended event by event (outer loop
# over k_event, inner loop over pred_time), so the labels must follow the same
# order: each event repeated for every evaluation time, and the evaluation
# times tiled once per event.
df_to_print=pd.DataFrame({
    'Event': np.repeat(k_event, len(pred_time)),
    'Time': np.tile(pred_time, len(k_event)),
    'C-index': result_c_index,
    'Brier score': result_brier_score,
})
df_to_print.to_excel('df_res.xlsx')# Export table with results
df_to_print