In [2]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from lifelines.utils import concordance_index

# Load the data from the .xlsx file
data = pd.read_excel('data1.xlsx')

# Define categorical variables
categorical_cols = ['SEX', 'CompositeStage', 'LNInvolment', 'Comorbidity', 'FamiliyHistoryOfCancer']
data[categorical_cols] = data[categorical_cols].astype('category')

# One-hot encode categorical variables
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Standardize the covariates
scaler = StandardScaler()
data_encoded[['DEATH', 'AGE']] = scaler.fit_transform(data_encoded[['DEATH', 'AGE']])
buckley_james_data = data_encoded[['Months', 'DEATH', 'AGE'] + [col for col in data_encoded.columns if col.startswith('SEX_') or col.startswith('CompositeStage_') or col.startswith('LNInvolment_') or col.startswith('Comorbidity_') or col.startswith('FamiliyHistoryOfCancer_')]]
cph = CoxPHFitter(penalizer=0.1)
cph.fit(buckley_james_data, 'Months', 'DEATH', show_progress=True)
print(cph.summary)
#cph.print_summary()

Iteration 1: norm_delta = 0.68300, step_size = 0.9500, log_lik = -1663.17959, newton_decrement = 54.49713, seconds_since_start = 0.0
Iteration 2: norm_delta = 0.08068, step_size = 0.9500, log_lik = -1616.75385, newton_decrement = 1.16193, seconds_since_start = 0.0
Iteration 3: norm_delta = 0.00455, step_size = 0.9500, log_lik = -1615.59252, newton_decrement = 0.00340, seconds_since_start = 0.0
Iteration 4: norm_delta = 0.00000, step_size = 1.0000, log_lik = -1615.58912, newton_decrement = 0.00000, seconds_since_start = 0.0
Convergence success after 4 iterations.
                              coef  exp(coef)  se(coef)  coef lower 95%  \
covariate                                                                 
AGE                       0.026478   1.026831  0.056826       -0.084899   
SEX_2                    -0.027535   0.972841  0.107577       -0.238382   
CompositeStage_2         -0.151530   0.859392  0.173728       -0.492030   
CompositeStage_3          0.026319   1.026669  0.188558 

In [3]:
concordance_values = {}
for column in cph.summary.index:
    if column != 'ID':
        concordance_values[column] = concordance_index(buckley_james_data[column], cph.predict_partial_hazard(buckley_james_data))
print("Concordance values of Univariate Variables:\n")
print(concordance_values)

Concordance values of Univariate Variables:

{'AGE': 0.5306817586333386, 'SEX_2': 0.45524296675191817, 'CompositeStage_2': 0.2839379059084429, 'CompositeStage_3': 0.15888888888888889, 'CompositeStage_4': 1.0, 'LNInvolment_1': 0.1500506072874494, 'Comorbidity_1': 0.3904086109968463, 'FamiliyHistoryOfCancer_1': 0.45142900577683187}


In [228]:
univariate_results = []
for col in data.columns:
    if col not in ['Months', 'ID','DEATH']:
        cph_univariate = CoxPHFitter(penalizer=0.1)
        cph_univariate.fit(data[[col, 'Months', 'DEATH']], 'Months', 'DEATH', show_progress=True)
        univariate_results.append((col, cph_univariate.summary))
        n = len(data)
        llf = cph_univariate.log_likelihood_
        k = cph_univariate.params_.shape[0]
        aic = -2 * llf + 2 * k
        bic = -2 * llf + k * np.log(n)
        univariate_aic_bic.append((col, aic, bic))
        print(f"\nAIC value of {col}:", aic)
        print(f"BIC value of {col}:", bic)

# Print the summaries of the univariate analysis
for col, summary in univariate_results:
    print(f"Univariate analysis of: {col}")
    print(summary)
    print("\n")

Iteration 1: norm_delta = 0.09094, step_size = 0.9500, log_lik = -943.70062, newton_decrement = 0.84604, seconds_since_start = 0.0
Iteration 2: norm_delta = 0.00646, step_size = 0.9500, log_lik = -942.84679, newton_decrement = 0.00411, seconds_since_start = 0.0
Iteration 3: norm_delta = 0.00033, step_size = 0.9500, log_lik = -942.84269, newton_decrement = 0.00001, seconds_since_start = 0.1
Iteration 4: norm_delta = 0.00000, step_size = 1.0000, log_lik = -942.84268, newton_decrement = 0.00000, seconds_since_start = 0.1
Convergence success after 4 iterations.

AIC value of AGE: 1887.6853655755
BIC value of AGE: 1891.523096022666
Iteration 1: norm_delta = 0.02152, step_size = 0.9500, log_lik = -943.70062, newton_decrement = 0.04824, seconds_since_start = 0.0
Iteration 2: norm_delta = 0.00117, step_size = 0.9500, log_lik = -943.65238, newton_decrement = 0.00014, seconds_since_start = 0.0
Iteration 3: norm_delta = 0.00006, step_size = 0.9500, log_lik = -943.65224, newton_decrement = 0.00000

In [229]:
# Print AIC and BIC for univariate models
print("\nAIC and BIC for univariate models:")
for col, aic, bic in univariate_aic_bic:
    print(f"{col}: AIC={aic}, BIC={bic}")


AIC and BIC for univariate models:
SEX_2: AIC=3330.115165027162, BIC=3337.790625921494
CompositeStage_2: AIC=3321.5101024525675, BIC=3329.1855633468995
CompositeStage_3: AIC=3317.0745678980034, BIC=3324.7500287923353
CompositeStage_4: AIC=3242.6468994706224, BIC=3250.3223603649544
LNInvolment_1: AIC=3322.1186323122874, BIC=3329.7940932066194
Comorbidity_1: AIC=3328.7696858848362, BIC=3336.445146779168
FamiliyHistoryOfCancer_1: AIC=3329.9669227876907, BIC=3337.6423836820227
AGE: AIC=1887.6853655755, BIC=1891.523096022666
SEX: AIC=1889.3044715567958, BIC=1893.1422020039618
CompositeStage: AIC=1819.2296914692981, BIC=1823.0674219164641
LNInvolment: AIC=1879.6997872387765, BIC=1883.5375176859425
Comorbidity: AIC=1888.3125005206216, BIC=1892.1502309677876
FamiliyHistoryOfCancer: AIC=1889.243255971226, BIC=1893.080986418392
AGE: AIC=1887.6853655755, BIC=1891.523096022666
SEX: AIC=1889.3044715567958, BIC=1893.1422020039618
CompositeStage: AIC=1819.2296914692981, BIC=1823.0674219164641
LNInvo

In [230]:
significant_variables_multivariate = [(var, summary) for var, summary in multivariate_results if summary['p'][var] < 0.05]
print("\nSignificant variables from univariate analysis:")
for var, summary in significant_variables_multivariate:
    print(f"\n{var}:")
    print(summary)


Significant variables from univariate analysis:

CompositeStage:
                    coef  exp(coef)  se(coef)  coef lower 95%  coef upper 95%  \
covariate                                                                       
CompositeStage  0.451465   1.570611  0.061942        0.330061        0.572868   
AGE             0.013370   1.013460  0.053478       -0.091446        0.118185   

                exp(coef) lower 95%  exp(coef) upper 95%  cmp to         z  \
covariate                                                                    
CompositeStage             1.391053             1.773346     0.0  7.288543   
AGE                        0.912611             1.125453     0.0  0.250006   

                           p   -log2(p)  
covariate                                
CompositeStage  3.133250e-13  41.537405  
AGE             8.025824e-01   0.317279  

LNInvolment:
                 coef  exp(coef)  se(coef)  coef lower 95%  coef upper 95%  \
covariate                           

In [231]:
#significant_variables = [(var, p_value) for var, p_value in multivariate_results if p_value < 0.05]
print("Updated data with significant variables as categorical data:")
print(data)

#data_encoded = pd.get_dummies(data, columns=[var for var, _ in significant_variables], drop_first=True)

# Update the Buckley-James data with the new categorical variables
categorical_columns = ['SEX_', 'CompositeStage_', 'LNInvolment_', 'Comorbidity_', 'FamiliyHistoryOfCancer_']
buckley_james_data = data_encoded[['Months', 'DEATH', 'AGE'] + [col for col in data_encoded.columns if any(col.startswith(cat_col) for cat_col in categorical_columns)]]

Updated data with significant variables as categorical data:
      ID  Months  DEATH  AGE SEX CompositeStage LNInvolment Comorbidity  \
0      1      70      0   50   1              3           1           1   
1      2      68      0   50   2              1           0           1   
2      3      69      0   52   1              2           0           1   
3      4      43      1   55   2              2           0           0   
4      5      71      0   69   2              3           1           1   
..   ...     ...    ...  ...  ..            ...         ...         ...   
338  339      65      0   41   1              3           1           1   
339  340      61      0   52   1              2           0           1   
340  341      65      0   61   2              2           0           1   
341  342      16      1   71   2              4           0           0   
342  343      31      1   60   2              4           1           0   

    FamiliyHistoryOfCancer  
0        

In [232]:
cph_multivariate = CoxPHFitter(penalizer=0.1)
cph_multivariate.fit(buckley_james_data[['Months', 'DEATH', 'AGE'] + [var for var, _ in significant_variables]], 'Months', 'DEATH', show_progress=True)
print(cph_multivariate.summary)

Iteration 1: norm_delta = 0.68301, step_size = 0.9500, log_lik = -1663.17959, newton_decrement = 54.30464, seconds_since_start = 0.0
Iteration 2: norm_delta = 0.08057, step_size = 0.9500, log_lik = -1616.92886, newton_decrement = 1.14381, seconds_since_start = 0.1
Iteration 3: norm_delta = 0.00450, step_size = 0.9500, log_lik = -1615.78591, newton_decrement = 0.00329, seconds_since_start = 0.1
Iteration 4: norm_delta = 0.00000, step_size = 1.0000, log_lik = -1615.78262, newton_decrement = 0.00000, seconds_since_start = 0.1
Convergence success after 4 iterations.
                      coef  exp(coef)  se(coef)  coef lower 95%  \
covariate                                                         
AGE               0.018283   1.018451  0.053798       -0.087159   
CompositeStage_2 -0.149303   0.861308  0.173490       -0.489338   
CompositeStage_3  0.032246   1.032772  0.187625       -0.335493   
CompositeStage_4  1.137077   3.117641  0.179401        0.785458   
LNInvolment_1    -0.345846   

In [241]:
concordance_dict = {}

for var in cph_multivariate.params_.index:
    concordance = cph_multivariate.concordance_index_
    concordance_dict[var] = concordance
print("Concordance values of Multivariate Variables:\n")
print(concordance_dict)

Concordance values of Multivariate Variables:

{'AGE': 0.6582635491564964, 'CompositeStage_2': 0.6582635491564964, 'CompositeStage_3': 0.6582635491564964, 'CompositeStage_4': 0.6582635491564964, 'LNInvolment_1': 0.6582635491564964}


In [233]:
n = len(buckley_james_data)
llf = cph_multivariate.log_likelihood_
k = cph_multivariate.params_.shape[0]
multivariate_aic = -2 * llf + 2 * k
multivariate_bic = -2 * llf + k * np.log(n)
print(cph_multivariate.summary)


                      coef  exp(coef)  se(coef)  coef lower 95%  \
covariate                                                         
AGE               0.018283   1.018451  0.053798       -0.087159   
CompositeStage_2 -0.149303   0.861308  0.173490       -0.489338   
CompositeStage_3  0.032246   1.032772  0.187625       -0.335493   
CompositeStage_4  1.137077   3.117641  0.179401        0.785458   
LNInvolment_1    -0.345846   0.707622  0.138427       -0.617158   

                  coef upper 95%  exp(coef) lower 95%  exp(coef) upper 95%  \
covariate                                                                    
AGE                     0.123725             0.916532             1.131705   
CompositeStage_2        0.190732             0.613032             1.210135   
CompositeStage_3        0.399985             0.714986             1.491802   
CompositeStage_4        1.488695             2.193411             4.431311   
LNInvolment_1          -0.074533             0.539475         

In [235]:
# Print AIC and BIC for multivariate model
print("\nAIC value of the multivariate model:", multivariate_aic)
print("BIC value of the multivariate model:", multivariate_bic)


AIC value of the multivariate model: 3241.5652333399553
BIC value of the multivariate model: 3260.753885575785
