In [1]:
import pandas as pd
from lifelines import CoxPHFitter

# Step 1 - read the analysis table from disk.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
data_path = '/root/DATA/cleaned_final_cox_dataset.csv'
data = pd.read_csv(data_path)

# Step 2 - name the duration column (survival time) and the event column
# (death occurrence). Later cells reuse these two names.
duration_col, event_col = 'los_x', 'expire_flag'

# Step 3 - fit the initial Cox model on the whole frame and show its summary.
# `fit` returns the fitter itself, so construction and fitting can be chained.
cph = CoxPHFitter().fit(data, duration_col=duration_col, event_col=event_col)
print("Initial model summary:")
cph.print_summary()

# Step 4 - collect the covariates whose p-value in the fitted summary is
# strictly below 0.05.
significant_vars = cph.summary.loc[lambda table: table['p'] < 0.05].index.tolist()

# Step 5 - refit using only those covariates. The duration and event columns
# are appended because the fitter still needs them.
data_selected = data.loc[:, [*significant_vars, duration_col, event_col]]
cph_reduced = CoxPHFitter().fit(data_selected, duration_col=duration_col, event_col=event_col)

# Step 6 - show the summary of the reduced model.
print("\nReduced model summary (only significant variables):")
cph_reduced.print_summary()


Initial model summary:


0,1
model,lifelines.CoxPHFitter
duration col,'los_x'
event col,'expire_flag'
baseline estimation,breslow
number of observations,20101
number of events observed,8086
partial log-likelihood,-69129.56
time fit was run,2024-11-13 23:22:00 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
positiveculture,0.05,1.05,0.03,-0.0,0.11,1.0,1.11,0.0,1.91,0.06,4.15
gcs,0.04,1.04,0.02,0.01,0.08,1.01,1.08,0.0,2.58,0.01,6.68
gcsmotor,-0.08,0.92,0.02,-0.13,-0.03,0.88,0.97,0.0,-3.17,<0.005,9.35
lactate,0.02,1.02,0.01,0.0,0.04,1.0,1.04,0.0,2.14,0.03,4.94
bloodureanitrogen,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,4.83,<0.005,19.47
hemoglobin,-0.02,0.98,0.01,-0.05,0.01,0.95,1.01,0.0,-1.38,0.17,2.58
intnormalisedratio,0.02,1.02,0.01,0.0,0.04,1.0,1.04,0.0,2.13,0.03,4.91
albumin,-0.03,0.97,0.05,-0.13,0.07,0.87,1.07,0.0,-0.68,0.50,1.0
chloride,-0.01,0.99,0.0,-0.02,-0.01,0.98,0.99,0.0,-5.09,<0.005,21.4
hematocrit,-0.01,0.99,0.01,-0.02,-0.0,0.98,1.0,0.0,-2.25,0.02,5.37

0,1
Concordance,0.67
Partial AIC,138297.13
log-likelihood ratio test,2464.92 on 19 df
-log2(p) of ll-ratio test,inf



Reduced model summary (only significant variables):


0,1
model,lifelines.CoxPHFitter
duration col,'los_x'
event col,'expire_flag'
baseline estimation,breslow
number of observations,20101
number of events observed,8086
partial log-likelihood,-69136.05
time fit was run,2024-11-13 23:22:02 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
gcs,0.05,1.05,0.02,0.01,0.08,1.01,1.08,0.0,2.68,0.01,7.1
gcsmotor,-0.08,0.92,0.02,-0.13,-0.03,0.88,0.97,0.0,-3.26,<0.005,9.82
lactate,0.02,1.02,0.01,0.0,0.04,1.0,1.04,0.0,2.11,0.04,4.83
bloodureanitrogen,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,5.14,<0.005,21.76
intnormalisedratio,0.02,1.02,0.01,0.0,0.04,1.0,1.04,0.0,2.25,0.02,5.37
chloride,-0.01,0.99,0.0,-0.02,-0.01,0.98,0.99,0.0,-5.0,<0.005,20.72
hematocrit,-0.02,0.98,0.0,-0.02,-0.01,0.98,0.99,0.0,-6.11,<0.005,29.94
age_years,0.02,1.02,0.0,0.02,0.02,1.02,1.03,0.0,24.22,<0.005,427.99
insurance_Medicare,0.15,1.17,0.03,0.09,0.22,1.1,1.24,0.0,4.92,<0.005,20.14
gcsverbal,0.09,1.1,0.02,0.05,0.14,1.05,1.15,0.0,4.08,<0.005,14.41

0,1
Concordance,0.67
Partial AIC,138298.10
log-likelihood ratio test,2451.94 on 13 df
-log2(p) of ll-ratio test,inf


In [3]:
from lifelines import CoxPHFitter
from scipy.stats import chi2  # Import chi2 to calculate the p-value

# Likelihood-ratio comparison of the full Cox model against the reduced model
# that keeps only covariates with p < 0.05 in the full fit.
# Relies on `data`, `duration_col` and `event_col` defined in an earlier cell.

# Fit the full model and record its partial log-likelihood.
cph_full = CoxPHFitter()
cph_full.fit(data, duration_col=duration_col, event_col=event_col)
full_log_likelihood = cph_full.log_likelihood_

# Select significant variables (p < 0.05) from the full model's summary.
significant_vars = cph_full.summary[cph_full.summary['p'] < 0.05].index.tolist()

# Fit the reduced model (using only significant variables) and record its
# partial log-likelihood.
data_reduced = data[significant_vars + [duration_col, event_col]]
cph_reduced = CoxPHFitter()
cph_reduced.fit(data_reduced, duration_col=duration_col, event_col=event_col)
reduced_log_likelihood = cph_reduced.log_likelihood_

# Likelihood ratio test statistic: twice the log-likelihood gap between the
# nested models.
likelihood_ratio_stat = 2 * (full_log_likelihood - reduced_log_likelihood)

# Degrees of freedom equal the number of removed variables.
df = len(cph_full.params_) - len(cph_reduced.params_)

# The chi-square distribution requires df > 0. When no variable was removed
# the two models are identical and there is nothing to test; chi2.sf would
# return NaN, so make that outcome explicit instead of relying on it.
if df > 0:
    p_value = chi2.sf(likelihood_ratio_stat, df)  # upper-tail probability
else:
    p_value = float("nan")

# Print results
print(f"Likelihood Ratio Statistic: {likelihood_ratio_stat:.4f}")
print(f"Degrees of Freedom: {df}")
print(f"P-value: {p_value:.4f}")

if df <= 0:
    # Without this branch, `nan < 0.05` is False and the cell would claim
    # that removing variables is reasonable even though none were removed.
    print("No variables were removed from the full model, so the likelihood ratio test is not applicable.")
elif p_value < 0.05:
    print("The reduced model is significantly worse than the full model, and the removed variables may need to be retained.")
else:
    print("There is no significant difference between the reduced and full models, so removing the variables is reasonable.")


Likelihood Ratio Statistic: 12.9736
Degrees of Freedom: 6
P-value: 0.0435
The reduced model is significantly worse than the full model, and the removed variables may need to be retained.
