In [37]:
#imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_predict,
                                     cross_validate, train_test_split)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from collections import Counter
from sklearn.linear_model import LogisticRegression
import statistics
import shap
from sksurv.ensemble import RandomSurvivalForest
from lifelines import KaplanMeierFitter

from lifelines.statistics import logrank_test

random_state = 42

In [38]:
#data loading
ds = pd.read_csv('preprocessed_cpa_csv')
ds = ds.drop(columns = ['Unnamed: 0', 'ID', 'CT-SCAN', 'date', 'fullID', 'death_date','closest','preexisting_cond',])


In [39]:
#Extrapreprocessing

for variable in ['COPD', 'ILD', 'NTM']:
    ds[variable][ds[variable] == 'Yes'] = 1
    ds[variable][ds[variable] == 'No'] = 0

ds['sex'][ds['sex'] == 'F'] = 0
ds['sex'][ds['sex'] == 'M'] = 1

ds.ETHNIC = ds.ETHNIC.astype('category').cat.codes
td = ds.surv
ds.surv = ((td / np.timedelta64(1, 'D')).astype(int))

#Missing values
# ds = ds.fillna(ds.mean())
ds = ds.fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [43]:
from lifelines import CoxPHFitter
print(ds.cavities)
data_no_ct = ds.drop(columns=['cavities'])
cph_full = CoxPHFitter()
cph_full.fit(data_no_ct,"surv",event_col="death5")
cph_full.print_summary()

0      0
1      0
2      0
3      1
4      1
5      1
6      1
7      0
8      0
9      2
10     2
11     2
12     1
13     1
14     0
15     0
16     0
17     2
18     2
19     2
20     2
21     2
22     2
23     2
24     2
25     1
26     1
27     1
28     0
29     1
30     1
31     1
32     1
33     2
34     2
35     2
36     1
37     1
38     1
39     1
40     1
41     1
42     1
43     1
44     1
45     1
46     1
47     0
48     0
49     1
50     1
51     1
52     1
53     1
54     1
55     0
56     0
57     0
58     0
59     2
60     2
61     1
62     1
63     1
64     1
65     1
66     1
67     1
68     1
69     2
70     2
71     2
72     2
73     2
74     1
75     1
76     1
77     1
78     1
79     1
80     1
81     1
82     1
83     1
84     1
85     1
86     1
87     1
88     1
89     1
90     1
91     2
92     2
93     2
94     2
95     1
96     2
97     1
98     1
99     2
100    0
101    1
102    1
103    1
104    1
105    1
106    1
107    1
108    1
109    1
110    2
1

Column BR have very low variance when conditioned on death event present or not. This may harm convergence. This could be a form of 'complete separation'. For example, try the following code:

>>> events = df['death5'].astype(bool)
>>> print(df.loc[events, 'BR'].var())
>>> print(df.loc[~events, 'BR'].var())

A very low variance means that the column BR completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.

Newton-Rhaphson convergence completed successfully but norm(delta) is still high, 0.314. This may imply non-unique solutions to the maximum likelihood. Perhaps there is collinearity or complete separation in the dataset?

In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also co

0,1
model,lifelines.CoxPHFitter
duration col,'surv'
event col,'death5'
baseline estimation,breslow
number of observations,233
number of events observed,80
partial log-likelihood,-359.11
time fit was run,2022-09-04 13:47:37 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.39,1.47,0.31,-0.22,1.0,0.8,2.71,0.0,1.24,0.21,2.22
ETHNIC,0.12,1.12,0.07,-0.03,0.26,0.97,1.3,0.0,1.61,0.11,3.21
COPD,-0.35,0.7,0.49,-1.32,0.61,0.27,1.84,0.0,-0.72,0.47,1.09
ILD,1.43,4.18,0.57,0.32,2.54,1.38,12.67,0.0,2.53,0.01,6.44
NTM,0.85,2.33,0.31,0.24,1.46,1.27,4.31,0.0,2.71,0.01,7.23
crp,0.01,1.01,0.0,0.0,0.01,1.0,1.01,0.0,2.96,<0.005,8.33
albumin,-0.02,0.98,0.01,-0.04,-0.0,0.96,1.0,0.0,-2.48,0.01,6.25
BMI,-0.01,0.99,0.01,-0.04,0.02,0.96,1.02,0.0,-0.88,0.38,1.4
fungal_balls,-0.34,0.71,0.22,-0.78,0.1,0.46,1.1,0.0,-1.53,0.13,2.99
FL,-0.69,0.5,1.38,-3.4,2.02,0.03,7.51,0.0,-0.5,0.62,0.7

0,1
Concordance,0.81
Partial AIC,768.22
log-likelihood ratio test,110.08 on 25 df
-log2(p) of ll-ratio test,39.61


In [41]:
pd.set_option('display.max_rows', None)
print(ds.surv)

0      1572
1      1572
2      1917
3      1502
4      1392
5      1236
6       791
7      2785
8      2308
9      1587
10     1587
11     1587
12     4051
13     2756
14     1005
15     1005
16     1005
17     3925
18     3569
19     3217
20     2854
21     2303
22     1714
23     1504
24     1174
25     3977
26     2710
27     2601
28     3098
29     2604
30     2086
31     1217
32     1063
33     1761
34     1383
35     1014
36     3862
37     3106
38     3043
39     2675
40     2581
41     2021
42     1895
43     1736
44     1649
45     1047
46      826
47     1894
48     1233
49     3283
50     3112
51     2993
52     2464
53     2189
54     1800
55     1369
56     2308
57     1909
58     1209
59      643
60       13
61     1654
62     3779
63     3163
64     1847
65     1654
66     1191
67     3127
68     2707
69     4087
70     3757
71     3232
72     1798
73     1041
74     3057
75     2848
76     2643
77     2260
78     2037
79     1870
80     1699
81     1586
82     1394
83  