In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Cleaning Metadata Features

In [2]:
data_dir = "C:\\Users\\preet\\Documents\\RadFusion\\"

In [3]:
demo_df = pd.read_csv(data_dir + "Demographics.csv")
print(demo_df.shape)
demo_df.head(5)
demo_df = demo_df[~demo_df["idx"].duplicated(keep=False)]

demo_df['Gender'] = demo_df[['Female', 'Male']].idxmax(axis=1).eq('Male').astype('int8')                
index = demo_df["idx"]                  

eth_cols = ['Asian', 'Black', 'Native American', 'Other',
            'Pacific Islander', 'Unknown_race', 'White']

demo_df['Ethnicity'] = demo_df[eth_cols].values.argmax(axis=1).astype('int8') 

eth_cols = ['SMOKER_N', 'SMOKER_Y']
demo_df['Smoking'] = demo_df[eth_cols].values.argmax(axis=1).astype('int8')  

eth_mapping = {i: col for i, col in enumerate(eth_cols)}

demo_df.head()
demo_df = demo_df.drop(["Female", 'Male', 'Asian', 'Black', 'Native American', 'Other', 'Pacific Islander', 'Unknown_race', 'White', 'SMOKER_N', 'SMOKER_Y'], axis=1)


demo_df['current_age_yrs'] = (demo_df['current_age_yrs'] // 10).astype('int8') + 1 #Binning age into values from 1 to 10

demo_df.head()


(1837, 14)


Unnamed: 0,current_age_yrs,idx,split,Gender,Ethnicity,Smoking
0,9,890,train,0,6,0
1,7,1879,train,1,6,1
2,10,1783,train,1,6,1
3,10,3896,test,0,6,0
4,8,1193,train,0,6,0


In [4]:
def clean_and_clip_df(df, min_val, max_val):
    """
    Function to clip dataframe values to a min and max value. 
    """
    df = df[~df["idx"].duplicated(keep="first")]
    df.columns = df.columns.str.strip()
    df.drop(columns=['Unnamed: 0',"split"], inplace=True)
    cols = df.columns[:-1] 
    df[cols] = df[cols].clip(lower=min_val, upper=max_val) 
    return df

In [5]:
#Calling the clean and clip method on the out medications, in medications, disease codes and vitals dataframes

out_med_df = pd.read_csv(data_dir+'OUT_MED.csv')
out_med_df = clean_and_clip_df(out_med_df,min_val = 0,max_val=9)

in_med_df = pd.read_csv(data_dir+'INP_MED.csv')
in_med_df = clean_and_clip_df(in_med_df, min_val = 0, max_val=5)

icd_df = pd.read_csv(data_dir+'ICD.csv')
icd_df = clean_and_clip_df(icd_df,min_val = 0, max_val=5)

vitals_df  = pd.read_csv(data_dir + "Vitals.csv")
vitals_df = clean_and_clip_df(vitals_df,min_val=-100, max_val=99) 

#Normalizing valuesin the vitals dataframe so they all fall within a reasonable range
cols = vitals_df.columns[:-1] # skip idx
for col in cols:
    vitals_df[col] = ((vitals_df[col]+100) // 50).astype('int8') 


print(vitals_df.isna().sum().sum())
vitals_df.head()

0


Unnamed: 0,SBP,DBP,height_inch,weight_kg,bmi,tempf,respirations,spO2,pulse,idx
0,2,2,2,2,2,2,2,2,2,84
1,2,2,2,2,2,2,2,2,2,2248
2,2,1,1,1,1,2,1,2,2,2271
3,2,2,1,1,1,2,1,2,2,1691
4,2,2,2,2,2,2,2,2,2,3286


In [6]:
#Merging the out medications, in medications, disease codes, and vitals dataframes

merged = (
    demo_df
    .merge(out_med_df,on="idx", how="inner")
    .merge(in_med_df, on="idx", how="inner")
    .merge(icd_df,on="idx", how="inner")
    .merge(vitals_df, on="idx", how="inner")
)

#Isolating the column with the split (train test val) and the unnamed columns
split_cols = [col for col in merged.columns if col.startswith("split")]
un_cols = [col for col in merged.columns if col.startswith("Unnamed")]

for col in split_cols[1:]:
    assert (merged[split_cols[0]] == merged[col]).all(), f"{col} differs!"

merged["split"] = merged[split_cols[0]]  #adding split column back to merged dataframe
merged = merged.drop(columns=[col for col in split_cols if col != "split"]) 
merged = merged.drop(columns=un_cols)
merged = merged.drop(columns=['current_age_yrs','Ethnicity'])


In [7]:
SPLIT_COL='split'
#These are the cleaned labels from the Penet codebase 
df_ext = pd.read_csv("C:\\Users\\preet\\Documents\\RadFusion\\ExtendedLabelsFiltered.csv")

# Rremove all rows with subsegmental PE
df_filt = merged[merged["idx"].isin(df_ext["patient"])] #df ext contains patients only with central and segmental PE (and no PE)
df_filt = df_filt.rename(columns={'idx': 'patient'})
df_filt.head() 


# Logic to insert labels with ones from df_ext
labels_df_ext = df_ext[["label", "patient"]]
df= df_filt.merge(labels_df_ext, on="patient", how = "inner")


In [8]:
# Function to sort df based on correlation only in the training split
def sort_and_clean_df(df, num_feats):
    df = df.loc[:, df.nunique(dropna=False) > 1]
    print(df.shape)
    correlation = df.drop(columns=['patient','label','split']).apply(lambda col: col.corr(df['label']))
    sorted_by_strength = correlation.abs().sort_values(ascending=False)
    cols_in_order = sorted_by_strength.index.tolist()
    cols_to_keep = cols_in_order[:num_feats]
    cols_to_skip = cols_in_order[num_feats:]
    df = df.drop(columns = cols_to_skip)
    df = df.reindex(columns=cols_to_keep + [c for c in df.columns if c not in cols_to_keep]) 
    print(df.shape)
    return df

In [9]:
NUM_FEATS = 16 #Iterated through values, settled on 16
SORT = False #Flag for whether to proceed with sorting 
if SORT:
    df = sort_and_clean_df(df,NUM_FEATS) #This line will be commented when running simulations without sorting 

#Prepare for training
df_train = df[df["split"] == 'train']
df_val = df[df["split"] == 'val']
df_test = df[df["split"] == 'test']

df_train.drop(columns=["split"],inplace=True)
df_val.drop(columns=["split"],inplace=True)
df_test.drop(columns=["split"],inplace=True)

df_train.head()
y_test = df_test["label"]
X_test = df_test.drop(columns =   ["label","patient"])

y_train = df_train["label"]
X_train= df_train.drop(columns =   ["label","patient"])

y_val = df_val["label"]
X_val = df_val.drop(columns =   ["label","patient"])


(1755, 1444)
(1755, 19)


## Models

In [10]:

depth = 10
from sklearn.metrics import roc_auc_score

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

dtree = DecisionTreeClassifier(criterion='gini',max_depth=depth, random_state=42) # Adjust parameters as needed
print("First row, first few cols and their types:")
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test) 
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
probabilities = dtree.predict_proba(X_test)
prob = [proby[1] for proby in probabilities]

if SORT == True:
    dct_dec = {"label": y_test, "prob": prob}
    df1 = pd.DataFrame(dct_dec)
if SORT == False:
    dct_dec2 = {"label": y_test, "prob": prob} 
    df2 = pd.DataFrame(dct_dec2)

roc_auc_score(y_test, prob)

First row, first few cols and their types:
Accuracy: 0.6358024691358025


0.7416044776119404

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# Create and train logistic regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)  # increase max_iter to ensure convergence
logreg.fit(X_train, y_train)

# Predict labels
y_pred = logreg.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Predict probabilities
probabilities = logreg.predict_proba(X_test)
prob = [proby[1] for proby in probabilities]  # Class 0 probability

# Compute AUC
roc_auc = roc_auc_score(y_test, prob)
print(f"ROC AUC: {roc_auc}")
if SORT == True:
    dct_lg = {"label": y_test, "prob": prob}
    df3 = pd.DataFrame(dct_lg)
if SORT == False:
    dct_lg2 = {"label": y_test, "prob": prob} 
    df4 = pd.DataFrame(dct_lg2)

Accuracy: 0.6790123456790124
ROC AUC: 0.7070895522388059


In [11]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay

rf = RandomForestClassifier(n_estimators=256, random_state=42)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
probabilities = rf.predict_proba(X_test)
prob = [proby[1] for proby in probabilities]

if SORT == True:
    dct_rf = {"label": y_test, "prob": prob}
    df5 = pd.DataFrame(dct_rf)
if SORT == False:
    dct_rf2 = {"label": y_test, "prob": prob}  
    df6 = pd.DataFrame(dct_rf2)
roc_auc_score(y_test, prob)

Accuracy: 0.6481481481481481


0.7983742004264391

In [34]:
from sklearn.metrics import (
            accuracy_score, precision_score, recall_score, f1_score,
            roc_auc_score, average_precision_score, balanced_accuracy_score,
            confusion_matrix
        )


def create_metrics(result_df):
    
    # Get true labels and predicted probabilities
    y_true = result_df['label'].values
    y_prob = result_df['prob'].values
    y_pred = (y_prob >= 0.5).astype(int)
    
    # Confusion matrix to get TN, FP, FN, TP
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    # Compute metrics
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Sensitivity": recall_score(y_true, y_pred),  # aka recall
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0.0,
        "Precision": precision_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "AUC-ROC": roc_auc_score(y_true, y_prob),
        "AUC-PR": average_precision_score(y_true, y_prob),
        "Balanced Accuracy": balanced_accuracy_score(y_true, y_pred)
    }
        
    # Add to metrics_df
    metrics_df = pd.DataFrame([metrics])  # single row
    return metrics_df

In [35]:
if SORT == True:
    dec1 = create_metrics(df1)
    log1 = create_metrics(df3)
    rf1 = create_metrics(df5)
else:
    #Set SORT flag to False 
    dec2 = create_metrics(df2)
    log2 = create_metrics(df4)
    rf2 = create_metrics(df6) 

    

In [44]:
merged = pd.DataFrame()
merged["Classifier"] = ["Decision Tree", "Logistic Regression", "Random Forest", "Decision Tree", "Logistic Regression", "Random Forest"]
merged["Num Features"] = [16, 16, 16, 2856, 2856, 2856] 
merged["Sorted"] = [True, True, True, False, False, False] 

merged1 = pd.concat([dec1, log1, rf1, dec2, log2, rf2], axis=0)

merged2 = pd.concat([merged.reset_index(drop=True), merged1.reset_index(drop=True)], axis=1)

merged2.head(6)

Unnamed: 0,Classifier,Num Features,Sorted,Accuracy,Sensitivity,Specificity,Precision,F1 Score,AUC-ROC,AUC-PR,Balanced Accuracy
0,Decision Tree,16,True,0.611111,0.821429,0.567164,0.283951,0.422018,0.744003,0.323445,0.694296
1,Logistic Regression,16,True,0.641975,0.857143,0.597015,0.307692,0.45283,0.786381,0.402304,0.727079
2,Random Forest,16,True,0.648148,0.857143,0.604478,0.311688,0.457143,0.798374,0.435364,0.73081
3,Decision Tree,2856,False,0.623457,0.714286,0.604478,0.273973,0.39604,0.661914,0.245332,0.659382
4,Logistic Regression,2856,False,0.679012,0.714286,0.671642,0.3125,0.434783,0.70709,0.314235,0.692964
5,Random Forest,2856,False,0.660494,0.642857,0.664179,0.285714,0.395604,0.767324,0.377228,0.653518


In [40]:
merged.to_csv("C:\\Users\\preet\\Documents\\penet\\results_sequence\\metadata_final.csv", index=False)