In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load data 

In [3]:
def load(x,y,z,c,v): 
    df_x = pd.read_csv(x)
    df_y = pd.read_csv(y)
    df_z = pd.read_csv(z)
    df_c = pd.read_csv(c)
    df_v = pd.read_csv(v)
    return df_x, df_y, df_z, df_c, df_v


def cleanup(df_x, df_y, df_z, df_c, df_v):
    df_x = df_x[["pid","study_yr"]]
    df_y = df_y[["race","gender","age","scr_res0","scr_res1","scr_res2","pid"]]
    df_z = df_z[["sct_ab_desc","sct_ab_num","study_yr","pid"]]
    df_c = df_c.drop(columns=["visible_days","dataset_version"])
    df_v = df_v[["pid","de_stag","lesionsize","lc_morph","lc_behav","lc_grade","study_yr","lc_order"]]
    return df_x, df_y, df_z, df_c, df_v


def data(df_x, df_y, df_z, df_c, df_v):
    
    df_c[["sct_ab_attn","sct_ab_gwth"]] = df_c[["sct_ab_attn","sct_ab_gwth"]].fillna(9)
    df_c["sct_ab_invg"] = df_c["sct_ab_invg"].fillna(1)
    df_c.dropna(axis = 0, how="any", inplace= True )
    def update(row):
        if pd.isna(row["lesionsize"]):
            if row["de_stag"]<310:
                return 1
            elif row["de_stag"]>=310:
                return 25
            return row["lesionsize"]
        else:
            return row["lesionsize"]



    df_v["lesionsize"] = df_v.apply(update,axis = 1)
    df_v.dropna(axis = 0, how="any", inplace= True )

    return df_x, df_y, df_z, df_c, df_v


def merge(df_x, df_y, df_z, df_c, df_v):
    df_c = df_c.rename(columns={"sct_ab_code":"sct_ab_desc"})
    df = df_y.merge(df_x,on = "pid", how = "outer").merge(df_z, on =["pid","study_yr"],how = "outer").merge(df_c,on =["pid","study_yr","sct_ab_desc"],how = "outer").merge(df_v, on="pid", how = "outer")
    df = df.drop(columns=["sct_ab_num_y","study_yr_y"])
    df[["study_yr_x","sct_ab_desc","sct_ab_num_x","de_stag","lesionsize","lc_morph","lc_behav","lc_grade","lc_order"]] = df[["study_yr_x","sct_ab_desc","sct_ab_num_x","de_stag","lesionsize","lc_morph","lc_behav","lc_grade","lc_order"]].fillna(0)
    df[["sct_ab_preexist", "sct_ab_attn", "sct_ab_gwth", "sct_ab_invg"]] = df[["sct_ab_preexist", "sct_ab_attn", "sct_ab_gwth", "sct_ab_invg"]].fillna(1)
    df = df.dropna()
    
    return df



In [4]:
df_x, df_y, df_z, df_c, df_v = load("nlst_780_screen_idc_20210527.csv","nlst_780_prsn_idc_20210527.csv","nlst_780_ctab_idc_20210527.csv","nlst_780_ctabc_idc_20210527.csv","nlst_780_canc_idc_20210527.csv")
df_x, df_y, df_z, df_c, df_v=cleanup(df_x, df_y, df_z, df_c, df_v)
df_x, df_y, df_z, df_c, df_v=data(df_x, df_y, df_z, df_c, df_v)                                
df = merge(df_x, df_y, df_z, df_c, df_v)

In [5]:
# going to remove all the patients without cancer
df = df.loc[df["de_stag"] != 0]

# Train / Test split 

In [7]:
X = df.drop("de_stag",axis = 1)
y = df["de_stag"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=35)

In [8]:
model = DecisionTreeClassifier(random_state=35)
model.fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)

In [10]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9175745118191161
Classification Report:
               precision    recall  f1-score   support

       110.0       0.96      0.96      0.96      1958
       120.0       0.90      0.90      0.90       493
       210.0       0.85      0.85      0.85       165
       220.0       0.79      0.90      0.84       143
       310.0       0.85      0.85      0.85       350
       320.0       0.87      0.89      0.88       499
       400.0       0.93      0.90      0.91      1176
       888.0       0.50      1.00      0.67         1
       900.0       1.00      1.00      1.00        20
       994.0       0.94      0.94      0.94        16
       999.0       0.90      0.98      0.93        44

    accuracy                           0.92      4865
   macro avg       0.86      0.92      0.88      4865
weighted avg       0.92      0.92      0.92      4865

