# Project - ML approach to assist Lung Cancer Detection from clinical data 

#### This research combines three clinical datasets (patient data, nodule count per patient, and nodule size list) and feeds the significant features into rule-based classifier algorithms in order to reduce false positives in the clinical identification of lung cancer.

### The key parts in data analysis:
#### 1. Data preparation: merging, cleaning, formatting
#### 2. ML and prediction
#### 3. Evaluation of predictive model performance

### Import the first data set, clean the header, correct data type and alias

In [None]:
# Core data libraries used throughout the notebook
import pandas as pd
import numpy as np

# Lift the column display limit so every column of a DataFrame is shown
pd.options.display.max_columns = None

In [None]:
# URLs of the three source datasets.
# Two of them are attachments that share the same download prefix.
tcia_attachments = 'https://wiki.cancerimagingarchive.net/download/attachments/3539039/'

# Patient diagnosis data (Excel .xls)
url_pd = (
    tcia_attachments
    + 'tcia-diagnosis-data-2012-04-20.xls'
    + '?version=1&modificationDate=1334930231098&api=v2'
)
# Nodule size list (csv)
url_nsl = 'http://www.via.cornell.edu/lidc/list3.2.csv'
# Nodule counts per patient (Excel .xlsx)
url_ncp = (
    tcia_attachments
    + 'lidc-idri%20nodule%20counts%20%286-23-2015%29.xlsx'
    + '?version=1&modificationDate=1435085651880&api=v2'
)

In [None]:
# Load the first dataset: patient diagnosis (pd)
df_pd = pd.read_excel(url_pd)

# Replace the original column headers with shorter ones (annotations removed).
# Four patient-level columns come first, followed by a
# (diagnosis, diagnosis method) pair of columns for each of nodules 1 to 5.
headers = ['TCIA Patient ID', 'Diagnosis at the Patient Level', 'Diagnosis Method',
           'Primary tumor site for metastatic disease']
for nodule_no in range(1, 6):
    headers.append('Nodule {} Diagnosis at the Nodule Level'.format(nodule_no))
    headers.append('Nodule {} Diagnosis Method at the Nodule Level'.format(nodule_no))
df_pd.columns = headers

In [None]:
# Cast the three coded categorical variables to the 'object' dtype
categorical_columns = [
    'Diagnosis at the Patient Level',
    'Diagnosis Method',
    'Nodule 1 Diagnosis at the Nodule Level',
]
for column in categorical_columns:
    df_pd[column] = df_pd[column].astype('object')

In [None]:
# Replace the numeric codes with their readable labels so the data is easy to
# interpret. Only the columns listed in column_labels are decoded.
diagnosis_labels = {
    0: 'unknown',
    1: 'benign or non-malignant disease',
    2: 'malignant, primary lung cancer',
    3: 'malignant metastatic',
}
method_labels = {
    0: 'unknown',
    1: 'review of radiological images to show 2 years of stable nodule',
    2: 'biopsy',
    3: 'surgical resection',
    4: 'progression or response',
}
column_labels = [
    ('Diagnosis at the Patient Level', diagnosis_labels),
    ('Diagnosis Method', method_labels),
    ('Nodule 1 Diagnosis at the Nodule Level', diagnosis_labels),
]
for column, labels in column_labels:
    # One masked assignment per code, in ascending code order
    for code, label in labels.items():
        df_pd.loc[df_pd[column] == code, column] = label
## Author's note: 156 out of 1018 patients have a diagnosis result

### Create binary Response Variable, named 'Cancer'

In [None]:
# Derive the binary response 'Cancer' from the patient-level diagnosis:
# 'unknown' -> NaN, benign -> 0, either malignant category -> 1.
from numpy import nan
diagnosis_column = 'Diagnosis at the Patient Level'
cancer_by_diagnosis = [
    ('unknown', nan),
    ('benign or non-malignant disease', 0),
    ('malignant, primary lung cancer', 1),
    ('malignant metastatic', 1),
]
for diagnosis, cancer_flag in cancer_by_diagnosis:
    df_pd.loc[df_pd[diagnosis_column] == diagnosis, 'Cancer'] = cancer_flag
# Store the response as a generic object column
df_pd['Cancer'] = df_pd['Cancer'].astype('object')
df_pd.shape
# Author's note: final pd dataset has 157 rows and 15 columns

### Connecting 2nd dataset ncp, left join, and keep only the records that have a scanning result

In [None]:
# Load the 2nd dataset, "nodule count by patient", into df_ncp_origin and
# discard the two unnamed columns, which are not used.
unused_columns = ['Unnamed: 4', 'Unnamed: 5']
df_ncp_origin = pd.read_excel(url_ncp).drop(columns=unused_columns)
# Remove rows that have no patient ID (per the original note, this drops the
# last row, which is a sum). 'TCIA Patent ID' is the header as spelled in the
# spreadsheet and must be kept verbatim.
id_column = 'TCIA Patent ID'
df_ncp = df_ncp_origin.dropna(subset=[id_column])
df_ncp.columns
# Author's note: shape is (1018,4)

In [None]:
## Left-join the patient diagnosis table df_pd (157 * 15) onto the nodule
## counts df_ncp (1018 * 4); every df_ncp row is kept. Result: df_ncp_pd
df_ncp_pd = df_ncp.merge(
    df_pd,
    how='left',
    left_on='TCIA Patent ID',
    right_on='TCIA Patient ID',
)
df_ncp_pd.shape
## Author's note: this gives a dataframe of 1018 * 19

In [None]:
# Keep only the rows whose 'Cancer' value is present (not missing)
df_ncp_pd = df_ncp_pd.dropna(axis='index', subset=['Cancer'])
df_ncp_pd.shape
# Author's note: the result has 19 columns but only 131 rows

### Connecting 3rd dataset df_nsl (nodule size list), clean and merge

In [None]:
# Load the 3rd dataset: the nodule size list
df_nsl_full = pd.read_csv(url_nsl)
# Keep only the columns that are used, saved into dataframe df_nsl.
# An explicit copy makes df_nsl an independent frame: a later cell assigns to
# df_nsl['case'], and writing into a bare column selection of df_nsl_full
# triggers pandas' SettingWithCopyWarning.
df_nsl = df_nsl_full[['case', 'scan', 'roi', 'volume']].copy()
df_nsl.shape

In [None]:
# Build a common four-character string key on both frames so they can be
# merged on it.
#
# df_ncp_pd: 'Patient_case' is the last four characters of the patient ID
# ('TCIA Patent ID' is the header as spelled in the source spreadsheet).
# assign() returns a new frame, so no filtered selection is written into.
df_ncp_pd = df_ncp_pd.assign(
    Patient_case=df_ncp_pd['TCIA Patent ID'].str[-4:].apply(str)
)
# df_nsl: 'case' is converted to string and left-padded with zeros to four
# characters. read_csv parses a purely numeric column as integers, so without
# the padding a case would read '1' and could never equal a 'Patient_case'
# such as '0001'. Values that already have four characters are unchanged.
df_nsl = df_nsl.assign(case=df_nsl['case'].apply(str).str.zfill(4))


In [None]:
## Take a look at the two data frame

In [None]:
# Right-join: keep every row of df_ncp_pd (131 * 19) and attach the matching
# rows of df_nsl (nodule size list, 2635 * 4) where 'case' == 'Patient_case'
df_ncp_pd_nsl = df_nsl.merge(
    df_ncp_pd,
    how='right',
    left_on='case',
    right_on='Patient_case',
)
df_ncp_pd_nsl.shape
# Author's note: shape is (2635, 24)

In [None]:
# Keep only the rows whose 'Cancer' value is present (not missing)
df_ncp_pd_nsl = df_ncp_pd_nsl.dropna(axis='index', subset=['Cancer'])
df_ncp_pd_nsl.shape
# NOTE(review): the original shape note here ("19 columns but only 131 rows")
# repeats the note from the earlier filter on df_ncp_pd and may be stale for
# this merged frame -- confirm against the displayed shape.

In [None]:
# Keep, for every case, the row(s) holding that case's largest nodule volume,
# plus every row that has no nodule volume at all. Rows tied for a case's
# maximum volume are all kept.
# The aggregation is named as the string 'max': passing the builtin `max` to
# transform() is deprecated in recent pandas releases.
largest_volume = (
    df_ncp_pd_nsl['volume']
    == df_ncp_pd_nsl.groupby(['case'])['volume'].transform('max')
)
no_nodule_size = df_ncp_pd_nsl['volume'].isnull()
# .copy() makes df an independent frame, so later changes to df (e.g. filling
# missing volumes) act on df itself rather than on a selection of
# df_ncp_pd_nsl.
df = df_ncp_pd_nsl.loc[largest_volume | no_nodule_size].copy()


In [None]:
## The volume column contains NaN; impute the missing values with the column mean.
## The filled column is assigned back to df. Calling fillna(inplace=True) on
## df['volume'] operates on an intermediate column object, which is not
## guaranteed to update df (and never does under pandas Copy-on-Write).
## NOTE(review): the mean is taken over all rows of df, i.e. before any
## train/test split -- confirm this is intended.
df = df.assign(volume=df['volume'].fillna(df['volume'].mean()))


## Dataset df_ncp_pd_nsl is finally prepared, saved into "df" for modeling 

In [None]:
## Somehow the 'case' column can be dropped now


## Export the data to spreadsheet, for examination purpose

In [None]:
# Export to a local csv file in the current user's 'Downloads' folder.
# The folder is derived from the home directory instead of a hard-coded
# absolute path, so the cell also runs under other user accounts.
import os
export_path = os.path.join(os.path.expanduser('~'), 'Downloads', '20181109_1039.csv')
df.to_csv(export_path)

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
# Predictors: the three nodule-count columns plus the nodule volume.
# The column names are taken verbatim from the source spreadsheet (note the
# trailing space in the first one).
feature_columns = [
    'Total Number of Nodules* ',
    'Number of Nodules >=3mm**',
    'Number of Nodules <3mm***',
    'volume',
]
X = df[feature_columns]
## The response cannot stay an object column, so cast it to int
y = df['Cancer'].astype('int')
# No sizes given: the default split is 75% train / 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Initiating Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

# Fit a decision tree limited to depth 3 on the training split.
# random_state is fixed (as for the random forest and gradient boosting
# models in this notebook) so that repeated runs build the same tree.
tree2 = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
print(tree2)
# Mean accuracy on the held-out test split
tree2.score(X_test, y_test)

## Visualizing Decision Tree

In [None]:
# Names of the training features, as a plain list
lpy = list(X_train.columns)

In [None]:
import pydot_ng as pydot
from IPython.display import IFrame
# sklearn.externals.six has been removed from scikit-learn, so importing
# StringIO from it fails; the standard library provides the same name.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Write the fitted tree to a Graphviz DOT file, labelled with the feature names
with open("dt.dot","w") as dot_data:
    export_graphviz(tree2, out_file=dot_data, filled=True,
                feature_names = lpy,label = 'all')
# Render the DOT file to a PNG image and show it in the notebook
pydot.graph_from_dot_file("dt.dot").write_png("dt.png")
IFrame("dt.png", width = 700, height = 300)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 10 trees, each split considering 3 features, with a fixed seed
forest = RandomForestClassifier(n_estimators=10, max_features=3, random_state=0)
forest.fit(X_train, y_train)
# Mean accuracy on the held-out test split
forest.score(X_test, y_test)

In [None]:
## Plotting Random Forest


## Gradient Boosted Decision Tree (GBDT)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Split again with the same seed as before
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Hyper-parameters of the gradient boosted model
gbdt_settings = {'learning_rate': .1, 'max_depth': 4, 'random_state': 0}
GBDT = GradientBoostingClassifier(**gbdt_settings)
gbdt = GBDT.fit(X_train, y_train)
# Mean accuracy on the held-out test split
gbdt.score(X_test, y_test)

In [None]:
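# plotting GBDT (disabled)
# NOTE(review): export_graphviz is documented for a single decision tree; a
# fitted gradient boosting ensemble would presumably have to be plotted one
# member tree at a time -- confirm before re-enabling this cell.
#with open("gbdt.dot","w") as dot_data:
#    export_graphviz(gbdt, out_file=dot_data, filled=True, 
#                feature_names = lpy,label = 'all')
#pydot.graph_from_dot_file("gbdt.dot").write_png("gbdt.png")
#IFrame("gbdt.png", width = 700, height = 300)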

## Confusion Matrices

In [None]:
## Confusion matrix for the Gradient Boosted Decision Tree
from sklearn.metrics import confusion_matrix
# Predicted classes for the test split
gbdt_predicted = gbdt.predict(X_test)
# Compare the true test labels with the predictions
confusion_gbdt = confusion_matrix(y_true=y_test, y_pred=gbdt_predicted)
print('gradient boost decision tree classifier',  confusion_gbdt)

## Evaluation Metrics for Gradient Boost Decision Tree

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)  Also known as sensitivity, or True Positive Rate
# F1 = 2 * Precision * Recall / (Precision + Recall)
# Each entry pairs an output template with the metric that fills it in;
# the lines are printed in this order.
report_lines = [
    ('Accuracy: {:.2f}', accuracy_score),
    ('Precision, which matters more: {:.2f}', precision_score),
    ('Recall: {:.2f}', recall_score),
    ('F1: {:.2f}', f1_score),
]
for template, metric in report_lines:
    print(template.format(metric(y_test, gbdt_predicted)))

In [None]:
## Plotting the ROC curve of the gradient boosted model
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Continuous decision scores for the test split; the ROC curve is traced by
# sweeping a threshold over them
y_score_gbdt = gbdt.decision_function(X_test)
fpr_gbdt, tpr_gbdt, _ = roc_curve(y_test, y_score_gbdt)
# Area under the ROC curve
roc_auc_gbdt = auc(fpr_gbdt, tpr_gbdt)
plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr_gbdt, tpr_gbdt, lw=3, label='GBDT ROC curve (area = {:0.2f})'.format(roc_auc_gbdt))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
# Dashed diagonal: reference line where TPR equals FPR
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
# Use the Axes that was just drawn on. plt.axes() with no arguments adds a
# new Axes to the figure in current Matplotlib instead of returning the
# existing one, so the aspect ratio would be set on an empty Axes.
plt.gca().set_aspect('equal')
plt.show()