### Import the first data set, clean the header, correct data type and alias

In [1]:
# Make pandas display all columns
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [2]:
# URL of the tree datasets
url_pd = 'https://wiki.cancerimagingarchive.net/download/attachments/3539039/tcia-diagnosis-data-2012-04-20.xls?version=1&modificationDate=1334930231098&api=v2'
url_nsl= 'http://www.via.cornell.edu/lidc/list3.2.csv'
url_ncp = 'https://wiki.cancerimagingarchive.net/download/attachments/3539039/lidc-idri%20nodule%20counts%20%286-23-2015%29.xlsx?version=1&modificationDate=1435085651880&api=v2'


In [3]:
df_pd = pd.read_excel(url_pd)


In [4]:
#Remove the notation of column headers to make it shorter
headers = ['TCIA Patient ID', 'Diagnosis at the Patient Level', 'Diagnosis Method', 
           'Primary tumor site for metastatic disease','Nodule 1 Diagnosis at the Nodule Level',
          'Nodule 1 Diagnosis Method at the Nodule Level','Nodule 2 Diagnosis at the Nodule Level',
          'Nodule 2 Diagnosis Method at the Nodule Level','Nodule 3 Diagnosis at the Nodule Level',
          'Nodule 3 Diagnosis Method at the Nodule Level','Nodule 4 Diagnosis at the Nodule Level',
          'Nodule 4 Diagnosis Method at the Nodule Level','Nodule 5 Diagnosis at the Nodule Level',
          'Nodule 5 Diagnosis Method at the Nodule Level']
df_pd.columns = headers


In [5]:
# Making sure we are using the correct data type
df_pd['Diagnosis at the Patient Level'] = df_pd['Diagnosis at the Patient Level'].astype('object')
df_pd['Diagnosis Method'] = df_pd['Diagnosis Method'].astype('object')
df_pd['Nodule 1 Diagnosis at the Nodule Level'] = df_pd['Nodule 1 Diagnosis at the Nodule Level'].astype('object')

In [6]:
# Make sure we are using the alias of the dataset(easly interpretable), not the coding of if
df_pd.loc[df_pd['Diagnosis at the Patient Level'] == 0, 'Diagnosis at the Patient Level'] = 'unknown'
df_pd.loc[df_pd['Diagnosis at the Patient Level'] == 1, 'Diagnosis at the Patient Level'] = 'benign or non-malignant disease'
df_pd.loc[df_pd['Diagnosis at the Patient Level'] == 2, 'Diagnosis at the Patient Level'] = 'malignant, primary lung cancer'
df_pd.loc[df_pd['Diagnosis at the Patient Level'] == 3, 'Diagnosis at the Patient Level'] = 'malignant metastatic'

df_pd.loc[df_pd['Diagnosis Method'] == 0, 'Diagnosis Method'] = 'unknown'
df_pd.loc[df_pd['Diagnosis Method'] == 1, 'Diagnosis Method'] = 'review of radiological images to show 2 years of stable nodule'
df_pd.loc[df_pd['Diagnosis Method'] == 2, 'Diagnosis Method'] = 'biopsy'
df_pd.loc[df_pd['Diagnosis Method'] == 3, 'Diagnosis Method'] = 'surgical resection'
df_pd.loc[df_pd['Diagnosis Method'] == 4, 'Diagnosis Method'] = 'progression or response'


df_pd.loc[df_pd['Nodule 1 Diagnosis at the Nodule Level'] == 0, 'Nodule 1 Diagnosis at the Nodule Level'] = 'unknown'
df_pd.loc[df_pd['Nodule 1 Diagnosis at the Nodule Level'] == 1, 'Nodule 1 Diagnosis at the Nodule Level'] = 'benign or non-malignant disease'
df_pd.loc[df_pd['Nodule 1 Diagnosis at the Nodule Level'] == 2, 'Nodule 1 Diagnosis at the Nodule Level'] = 'malignant, primary lung cancer'
df_pd.loc[df_pd['Nodule 1 Diagnosis at the Nodule Level'] == 3, 'Nodule 1 Diagnosis at the Nodule Level'] = 'malignant metastatic'

## 157 patient has diagnosis out of 1018

### Create Response Variable

In [7]:
#Need to import numpy nan to fill in nan with "unknown" in patient diagnosis
from numpy import nan
df_pd.loc[df_pd['Diagnosis at the Patient Level']=='unknown', 'Cancer'] = nan
df_pd.loc[df_pd['Diagnosis at the Patient Level']=='benign or non-malignant disease', 'Cancer'] = 0
df_pd.loc[df_pd['Diagnosis at the Patient Level']=='malignant, primary lung cancer','Cancer' ] = 1
df_pd.loc[df_pd['Diagnosis at the Patient Level']=='malignant metastatic', 'Cancer'] = 1
df_pd['Cancer'] = df_pd['Cancer'].astype('object')
df_pd.shape

(157, 15)

### Connecting 2nd dataset, merge, and keep records that has scanning result

In [8]:
# Connecting 2nd dataset "nodule count by patient"
df_ncp_origin = pd.read_excel(url_ncp)
# Drop Unuseful Columns
df_ncp_origin = df_ncp_origin.drop(columns=['Unnamed: 4', 'Unnamed: 5'])
df_ncp_origin.shape
## Drop last row----it is a sum, not a patient
df_ncp = df_ncp_origin.dropna(subset = ['TCIA Patent ID'])
# Shape is (1018,4)

In [9]:
## Merge df_ncp (Nodule count per patient) with df_pd (Patient Diagnosis), shape is (1018,19) (15+4 columns combined)
df_ncp_pd = pd.merge(df_ncp, df_pd, how='left', left_on='TCIA Patent ID', right_on='TCIA Patient ID')
df_ncp_pd.shape

(1018, 19)

In [10]:
#keep records that only have a scanning result
df_ncp_pd = df_ncp_pd.dropna(subset = ['Cancer'])
df_ncp_pd.shape
# Generating dataframe with shape (131, 19)

(131, 19)

### Connectin 3rd dataset df_nsl (nodule size list), clean and merge

In [11]:
# connecting data source
df_nsl_full = pd.read_csv(url_nsl)
df_nsl_full.shape
# Selet columns that will only be used in the dataset
df_nsl = df_nsl_full[['case', 'scan','roi', 'volume']]
df_nsl.shape

(2635, 4)

In [12]:
# Create column 'case' to be the trim of Patent ID with the same "four digit" format, and make both string type
df_ncp_pd['case'] = df_ncp_pd['TCIA Patent ID'].str[-4:]
df_ncp_pd['case'] = df_ncp_pd['case'].apply(str)
df_nsl['case'] = df_nsl['case'].apply(str) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [13]:
# Merge df_nsl(nodule size list) with df_ncp_pd 
df_ncp_pd_nsl = pd.merge( df_nsl,df_ncp_pd, how='right', on='case')
df_ncp_pd_nsl


# Shape is (140, 19+4=23)

Unnamed: 0,case,scan,roi,volume,TCIA Patent ID,Total Number of Nodules*,Number of Nodules >=3mm**,Number of Nodules <3mm***,TCIA Patient ID,Diagnosis at the Patient Level,Diagnosis Method,Primary tumor site for metastatic disease,Nodule 1 Diagnosis at the Nodule Level,Nodule 1 Diagnosis Method at the Nodule Level,Nodule 2 Diagnosis at the Nodule Level,Nodule 2 Diagnosis Method at the Nodule Level,Nodule 3 Diagnosis at the Nodule Level,Nodule 3 Diagnosis Method at the Nodule Level,Nodule 4 Diagnosis at the Nodule Level,Nodule 4 Diagnosis Method at the Nodule Level,Nodule 5 Diagnosis at the Nodule Level,Nodule 5 Diagnosis Method at the Nodule Level,Cancer
0,1002,30009.0,1.0,3416.40,LIDC-IDRI-1002,5,4,1,LIDC-IDRI-1002,"malignant, primary lung cancer",biopsy,non-small cell carcinoma,,,,,,,,,,,1
1,1002,30009.0,2.0,3251.56,LIDC-IDRI-1002,5,4,1,LIDC-IDRI-1002,"malignant, primary lung cancer",biopsy,non-small cell carcinoma,,,,,,,,,,,1
2,1002,30009.0,3.0,302.41,LIDC-IDRI-1002,5,4,1,LIDC-IDRI-1002,"malignant, primary lung cancer",biopsy,non-small cell carcinoma,,,,,,,,,,,1
3,1002,30009.0,4.0,2289.63,LIDC-IDRI-1002,5,4,1,LIDC-IDRI-1002,"malignant, primary lung cancer",biopsy,non-small cell carcinoma,,,,,,,,,,,1
4,1004,30946.0,1.0,121.89,LIDC-IDRI-1004,10,4,6,LIDC-IDRI-1004,"malignant, primary lung cancer",surgical resection,LUL NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
5,1004,30946.0,2.0,31.05,LIDC-IDRI-1004,10,4,6,LIDC-IDRI-1004,"malignant, primary lung cancer",surgical resection,LUL NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
6,1004,30946.0,3.0,2273.91,LIDC-IDRI-1004,10,4,6,LIDC-IDRI-1004,"malignant, primary lung cancer",surgical resection,LUL NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
7,1004,30946.0,4.0,80.42,LIDC-IDRI-1004,10,4,6,LIDC-IDRI-1004,"malignant, primary lung cancer",surgical resection,LUL NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
8,1011,2.0,1.0,1911.86,LIDC-IDRI-1011,4,4,0,LIDC-IDRI-1011,malignant metastatic,biopsy,small-cell carcinoma of the tongue,malignant metastatic,2.0,3.0,4.0,,,,,,,1
9,1011,2.0,2.0,3618.77,LIDC-IDRI-1011,4,4,0,LIDC-IDRI-1011,malignant metastatic,biopsy,small-cell carcinoma of the tongue,malignant metastatic,2.0,3.0,4.0,,,,,,,1


In [14]:
# Select rows that has no nodule size or rows has largest nodule size
largest_volume = (df_ncp_pd_nsl['volume'] == df_ncp_pd_nsl.groupby(['case'])['volume'].transform(max)) 
no_nodule_size = df_ncp_pd_nsl['volume'].isnull()
df_ncp_pd_nsl.loc[ largest_volume | no_nodule_size]

Unnamed: 0,case,scan,roi,volume,TCIA Patent ID,Total Number of Nodules*,Number of Nodules >=3mm**,Number of Nodules <3mm***,TCIA Patient ID,Diagnosis at the Patient Level,Diagnosis Method,Primary tumor site for metastatic disease,Nodule 1 Diagnosis at the Nodule Level,Nodule 1 Diagnosis Method at the Nodule Level,Nodule 2 Diagnosis at the Nodule Level,Nodule 2 Diagnosis Method at the Nodule Level,Nodule 3 Diagnosis at the Nodule Level,Nodule 3 Diagnosis Method at the Nodule Level,Nodule 4 Diagnosis at the Nodule Level,Nodule 4 Diagnosis Method at the Nodule Level,Nodule 5 Diagnosis at the Nodule Level,Nodule 5 Diagnosis Method at the Nodule Level,Cancer
0,1002,30009.0,1.0,3416.40,LIDC-IDRI-1002,5,4,1,LIDC-IDRI-1002,"malignant, primary lung cancer",biopsy,non-small cell carcinoma,,,,,,,,,,,1
6,1004,30946.0,3.0,2273.91,LIDC-IDRI-1004,10,4,6,LIDC-IDRI-1004,"malignant, primary lung cancer",surgical resection,LUL NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
10,1011,2.0,3.0,5716.19,LIDC-IDRI-1011,4,4,0,LIDC-IDRI-1011,malignant metastatic,biopsy,small-cell carcinoma of the tongue,malignant metastatic,2.0,3.0,4.0,,,,,,,1
12,0068,,,,LIDC-IDRI-0068,7,6,1,LIDC-IDRI-0068,malignant metastatic,progression or response,Head & Neck Cancer,malignant metastatic,4.0,,,,,,,,,1
13,0071,,,,LIDC-IDRI-0071,4,0,4,LIDC-IDRI-0071,malignant metastatic,review of radiological images to show 2 years ...,Head & Neck,benign or non-malignant disease,1.0,,,,,,,,,1
14,0072,,,,LIDC-IDRI-0072,3,1,2,LIDC-IDRI-0072,"malignant, primary lung cancer",progression or response,Lung Cancer,benign or non-malignant disease,4.0,,,,,,,,,1
15,0088,,,,LIDC-IDRI-0088,6,3,3,LIDC-IDRI-0088,malignant metastatic,unknown,Uterine Cancer,unknown,0.0,,,,,,,,,1
16,0090,,,,LIDC-IDRI-0090,4,1,3,LIDC-IDRI-0090,"malignant, primary lung cancer",surgical resection,NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
17,0091,,,,LIDC-IDRI-0091,5,5,0,LIDC-IDRI-0091,malignant metastatic,progression or response,urothelial carcinoma,malignant metastatic,4.0,,,,,,,,,1
18,0100,,,,LIDC-IDRI-0100,1,0,1,LIDC-IDRI-0100,malignant metastatic,review of radiological images to show 2 years ...,Testis,benign or non-malignant disease,1.0,,,,,,,,,1


## Dataset is finally prepared for modeling (we temporarily use ncp_pd 158*22 )

In [15]:
df = df_ncp_pd_nsl.loc[ largest_volume | no_nodule_size]
#imput the missing nodule size
df['volume'] = df['volume'].fillna(0)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,case,scan,roi,volume,TCIA Patent ID,Total Number of Nodules*,Number of Nodules >=3mm**,Number of Nodules <3mm***,TCIA Patient ID,Diagnosis at the Patient Level,Diagnosis Method,Primary tumor site for metastatic disease,Nodule 1 Diagnosis at the Nodule Level,Nodule 1 Diagnosis Method at the Nodule Level,Nodule 2 Diagnosis at the Nodule Level,Nodule 2 Diagnosis Method at the Nodule Level,Nodule 3 Diagnosis at the Nodule Level,Nodule 3 Diagnosis Method at the Nodule Level,Nodule 4 Diagnosis at the Nodule Level,Nodule 4 Diagnosis Method at the Nodule Level,Nodule 5 Diagnosis at the Nodule Level,Nodule 5 Diagnosis Method at the Nodule Level,Cancer
0,1002,30009.0,1.0,3416.40,LIDC-IDRI-1002,5,4,1,LIDC-IDRI-1002,"malignant, primary lung cancer",biopsy,non-small cell carcinoma,,,,,,,,,,,1
6,1004,30946.0,3.0,2273.91,LIDC-IDRI-1004,10,4,6,LIDC-IDRI-1004,"malignant, primary lung cancer",surgical resection,LUL NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
10,1011,2.0,3.0,5716.19,LIDC-IDRI-1011,4,4,0,LIDC-IDRI-1011,malignant metastatic,biopsy,small-cell carcinoma of the tongue,malignant metastatic,2.0,3.0,4.0,,,,,,,1
12,0068,,,0.00,LIDC-IDRI-0068,7,6,1,LIDC-IDRI-0068,malignant metastatic,progression or response,Head & Neck Cancer,malignant metastatic,4.0,,,,,,,,,1
13,0071,,,0.00,LIDC-IDRI-0071,4,0,4,LIDC-IDRI-0071,malignant metastatic,review of radiological images to show 2 years ...,Head & Neck,benign or non-malignant disease,1.0,,,,,,,,,1
14,0072,,,0.00,LIDC-IDRI-0072,3,1,2,LIDC-IDRI-0072,"malignant, primary lung cancer",progression or response,Lung Cancer,benign or non-malignant disease,4.0,,,,,,,,,1
15,0088,,,0.00,LIDC-IDRI-0088,6,3,3,LIDC-IDRI-0088,malignant metastatic,unknown,Uterine Cancer,unknown,0.0,,,,,,,,,1
16,0090,,,0.00,LIDC-IDRI-0090,4,1,3,LIDC-IDRI-0090,"malignant, primary lung cancer",surgical resection,NSCLC,"malignant, primary lung cancer",3.0,,,,,,,,,1
17,0091,,,0.00,LIDC-IDRI-0091,5,5,0,LIDC-IDRI-0091,malignant metastatic,progression or response,urothelial carcinoma,malignant metastatic,4.0,,,,,,,,,1
18,0100,,,0.00,LIDC-IDRI-0100,1,0,1,LIDC-IDRI-0100,malignant metastatic,review of radiological images to show 2 years ...,Testis,benign or non-malignant disease,1.0,,,,,,,,,1


## Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split
import numpy as np
X = df [['Number of Nodules >=3mm**','Number of Nodules >=3mm**','Number of Nodules <3mm***']]
y = df ['Cancer']
## y can not be object
y=y.astype('int')

# default is 75% / 25% train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



## Initiating Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

tree2 = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train)
print(tree2)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [18]:
tree2.score(X_test, y_test)

0.5757575757575758

## Visualizing Decision Tree

In [19]:
graph = Source(tree.export_graphviz(tree2, out_file=None
   , feature_names=labels, class_names=['0', '1', '2'] 
   , filled = True))

display(SVG(graph.pipe(format='svg')))

NameError: name 'Source' is not defined

Sum of the nodule size, or biggest nodule
Matrix multiplication 有关
Which variable am I using?? 参考jinglu的分析可以照抄，做文献综述


Question to be discussed:


- 1) How to share code with Github and the tool ------------
- 2) How to slice data frame by column--------------
- 3) How to aggregate the data--------------
- 4) What variables are used for the model----------
- 5) How to put categorical data into decision tree-------------
- 6) How to visualize decision tree
- 7) The learning and testing will be 157 but later on 1018?
- 8) How to put hyperlink to data
- 9) precision and reproducability and hurt human?


- I imputed my nodule size