In [1]:
#!conda install -c conda-forge pydotplus -y
#!conda install -c conda-forge python-graphviz -y

### Get Data 

In [3]:
## Load the three NLST CSV exports into pandas DataFrames and report their shapes.
import os
from pathlib import Path

import pandas as pd
import numpy as np

# Single place to configure where the NLST export lives. The default is the
# original location; set the NLST_DIR environment variable to run the notebook
# on another machine without editing every read_csv call.
NLST_DIR = Path(os.environ.get('NLST_DIR', '/Users/dahailiu/Desktop/DM for Cancer /NLST dataset'))

df_pt_full = pd.read_csv(NLST_DIR / 'participant.data.d100517.csv')
df_lc = pd.read_csv(NLST_DIR / 'Lung Cancer' / 'lung_cancer.data.d100517.csv')
df_sctabn_full = pd.read_csv(NLST_DIR / 'Spiral CT Abnormalities' / 'sct_abnormalities.data.d100517.csv')

# Sanity check: (rows, columns) of each table, in load order.
print(df_pt_full.shape)
print(df_lc.shape)
print(df_sctabn_full.shape)

# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

(53452, 324)
(2150, 48)
(177487, 12)


### Dataset #1: patient dataset, df_pt

In [9]:
#### Trim dataset: keep only participants who were CT scanned and who have a
#### cancer / non-cancer record.
## Select only CT scan (rndgroup == 1)
df_pt_ct = df_pt_full.loc[df_pt_full['rndgroup'] == 1]
## Select only people with a record of cancer vs. non-cancer (conflc in {1, 2})
df_pt = df_pt_ct.loc[df_pt_ct['conflc'].isin([1, 2])]
print('Shape of this datset:', df_pt.shape)
## This trims the patient dataset to 6379 rows, from the original 53452 rows.

## Summarize the cancer / non-cancer count in the trimmed patient dataset.
# The Series is named *before* to_frame(): renaming the column afterwards via
# {'conflc': ...} silently does nothing on pandas >= 2.0, where value_counts()
# names its result 'count' instead of reusing the source column name.
conflc_counts = (
    df_pt['conflc']
    .value_counts()
    .rename('Number of patient')
    .rename_axis('conflc')
    .to_frame()
)
conflc_counts
# 1089 with cancer (conflc == 1), and 5290 without cancer (conflc == 2)

Shape of this datset: (6379, 324)


Unnamed: 0_level_0,Number of patient
conflc,Unnamed: 1_level_1
2,5290
1,1089


### Dataset #3 -Sct_abnormality: df_sctabn and its trim

In [11]:
## Keep only the abnormality rows whose sct_ab_desc code is 51, 52, 53 or 62.
selected_desc_codes = [51, 52, 53, 62]
has_selected_desc = df_sctabn_full['sct_ab_desc'].isin(selected_desc_codes)
df_abn_bigsmallnomany = df_sctabn_full.loc[has_selected_desc]

## Keep only the abnormality columns that are useful for this research.
useful_abn_columns = [
    'dataset_version',
    'pid',
    'sct_ab_desc',
    'sct_ab_num',
    'sct_epi_loc',
    'sct_long_dia',
    'sct_slice_num',
    'study_yr',
]
df_abn_useful = df_abn_bigsmallnomany[useful_abn_columns]
df_abn_useful.shape

(81356, 8)

In [13]:
## Per-patient nodule statistics: total size (sum), largest nodule (max) and
## number of measured nodules (count) of sct_long_dia.
df_abn_agg = df_abn_useful[['pid', 'sct_long_dia', 'study_yr']]
abn_grouped_by_pid = df_abn_agg.groupby(['pid'], as_index=False)
## 81,356 records collapse to 19,116 rows: each pid appears exactly once per frame.

# NOTE(review): study_yr is aggregated together with sct_long_dia, so each frame
# also carries a summed / max / counted study_yr column — confirm that is intended.
# Each frame renames its sct_long_dia column to say which statistic it holds.
df_abn_nodulesum = abn_grouped_by_pid.sum().rename(
    columns={'sct_long_dia': 'sct_long_dia_sum'})
df_abn_nodulemax = abn_grouped_by_pid.max().rename(
    columns={'sct_long_dia': 'sct_long_dia_max'})
df_abn_nodulecounts = abn_grouped_by_pid.count().rename(
    columns={'sct_long_dia': 'sct_long_dia_count'})

### Joining #1 df_pt with #3 df_sct_abn

In [14]:
## Attach the per-patient nodule statistics (sum, then max, then count) to the
## patient table, one left join on pid at a time.
## The joins are left joins and every aggregate frame has one row per pid, so all
## 6379 patients with a cancer / non-cancer record are kept; patients without a
## matching abnormality row (sct_ab_desc in 51, 52, 53, 62) get NaN instead.
df_pt_abn_sum = df_pt.merge(df_abn_nodulesum, how='left', on='pid')
df_pt_abn_sum_max = df_pt_abn_sum.merge(df_abn_nodulemax, how='left', on='pid')
df_pt_abn_sum_max_counts = df_pt_abn_sum_max.merge(df_abn_nodulecounts, how='left', on='pid')
df_pt_abn_sum_max_counts.head()

Unnamed: 0,cen,dataset_version,elig,ineligible,pid,rndgroup,study,age,educat,ethnic,gender,height,marital,race,weight,age_quit,cigar,cigsmok,pipe,pkyr,smokeage,smokeday,smokelive,smokework,smokeyr,scr_days0,scr_days1,scr_days2,scr_group,scr_iso0,scr_iso1,scr_iso2,scr_lat0,scr_lat1,scr_lat2,scr_res0,scr_res1,scr_res2,sct_image_has,sct_image_years,biop0,biop1,biop2,bioplc,invas0,invas1,invas2,invaslc,medcomp0,medcomp1,medcomp2,medcomplc,mra_stat0,mra_stat1,mra_stat2,no_proc_reas0,no_proc_reas1,no_proc_reas2,proc0,proc1,proc2,proclc,can_scr,canc_free_days,canc_rpt_link,canc_rpt_source,cancyr,candx_days,conflc,de_grade,de_stag,de_stag_7thed,de_type,lesionsize,loccar,loclhil,loclin,locllow,loclmsb,loclup,locmed,locoth,locrhil,locrlow,locrmid,locrmsb,locrup,locunk,treatlc,contactstatus,fup_days,wdlost,dcfdeathlc,dcficd,death_days,deathcutoff,deathstat,finaldeathlc,hasdcf,ndicd,evp_revr,evpcert,evpdeath,evpdirect,evpincomplete,evpsel,evpsent,resasbe,resbaki,resbutc,reschem,rescoal,rescott,resfarm,resfire,resflou,resfoun,reshard,respain,ressand,resweld,wrkasbe,wrkbaki,wrkbutc,wrkchem,wrkcoal,wrkcott,wrkfarm,wrkfire,wrkflou,wrkfoun,wrkhard,wrkpain,wrksand,wrkweld,yrsasbe,yrsbaki,yrsbutc,yrschem,yrscoal,yrscott,yrsfarm,yrsfire,yrsflou,yrsfoun,yrshard,yrspain,yrssand,yrsweld,ageadas,ageasbe,agebron,agechas,agechro,agecopd,agediab,ageemph,agefibr,agehear,agehype,agepneu,agesarc,agesili,agestro,agetube,diagadas,diagasbe,diagbron,diagchas,diagchro,diagcopd,diagdiab,diagemph,diagfibr,diaghear,diaghype,diagpneu,diagsarc,diagsili,diagstro,diagtube,ageblad,agebrea,agecerv,agecolo,ageesop,agekidn,agelary,agelung,agenasa,ageoral,agepanc,agephar,agestom,agethyr,agetran,cancblad,cancbrea,canccerv,canccolo,cancesop,canckidn,canclary,canclung,cancnasa,cancoral,cancpanc,cancphar,cancstom,cancthyr,canctran,fambrother,famchild,famfather,fammother,famsister,acrin_alc_curr,acrin_alc_ever,acrin_drink24hr,acrin_drinknum_curr,acrin_drinknum_form,acrin_drinkyrs_curr,acrin_drinkyrs_form,acrin_lastdr
ink,lss_alcohol_freq,lss_alcohol_num,confirmed_candxdays1,confirmed_candxdays2,confirmed_candxdays3,confirmed_candxdays4,confirmed_conforder1,confirmed_conforder2,confirmed_conforder3,confirmed_conforder4,confirmed_icd_behav1,confirmed_icd_behav2,confirmed_icd_behav3,confirmed_icd_behav4,confirmed_icd_grade1,confirmed_icd_grade2,confirmed_icd_grade3,confirmed_icd_grade4,confirmed_icd_morph1,confirmed_icd_morph2,confirmed_icd_morph3,confirmed_icd_morph4,confirmed_icd_topog1,confirmed_icd_topog2,confirmed_icd_topog3,confirmed_icd_topog4,confirmed_seer1,confirmed_seer2,confirmed_seer3,confirmed_seer4,confirmed_seercat1,confirmed_seercat2,confirmed_seercat3,confirmed_seercat4,num_confirmed,anyscr_has_nodule,last_progfree_days,prog_days_1st,prog_days_2nd,prog_days_3rd,prog_days_4th,prog_days_5th,progressed_ever,progression_num,progsite_adrenal_1st,progsite_adrenal_days,progsite_adrenal_ever,progsite_adrenal_num,progsite_bone_1st,progsite_bone_days,progsite_bone_ever,progsite_bone_num,progsite_brain_1st,progsite_brain_days,progsite_brain_ever,progsite_brain_num,progsite_liver_1st,progsite_liver_days,progsite_liver_ever,progsite_liver_num,progsite_lymph_n1_1st,progsite_lymph_n1_days,progsite_lymph_n1_ever,progsite_lymph_n1_num,progsite_lymph_n2_1st,progsite_lymph_n2_days,progsite_lymph_n2_ever,progsite_lymph_n2_num,progsite_lymph_n3_1st,progsite_lymph_n3_days,progsite_lymph_n3_ever,progsite_lymph_n3_num,progsite_mediastinum_1st,progsite_mediastinum_days,progsite_mediastinum_ever,progsite_mediastinum_num,progsite_orig_lung_1st,progsite_orig_lung_days,progsite_orig_lung_ever,progsite_orig_lung_num,progsite_other_1st,progsite_other_days,progsite_other_ever,progsite_other_lung_1st,progsite_other_lung_days,progsite_other_lung_ever,progsite_other_lung_num,progsite_other_num,progsite_pleura_1st,progsite_pleura_days,progsite_pleura_ever,progsite_pleura_num,progsite_skin_1st,progsite_skin_days,progsite_skin_ever,progsite_skin_num,progsite_unk_1st,progsite_unk_days,progsite_unk_ever
,progsite_unk_num,sct_long_dia_sum,study_yr_x,sct_long_dia_max,study_yr_y,sct_long_dia_count,study_yr
0,AF,2011.02.03/10.05.17,2,,100004,1,1,60,5,2,1,70.0,2.0,1,205.0,45.0,0.0,0,1.0,34.0,22.0,40,0.0,1.0,17,8.0,452.0,743.0,2,4,4,2,0,0,0,4,5,2,1.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0,2,1,0,0,0,0,0,0,2688,0,5,,,2,,,,,,,,,,,,,,,,,,,,,1,2688,0,,,,0,0,,0,,,,,,,,,0.0,,,,,,0.0,0.0,0.0,,,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,5.0,,,,,,5.0,2.0,2.0,,,,,5.0,,,,,,,,,,,55.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.0,3.0,4.0,2.0,2.0,3.0
1,AV,2011.02.03/10.05.17,2,,100012,1,1,61,7,2,2,67.0,2.0,1,142.0,,1.0,1,1.0,37.0,22.0,20,1.0,1.0,37,16.0,363.0,,1,4,4,13,0,0,0,4,6,13,1.0,2.0,0,1,0,1,0,1,0,1,0,0,0,0,2,1,0,0,0,0,1,1,0,1,1,336,1,5,1.0,454.0,1,3.0,3.0,3.0,8140.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,2451,0,,,,0,0,,0,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,5.0,2.0,454.0,1548.0,,,1.0,2.0,,,3.0,3.0,,,2.0,2.0,,,8140.0,8500.0,,,C34.1,C50.8,,,22030.0,26000.0,,,13.0,17.0,,,2,1.0,1498.0,1498.0,,,,,1.0,1.0,0.0,,0.0,0.0,1.0,1498.0,1.0,1.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,,,0.0,0.0,,,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0,,0.0,0.0,23.0,1.0,15.0,1.0,2.0,2.0
2,AF,2011.02.03/10.05.17,2,,100019,1,1,61,5,2,1,65.0,2.0,1,144.0,58.0,0.0,0,0.0,78.0,15.0,40,0.0,1.0,39,0.0,379.0,717.0,2,2,4,2,0,0,0,2,6,2,1.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,2579,0,5,,,2,,,,,,,,,,,,,,,,,,,,,1,2579,0,,,,0,0,,0,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,60.0,,60.0,50.0,20.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,2.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.0,2.0,14.0,1.0,1.0,2.0
3,BD,2011.02.03/10.05.17,2,,100026,1,1,57,4,2,1,71.0,2.0,1,252.0,57.0,0.0,0,0.0,61.5,16.0,30,1.0,1.0,41,31.0,360.0,724.0,2,4,4,4,0,0,0,4,5,2,1.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0,1,1,0,0,0,0,0,0,2585,0,5,,,2,,,,,,,,,,,,,,,,,,,,,1,2585,0,,,,0,0,,0,,,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,,,,36.0,,,,,,,,,,,,,,3.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,2.0,3.0,2201.0,,,,1.0,,,,3.0,,,,2.0,,,,8140.0,,,,C61.9,,,,28010.0,,,,19.0,,,,1,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,17.0,11.0,5.0,2.0,4.0,10.0
4,BD,2011.02.03/10.05.17,2,,100035,1,1,55,4,2,2,70.0,2.0,1,154.0,,0.0,1,0.0,38.0,17.0,20,1.0,1.0,38,35.0,406.0,782.0,2,4,4,4,0,0,0,4,5,2,1.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,2602,0,5,,,2,,,,,,,,,,,,,,,,,,,,,1,2602,0,,,,0,0,,0,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15.0,6.0,5.0,2.0,3.0,5.0


### Assembling Table 1:

In [25]:
## Table 1 for the first model: built from the patient (pt) and CT abnormality
## data only — the lung cancer (lc) table is not used here.
## Column choice follows the literature review of Jinglu's code.
## TODO: did Jinglu use max, sum or count? Which COPD variable is important?
table1_columns = [
    'pid', 'age', 'gender', 'smokelive', 'race', 'pkyr', 'smokework',
    'famfather', 'fammother', 'anyscr_has_nodule', 'conflc',
    'sct_long_dia_sum', 'sct_long_dia_max', 'sct_long_dia_count',
    'study_yr', 'diagcopd',
]
df_pt_abn = df_pt_abn_sum_max_counts[table1_columns]

print('The shape of the final dataset is: ', df_pt_abn.shape)

# Missing-value count per selected column.
missing_per_column = df_pt_abn.isna().sum()
print('How many missing values in each columns?', '\n', missing_per_column)

The shape of the final dataset is:  (6379, 16)
How many missing values in each columns? 
 pid                     0
age                     0
gender                  0
smokelive              34
race                    0
pkyr                    0
smokework              58
famfather             160
fammother             139
anyscr_has_nodule      15
conflc                  0
sct_long_dia_sum      171
sct_long_dia_max      411
sct_long_dia_count    171
study_yr              171
diagcopd               10
dtype: int64


### Rows containing missing values are simply dropped here — this is almost certainly not the right treatment (TODO: revisit)

In [26]:
# Drop every row that has a missing value in any Table 1 column.
# .copy() makes the result an independent frame, so later column assignments
# do not raise SettingWithCopyWarning against the parent slice.
n_rows_before_dropna = df_pt_abn.shape[0]
df_pt_abn = df_pt_abn.dropna().copy()
# The original bare `df_pt_abn.shape` was never displayed (not the last
# expression of the cell); report the effect of the drop explicitly instead.
print('Rows kept after dropna:', df_pt_abn.shape[0], 'of', n_rows_before_dropna)

In [30]:
## Correct the data types for modeling.
# A single astype() mapping returns a new frame in one step. The previous
# column-by-column assignments wrote into a frame derived from a slice, which
# triggers SettingWithCopyWarning, and repeated the same pattern sixteen times.
table1_dtypes = {
    # categorical codes
    'gender': 'object',
    'smokelive': 'object',
    'race': 'object',
    'smokework': 'object',
    'famfather': 'object',
    'fammother': 'object',
    'anyscr_has_nodule': 'object',
    'study_yr': 'object',
    'diagcopd': 'object',
    # target
    'conflc': 'int',
    # continuous measurements
    'pkyr': 'float32',
    'sct_long_dia_sum': 'float32',
    'sct_long_dia_max': 'float32',
    'sct_long_dia_count': 'float32',
    # integer identifiers / counts
    'pid': 'int32',
    'age': 'int32',
}
df_pt_abn = df_pt_abn.astype(table1_dtypes)
df_pt_abn.dtypes

pid                     int32
age                     int32
gender                 object
smokelive              object
race                   object
pkyr                  float32
smokework              object
famfather              object
fammother              object
anyscr_has_nodule      object
conflc                  int64
sct_long_dia_sum      float32
sct_long_dia_max      float32
sct_long_dia_count    float32
study_yr               object
diagcopd               object
dtype: object

### Data Examination

In [29]:
# Print the value distribution of each listed column, followed by a blank
# separator line.
a = [
    'gender', 'smokelive', 'race', 'smokework',
    'famfather', 'fammother', 'anyscr_has_nodule', 'conflc',
    'sct_long_dia_max', 'sct_long_dia_count', 'study_yr', 'diagcopd',
]
for column in a:
    column_distribution = df_pt_abn[column].value_counts()
    print(column_distribution, '                              ', sep='\n')

1    3461
2    2300
Name: gender, dtype: int64
                              
1.0    5050
0.0     711
Name: smokelive, dtype: int64
                              
1     5306
2      182
3      149
6       78
5       24
4       15
7        4
99       1
98       1
96       1
Name: race, dtype: int64
                              
1.0    4880
0.0     881
Name: smokework, dtype: int64
                              
0.0    5170
1.0     591
Name: famfather, dtype: int64
                              
0.0    5422
1.0     339
Name: fammother, dtype: int64
                              
1.0    5761
Name: anyscr_has_nodule, dtype: int64
                              
2    4960
1     801
Name: conflc, dtype: int64
                              
5.0      1079
6.0       855
4.0       722
7.0       648
8.0       482
10.0      305
9.0       303
11.0      183
12.0      163
13.0      136
15.0      108
14.0       96
17.0       79
16.0       75
18.0       66
20.0       53
19.0       46
24.0       30
28.0 

In [28]:
# Destination for an optional CSV snapshot of df_pt_abn.
# NOTE(review): absolute, machine-specific path — adjust before enabling the export.
export_path = '/Users/dahailiu/Downloads/20181127_1226.csv'
# Export is disabled; uncomment the next line to write the file.
#df_pt_abn.to_csv(export_path)

## Machine Learning starts here:

In [31]:
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 

### (Here is how Jinglu measured 3 times of feature importance, which will inform feature selection)

In [32]:
#Attribute usage:
# 100.00% RaceCategory #100.00% Ethnicity
#100.00% PackYears
#99.99% Age
#99.51% FamilyHxFather #99.34% Sex
#99.25% FamilyHxMother #98.57% SmokingNowCategory


# 100.00% Ethnicity #100.00% PackYears
#99.71% Age
#97.78% RaceCategory 
#84.81% FamilyHxMother 
#80.36% SmokingNowCategory 
#74.82% SecondSmokeAtHome 
#72.75% FamilyHxFather #64.44% Sex

#MeanDecreaseGini
#Age 1335.7826
#Sex 201.7961
#PackYears 1963.1295
#AbnormalCTdiametersize 916.6168
#AbnormalCTnumberofsuspiciousmasses 703.6684
#AbnormalCTtype   0.0000
#RaceCategory 293.6279
#EthnicityCategory 104.7826
#FamilyHxFather 186.8981
#FamilyHxMother 169.8810
#SecondSmokeAtHome 219.8933
#SecondSmokeAtWork 167.0132
#SmokingNowCategory 204.0901

In [33]:
## Train-Test Split
from sklearn.model_selection import train_test_split

# drop=True keeps this cell re-runnable: a plain reset_index() inserts the old
# index as a new column ('index', then 'level_0') on each run and raises
# ValueError on the third run.
df_pt_abn = df_pt_abn.reset_index(drop=True)

# Predictors: everything in Table 1 except pid, the target conflc, and study_yr.
feature_columns = [
    'age', 'gender', 'smokelive', 'race', 'pkyr', 'smokework',
    'famfather', 'fammother', 'anyscr_has_nodule', 'sct_long_dia_sum',
    'sct_long_dia_max', 'sct_long_dia_count', 'diagcopd',
]
X = df_pt_abn[feature_columns]

# Target: conflc (1 = confirmed cancer, 2 = no cancer), as an integer label.
y = df_pt_abn['conflc'].astype('int')

# Fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [35]:
## Fit a decision tree (depth capped at 4) and report its test-set accuracy.
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

# random_state pins the tie-breaking between equally good splits, so the fitted
# tree and its score are reproducible across runs — consistent with the seeded
# train/test split and the seeded GBDT model.
tree2 = DecisionTreeClassifier(max_depth=4, random_state=0).fit(X_train, y_train)
tree2.score(X_test, y_test)

0.8625954198473282

In [36]:
## Plotting Decision Tree
import pydot_ng as pydot
from IPython.display import IFrame
# `sklearn.externals.six` is no longer shipped with current scikit-learn;
# io.StringIO is the equivalent class on Python 3.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Feature names, in the column order the tree was trained on.
lpy = list(X_train.columns)

# Write the tree as Graphviz DOT text; the context manager closes the file
# before it is read back for rendering.
with open("dt.dot", "w") as dot_data:
    export_graphviz(tree2, out_file=dot_data, filled=True,
                    feature_names=lpy, label='all')

# Render the DOT file to a PNG and embed it in the notebook.
pydot.graph_from_dot_file("dt.dot").write_png("dt.png")
IFrame("dt.png", width=1000, height=500)

### TODO (not done yet): plot feature importance

### GBDT

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

# Same seeded split as used for the decision tree above.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Gradient-boosted trees: depth-4 base learners, learning rate 0.1, fixed seed.
GBDT = GradientBoostingClassifier(
    learning_rate=.1,
    max_depth=4,
    random_state=0,
)
gbdt = GBDT.fit(X_train, y_train)

# Mean accuracy on the held-out test set.
gbdt.score(X_test, y_test)

0.8619014573213046

In [38]:
## Confusion matrix for the gradient-boosted decision tree.
from sklearn.metrics import confusion_matrix

# Hard class predictions on the test set; reused by the metrics cell below.
gbdt_predicted = gbdt.predict(X_test)

# Rows are true labels, columns are predicted labels, both in sorted label order.
confusion_gbdt = confusion_matrix(y_true=y_test, y_pred=gbdt_predicted)
print('gradient boost decision tree classifier',  confusion_gbdt)

gradient boost decision tree classifier [[  48  157]
 [  42 1194]]


In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Accuracy  = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)   (also known as sensitivity, or true positive rate)
# F1        = 2 * Precision * Recall / (Precision + Recall)
# Precision / recall / F1 use scikit-learn's default positive label, 1, which
# in this dataset is the conflc code for confirmed cancer.
gbdt_metric_report = [
    ('Accuracy: {:.2f}', accuracy_score(y_test, gbdt_predicted)),
    ('Precision, which matters more: {:.2f}', precision_score(y_test, gbdt_predicted)),
    ('Recall: {:.2f}', recall_score(y_test, gbdt_predicted)),
    ('F1: {:.2f}', f1_score(y_test, gbdt_predicted)),
]
for metric_template, metric_value in gbdt_metric_report:
    print(metric_template.format(metric_value))

Accuracy: 0.86
Precision, which matters more: 0.53
Recall: 0.23
F1: 0.33


### Plotting ROC Curve

In [None]:
## Plotting the ROC curve for the gradient-boosted model.
## (The original cell was flagged by its author as raising an error.)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# conflc is coded 1 (cancer) / 2 (no cancer). roc_curve can only infer the
# positive class for {0, 1} or {-1, 1} labels, so it must be told explicitly.
positive_label = 1

# For a binary model decision_function scores the second entry of classes_;
# flip the sign when that is not the positive label, so that a higher score
# always means "more likely cancer".
raw_scores = gbdt.decision_function(X_test)
y_score_gbdt = raw_scores if gbdt.classes_[1] == positive_label else -raw_scores

fpr_gbdt, tpr_gbdt, _ = roc_curve(y_test, y_score_gbdt, pos_label=positive_label)
roc_auc_gbdt = auc(fpr_gbdt, tpr_gbdt)

# Explicit figure/axes: plt.axes() would create a second, empty axes on current
# matplotlib instead of returning the one that was drawn on.
fig, ax = plt.subplots()
ax.set_xlim([-0.01, 1.00])
ax.set_ylim([-0.01, 1.01])
ax.plot(fpr_gbdt, tpr_gbdt, lw=3, label='GBDT ROC curve (area = {:0.2f})'.format(roc_auc_gbdt))
ax.set_xlabel('False Positive Rate', fontsize=16)
ax.set_ylabel('True Positive Rate', fontsize=16)
ax.set_title('ROC curve', fontsize=16)
ax.legend(loc='lower right', fontsize=13)
# Chance-level reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
ax.set_aspect('equal')
plt.show()

# Comments

# Kept separate from the analysis above: notes on the third dataset (`df_lc`)

In [None]:
# Preview the first rows of the lung-cancer table (df_lc).
df_lc.head()

In [None]:
        #check duplicate pid in df_lc
# Show (up to 15) rows of df_lc whose pid occurs more than once.
lcid = df_lc["pid"]
repeated_lc_pids = lcid[lcid.duplicated()]
rows_with_repeated_pid = lcid.isin(repeated_lc_pids)
df_lc[rows_with_repeated_pid].head(15)

In [None]:
## Count the distinct patient ids (pid) present in df_lc.
df_lc['pid'].nunique()
## Author's note: this gives the 2058 patients discussed in the last meeting,
## fewer than the 2150 rows in df_lc, so some pids appear on more than one row.

In [None]:
# Left-join the lung-cancer table onto the trimmed patient table by pid.
# NOTE(review): df_lc holds more rows than distinct pids, so patients with
# several df_lc rows are repeated in the result — confirm that is intended.
df_pt_pd = pd.merge(df_pt, df_lc, how='left', on='pid')

# The original listed `.columns`, `.dtypes`, `.shape` and `.info` as bare
# expressions: the first three were discarded and `.info` was never called.
# Show the shape, then the column / dtype summary.
print(df_pt_pd.shape)
df_pt_pd.info()

In [None]:
## Keep only the columns needed from the merged patient / lung-cancer frame.
## ('lesionsize_y' is deliberately left out for now.)
selected_features = [
    'pid',
    'age',
    'cigar',
    'pkyr',
    'smokelive',
    'can_scr',
    'famfather',
    'fammother',
]
df = df_pt_pd.loc[:, selected_features]
df.shape

In [None]:
## Shape of the subset of df whose pid appears on more than one row.
ids = df["pid"]
repeated_ids = ids[ids.duplicated()]
rows_with_repeated_id = ids.isin(repeated_ids)
df[rows_with_repeated_id].shape