In [43]:
import pandas as pd
import os, json
import time
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import random
import pydicom
import pylidc as pl
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
df_train = pd.read_csv("/Users/alex/dev/STAT 447B/Project/Data/Meta/meta_annotation_info_train.csv")
df_test = pd.read_csv("/Users/alex/dev/STAT 447B/Project/Data/Meta/meta_annotation_info_test.csv")

In [3]:
list(df_train.columns)

['Patient_id',
 'Nodule_no',
 'Annotation_no',
 'Internal structure',
 'Calcification',
 'Subtlety',
 'Margin',
 'Sphericity',
 'Lobulation',
 'Spiculation',
 'Texture',
 'Internal structure_entropy',
 'Calcification_entropy',
 'Subtlety_entropy',
 'Margin_entropy',
 'Sphericity_entropy',
 'Lobulation_entropy',
 'Spiculation_entropy',
 'Texture_entropy',
 'Malignancy_entropy',
 'Internal structure_mode',
 'Calcification_mode',
 'Subtlety_mode',
 'Margin_mode',
 'Sphericity_mode',
 'Lobulation_mode',
 'Spiculation_mode',
 'Texture_mode',
 'Malignancy_mode',
 'Internal structure_mean',
 'Calcification_mean',
 'Subtlety_mean',
 'Margin_mean',
 'Sphericity_mean',
 'Lobulation_mean',
 'Spiculation_mean',
 'Texture_mean',
 'Malignancy_mean',
 'Internal structure_median',
 'Calcification_median',
 'Subtlety_median',
 'Margin_median',
 'Sphericity_median',
 'Lobulation_median',
 'Spiculation_median',
 'Texture_median',
 'Malignancy_median',
 'Internal structure_median_high',
 'Calcification_me

In [4]:
potential_predictors_list = ['Internal structure',
 'Calcification',
 'Subtlety',
 'Margin',
 'Sphericity',
 'Lobulation',
 'Spiculation',
 'Texture',
 'Internal structure_entropy',
 'Calcification_entropy',
 'Subtlety_entropy',
 'Margin_entropy',
 'Sphericity_entropy',
 'Lobulation_entropy',
 'Spiculation_entropy',
 'Texture_entropy',
 'Internal structure_mode',
 'Calcification_mode',
 'Subtlety_mode',
 'Margin_mode',
 'Sphericity_mode',
 'Lobulation_mode',
 'Spiculation_mode',
 'Texture_mode',
 'Internal structure_mean',
 'Calcification_mean',
 'Subtlety_mean',
 'Margin_mean',
 'Sphericity_mean',
 'Lobulation_mean',
 'Spiculation_mean',
 'Texture_mean',
 'Internal structure_median',
 'Calcification_median',
 'Subtlety_median',
 'Margin_median',
 'Sphericity_median',
 'Lobulation_median',
 'Spiculation_median',
 'Texture_median',
 'Internal structure_median_high',
 'Calcification_median_high',
 'Subtlety_median_high',
 'Margin_median_high',
 'Sphericity_median_high',
 'Lobulation_median_high',
 'Spiculation_median_high',
 'Texture_median_high']

In [50]:
X_train = df_train[potential_predictors_list]
y_train = df_train["Is_cancer"]
X_test = df_test[potential_predictors_list]
y_test = df_test["Is_cancer"]

list_continuous_predictors = [k for k in potential_predictors_list if 'mean' in k] + [k for k in potential_predictors_list if 'entropy' in k]
print(list_continuous_predictors)

X_train = X_train.astype('category')
X_test = X_test.astype('category')
y_train = y_train.astype('category')
y_test = y_test.astype('category')

X_train[list_continuous_predictors] = X_train[list_continuous_predictors].astype('int32')
X_test[list_continuous_predictors] = X_train[list_continuous_predictors].astype('int32')

le = LabelEncoder()
X_train_numeric = X_train.apply(le.fit_transform)
X_test_numeric = X_test.apply(le.fit_transform)
y_train_numeric = le.fit_transform(y_train)
y_test_numeric = le.fit_transform(y_test)

['Internal structure_mean', 'Calcification_mean', 'Subtlety_mean', 'Margin_mean', 'Sphericity_mean', 'Lobulation_mean', 'Spiculation_mean', 'Texture_mean', 'Internal structure_entropy', 'Calcification_entropy', 'Subtlety_entropy', 'Margin_entropy', 'Sphericity_entropy', 'Lobulation_entropy', 'Spiculation_entropy', 'Texture_entropy']


In [6]:
print(X_train.head())
print(X_train.dtypes)

  Internal structure Calcification Subtlety Margin Sphericity Lobulation  \
0                  1             6        5      3          3          3   
1                  1             6        5      4          4          5   
2                  1             6        5      2          3          3   
3                  1             6        5      4          5          1   
4                  1             6        1      2          5          1   

  Spiculation Texture  Internal structure_entropy  Calcification_entropy  ...  \
0           4       5                           0                      0  ...   
1           5       5                           0                      0  ...   
2           3       5                           0                      0  ...   
3           5       4                           0                      0  ...   
4           1       1                           0                      0  ...   

   Spiculation_median  Texture_median  Internal structur

In [7]:
clf = RandomForestClassifier(random_state = 42)

param_grid = {'max_depth': [2, 3, 4, 5, 6, 8, 10, 15, 20],
              'max_features': [5, 10, 20, 30, 40, 48],
              'min_samples_leaf': [1, 2, 4, 6],
              'min_samples_split': [2, 5, 10],
              'n_estimators': [10, 50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
              'bootstrap': [False]}

k_fold_cv = 5

rf_grid = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter = 100, cv = k_fold_cv, n_jobs = -1, refit = True, verbose = 4, return_train_score = True, random_state = 42)
rf_grid.fit(X_train, y_train)

print('Best hyper parameter: ', rf_grid.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 2/5] END bootstrap=False, max_depth=2, max_features=5, min_samples_leaf=1, min_samples_split=10, n_estimators=1000;, score=(train=0.472, test=0.466) total time=   1.4s
[CV 3/5] END bootstrap=False, max_depth=4, max_features=40, min_samples_leaf=4, min_samples_split=10, n_estimators=1400;, score=(train=0.529, test=0.466) total time=   8.6s
[CV 1/5] END bootstrap=False, max_depth=5, max_features=5, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=(train=0.594, test=0.501) total time=   2.9s
[CV 1/5] END bootstrap=False, max_depth=4, max_features=20, min_samples_leaf=6, min_samples_split=5, n_estimators=200;, score=(train=0.555, test=0.485) total time=   0.7s
[CV 1/5] END bootstrap=False, max_depth=15, max_features=30, min_samples_leaf=2, min_samples_split=5, n_estimators=1200;, score=(train=0.967, test=0.469) total time=  13.3s
[CV 1/5] END bootstrap=False, max_depth=5, max_features=20, min_samples_leaf



[CV 5/5] END bootstrap=False, max_depth=2, max_features=5, min_samples_leaf=1, min_samples_split=10, n_estimators=1000;, score=(train=0.497, test=0.494) total time=   1.3s
[CV 1/5] END bootstrap=False, max_depth=4, max_features=40, min_samples_leaf=4, min_samples_split=10, n_estimators=1400;, score=(train=0.537, test=0.493) total time=   8.6s
[CV 3/5] END bootstrap=False, max_depth=5, max_features=5, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=(train=0.600, test=0.450) total time=   2.9s
[CV 3/5] END bootstrap=False, max_depth=4, max_features=20, min_samples_leaf=6, min_samples_split=5, n_estimators=200;, score=(train=0.562, test=0.455) total time=   0.8s
[CV 4/5] END bootstrap=False, max_depth=15, max_features=30, min_samples_leaf=2, min_samples_split=5, n_estimators=1200;, score=(train=0.966, test=0.551) total time=  13.5s
[CV 3/5] END bootstrap=False, max_depth=5, max_features=20, min_samples_leaf=1, min_samples_split=2, n_estimators=1400;, score=(train=0.619

In [29]:
hd = list(X_train.columns)
for i, f in zip(hd, rf_grid.best_estimator_.feature_importances_):
 print(i,round(f*100,2))

Internal structure 0.0
Calcification 2.13
Subtlety 1.43
Margin 0.67
Sphericity 0.72
Lobulation 0.71
Spiculation 0.99
Texture 0.46
Internal structure_entropy 0.0
Calcification_entropy 0.19
Subtlety_entropy 2.5
Margin_entropy 2.01
Sphericity_entropy 2.48
Lobulation_entropy 1.56
Spiculation_entropy 1.29
Texture_entropy 1.0
Internal structure_mode 0.0
Calcification_mode 5.32
Subtlety_mode 4.77
Margin_mode 2.15
Sphericity_mode 2.66
Lobulation_mode 2.11
Spiculation_mode 1.97
Texture_mode 1.07
Internal structure_mean 0.01
Calcification_mean 4.18
Subtlety_mean 3.81
Margin_mean 3.01
Sphericity_mean 2.4
Lobulation_mean 1.8
Spiculation_mean 1.65
Texture_mean 2.33
Internal structure_median 0.0
Calcification_median 3.19
Subtlety_median 6.85
Margin_median 3.19
Sphericity_median 3.24
Lobulation_median 4.0
Spiculation_median 3.11
Texture_median 1.76
Internal structure_median_high 0.0
Calcification_median_high 3.27
Subtlety_median_high 3.36
Margin_median_high 1.89
Sphericity_median_high 2.3
Lobulation_

In [32]:
predictions = rf_grid.predict(X_test)
prob_predictions = rf_grid.predict_proba(X_test)
print((X_test.shape))
print((y_test.shape))

print(y_train.value_counts()/ y_train.shape)

(2335, 48)
(2335,)
False           0.321060
No_consensus    0.286035
Ambiguous       0.247643
True            0.145263
Name: Is_cancer, dtype: float64


In [13]:
print(np.unique)

['Ambiguous' 'False' 'No_consensus' 'True']


In [23]:
print(accuracy_score(y_test, predictions))


0.515203426124197


# XGBoost

In [52]:
params = {'max_depth': [2, 3, 4, 5, 6, 8, 10, 15, 20],
          'subsample': [0.3, 0.4, 0.5, 0.6],
          'learning_rate': [0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4],
          'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1],
          'n_estimators': [10, 50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

X_train[list_continuous_predictors] = X_train[list_continuous_predictors].astype('int32')
X_test[list_continuous_predictors] = X_train[list_continuous_predictors].astype('int32')

print(X_train.dtypes)


xgbclf = xgb.XGBClassifier(objective="multi:softmax", tree_method='hist')
xgb_grid = RandomizedSearchCV(estimator=xgbclf,
                         param_distributions=params,
                         cv = k_fold_cv,
                         scoring='accuracy',
                         n_iter=500,
                         n_jobs=-1,
                         verbose=4)

xgb_grid.fit(X_train_numeric, y_train_numeric)

Internal structure                category
Calcification                     category
Subtlety                          category
Margin                            category
Sphericity                        category
Lobulation                        category
Spiculation                       category
Texture                           category
Internal structure_entropy           int32
Calcification_entropy                int32
Subtlety_entropy                     int32
Margin_entropy                       int32
Sphericity_entropy                   int32
Lobulation_entropy                   int32
Spiculation_entropy                  int32
Texture_entropy                      int32
Internal structure_mode           category
Calcification_mode                category
Subtlety_mode                     category
Margin_mode                       category
Sphericity_mode                   category
Lobulation_mode                   category
Spiculation_mode                  category
Texture_mod



[CV 1/5] END colsample_bytree=1.1, eval_metric=MSE, learning_rate=0.05, max_depth=10, n_estimators=2000, subsample=0.3;, score=nan total time=   0.0s
[CV 2/5] END colsample_bytree=1.1, eval_metric=MSE, learning_rate=0.05, max_depth=10, n_estimators=2000, subsample=0.3;, score=nan total time=   0.0s
[CV 3/5] END colsample_bytree=1.1, eval_metric=MSE, learning_rate=0.05, max_depth=10, n_estimators=2000, subsample=0.3;, score=nan total time=   0.0s
[CV 4/5] END colsample_bytree=1.1, eval_metric=MSE, learning_rate=0.05, max_depth=10, n_estimators=2000, subsample=0.3;, score=nan total time=   0.0s
[CV 5/5] END colsample_bytree=1.1, eval_metric=MSE, learning_rate=0.05, max_depth=10, n_estimators=2000, subsample=0.3;, score=nan total time=   0.0s
[CV 3/5] END colsample_bytree=1, eval_metric=MSE, learning_rate=0.3, max_depth=3, n_estimators=50, subsample=0.5;, score=nan total time=   0.0s
[CV 4/5] END colsample_bytree=1, eval_metric=MSE, learning_rate=0.3, max_depth=3, n_estimators=50, subsamp

330 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/alex/opt/anaconda3/envs/CancerClassificationProject/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/alex/opt/anaconda3/envs/CancerClassificationProject/lib/python3.9/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/Users/alex/opt/anaconda3/envs/CancerClassificationProject/lib/python3.9/site-packages/xgboost/sklearn.py", line 1490, in fit
    self._Booster = train(
  File "/Users/alex/opt/anaconda3/envs/CancerCl

In [54]:
hd = list(X_train.columns)
for i, f in zip(hd, xgb_grid.best_estimator_.feature_importances_):
    print(i,round(f*100,2))

xgb_predictions = xgb_grid.predict(X_test_numeric)

print(accuracy_score(y_test_numeric, xgb_predictions))

Internal structure 0.75
Calcification 1.19
Subtlety 1.02
Margin 0.82
Sphericity 0.85
Lobulation 1.0
Spiculation 1.1
Texture 0.89
Internal structure_entropy 1.25
Calcification_entropy 1.36
Subtlety_entropy 1.6
Margin_entropy 1.4
Sphericity_entropy 1.79
Lobulation_entropy 1.57
Spiculation_entropy 1.62
Texture_entropy 1.43
Internal structure_mode 0.97
Calcification_mode 11.9
Subtlety_mode 2.81
Margin_mode 1.33
Sphericity_mode 1.49
Lobulation_mode 1.34
Spiculation_mode 1.72
Texture_mode 1.25
Internal structure_mean 1.14
Calcification_mean 4.18
Subtlety_mean 2.56
Margin_mean 1.65
Sphericity_mean 1.56
Lobulation_mean 1.63
Spiculation_mean 1.74
Texture_mean 1.43
Internal structure_median 1.13
Calcification_median 7.62
Subtlety_median 3.02
Margin_median 1.43
Sphericity_median 1.51
Lobulation_median 2.41
Spiculation_median 3.27
Texture_median 1.52
Internal structure_median_high 1.15
Calcification_median_high 7.98
Subtlety_median_high 2.05
Margin_median_high 1.36
Sphericity_median_high 1.62
Lobu