In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler
#import xgboost as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from statistics import mean
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
# Load the radiomics feature table and build the train / test matrices.
# NOTE(review): both frames are read from the same file, so every "test"
# metric further down is measured on the training samples. Confirm whether
# a separate hold-out CSV was intended for tst_data.
data = pd.read_csv(r'Radiomics.csv')
tst_data = pd.read_csv(r'Radiomics.csv')

# Identifier / target columns that must not be fed to the models.
non_feature_columns = ['Pat_id', 'Label', 'Mask']
train_features = data.drop(non_feature_columns, axis=1)
test_features = tst_data.drop(non_feature_columns, axis=1)
y_train = data['Label'].values
y_test = tst_data['Label'].values

# Impute missing values BEFORE normalising: preprocessing.normalize rejects
# NaN input. The fill result is kept (the previous inplace fill acted on a
# temporary DataFrame copy and was discarded), and the test features are
# filled with the training column means instead of the label mean.
train_column_means = train_features.mean()
X_train = train_features.fillna(train_column_means).values
X_test = test_features.fillna(train_column_means).values

# Scale every sample (row) to unit L2 norm.
X_train = preprocessing.normalize(X_train, axis=1)
X_test = preprocessing.normalize(X_test, axis=1)


In [3]:
# Visualising features using heat map
%pylab
X=X_train
y=y_train

corrmat=data.corr()
top=corrmat.index
plt.figure()
#plot heat map
sns.heatmap(data[top].corr(),annot=True,cmap="RdYlGn")
plt.show()

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
# Balance the classes by synthesising minority-class samples with SMOTE,
# e.g. Benign 100 / Malignant 450 becomes Benign 450 / Malignant 450.
# NOTE(review): X_res / y_res are not referenced by any later cell shown in
# this notebook; the models below are fitted on X_train / y_train.
sm = SMOTE(random_state=100)
resampled = sm.fit_resample(X_train, y_train)
X_res, y_res = resampled
class_counts = Counter(y_res)
print('Resampled dataset shape %s' % class_counts)


Resampled dataset shape Counter({1: 356, 0: 356})


In [5]:
# Principal Component Analysis: drop redundant directions and shrink the
# feature space. The projection is fitted on the training matrix only and
# then applied to both splits.
# NOTE(review): X_train / X_test are overwritten here, so re-running this
# cell would fit a second PCA on the already-reduced data.
pca = PCA(0.95).fit(X_train)
X_train, X_test = pca.transform(X_train), pca.transform(X_test)
for reduced in (X_train, X_test):
    print(reduced.shape)

(454, 2)
(454, 2)


In [None]:
# Build one scaler + classifier pipeline per model family and tune each with
# a 10-fold grid search on the training data.

# Seed shared by every stochastic estimator so a re-run reproduces the same
# search results (same value as the SMOTE cell uses).
RANDOM_STATE = 100

# Hyper-parameter grids; keys are "<pipeline step name>__<parameter>".
SVparam_grid = [{'SVM__kernel': ['rbf', 'linear'],
                 'SVM__gamma': [0.01, 0.1, 1],
                 'SVM__C': [0.1, 1, 10]}]
LRparam_grid = [{'LR__C': np.power(10.0, np.arange(-1.0, 2.0, 1.0))}]
DTparam_grid = [{'DT__max_depth': np.arange(3, 16, dtype='int')}]
kNNparam_grid = [{'Knn__n_neighbors': np.arange(3, 16, dtype='int')}]
RFparam_grid = [{'RF__max_depth': np.arange(1, 8, dtype='int'),
                 'RF__n_estimators': np.arange(1, 8, dtype='int')}]
#xGBparam_grid = [{'max_depth': np.arange(1, 18, dtype='int')}]

LRpipeline = Pipeline([('scaler', StandardScaler()),
                       ('LR', LogisticRegression(solver='lbfgs'))])
DTpipeline = Pipeline([('transformer', StandardScaler()),
                       ('DT', DecisionTreeClassifier(criterion='entropy',
                                                     random_state=RANDOM_STATE))])
kNNpipeline = Pipeline([('transformer', StandardScaler()),
                        ('Knn', KNeighborsClassifier())])
RFpipeline = Pipeline([('transform', StandardScaler()),
                       ('RF', RandomForestClassifier(random_state=RANDOM_STATE))])
# probability=True is required for the predict_proba calls in the ROC cells.
SVpipeline = Pipeline([('scaler', StandardScaler()),
                       ('SVM', SVC(probability=True, random_state=RANDOM_STATE))])

# The `iid` argument is no longer passed: it was deprecated and then removed
# from GridSearchCV, where supplying it raises a TypeError.
SVM_grid = GridSearchCV(SVpipeline, param_grid=SVparam_grid, cv=10)
Knn_grid = GridSearchCV(kNNpipeline, param_grid=kNNparam_grid, cv=10)
LR_grid = GridSearchCV(LRpipeline, param_grid=LRparam_grid, cv=10)
DT_grid = GridSearchCV(DTpipeline, param_grid=DTparam_grid, cv=10)
RF_grid = GridSearchCV(RFpipeline, param_grid=RFparam_grid, cv=10)

# fit() returns the fitted search object itself, so each *_pline name is the
# tuned GridSearchCV instance used by the evaluation cells below.
SV_pline = SVM_grid.fit(X_train, y_train)
kNN_pline = Knn_grid.fit(X_train, y_train)
LR_pline = LR_grid.fit(X_train, y_train)
DT_pline = DT_grid.fit(X_train, y_train)
RF_pline = RF_grid.fit(X_train, y_train)

In [None]:
# Best hyper-parameters found by GridSearchCV for each model, printed in the
# order SVM, decision tree, logistic regression, kNN, random forest.
for fitted_search in (SV_pline, DT_pline, LR_pline, kNN_pline, RF_pline):
    print(fitted_search.best_params_)

In [None]:
# Mean 5-fold cross-validation score of every tuned search object on the
# training data.
cv_candidates = (
    ('SVM', SV_pline),
    ('kNN', kNN_pline),
    ('Logistic Regression', LR_pline),
    ('Decision Tree', DT_pline),
    ('Random Forest', RF_pline),
)
for model_label, fitted_search in cv_candidates:
    fold_scores = cross_val_score(fitted_search, X_train, y_train, cv=5)
    print(model_label, np.mean(fold_scores))

In [None]:
# Per-class precision / recall / F1 report of every tuned model on the test
# split, each preceded by the model's name.
report_candidates = (
    ('SVM', SV_pline),
    ('Decision Tree', DT_pline),
    ('Logistic Regression', LR_pline),
    ('K-Nearest Neighbours', kNN_pline),
    ('Random Forest', RF_pline),
)
for heading, fitted_search in report_candidates:
    print(heading)
    print(classification_report(y_test, fitted_search.predict(X_test)))

In [None]:
# Confusion matrix of every tuned model on the test split.
# The logistic-regression matrix now carries its own heading; it was
# previously printed under a second "Knn" heading.
matrix_candidates = (
    ("Confusion Matrix of SVM:", SV_pline),
    ("Confusion Matrix of Knn:", kNN_pline),
    ("Confusion Matrix of LR:", LR_pline),
    ("Confusion Matrix of DT:", DT_pline),
    ("Confusion Matrix of RF:", RF_pline),
)
for heading, fitted_search in matrix_candidates:
    print(heading)
    print(confusion_matrix(y_test, fitted_search.predict(X_test)))

In [None]:
# Containers shared by the ROC cells below, keyed by class index:
# false-positive rates, true-positive rates, thresholds and AUC values.
fpr, tpr, thres, roc_auc = {}, {}, {}, {}

In [None]:
# ROC analysis of the tuned SVM on the test split, one curve per class.
# Column i of predict_proba is the probability of classes_[i], so it has to
# be scored with that class as the positive label; with the default positive
# label the class-0 curve would be mirrored below the diagonal.
svm_proba = SV_pline.predict_proba(X_test)
for i in range(2):
    y_pred_proba = svm_proba[:, i]
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test, y_pred_proba,
                                          pos_label=SV_pline.classes_[i])
    roc_auc[i] = auc(fpr[i], tpr[i])


print('AUC in SVM is', roc_auc[1])
plt.figure()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Legend entries name the class so the two curves can be told apart.
for i, curve_colour in enumerate(('red', 'darkorange')):
    plt.plot(fpr[i], tpr[i], color=curve_colour, lw=2,
             label='ROC curve of class %s (area = %0.2f)'
                   % (SV_pline.classes_[i], roc_auc[i]))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of SVM')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC analysis of the tuned decision tree on the test split, one curve per
# class. Column i of predict_proba is the probability of classes_[i], so it
# has to be scored with that class as the positive label; with the default
# positive label the class-0 curve would be mirrored below the diagonal.
dt_proba = DT_pline.predict_proba(X_test)
for i in range(2):
    y_pred_proba = dt_proba[:, i]
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test, y_pred_proba,
                                          pos_label=DT_pline.classes_[i])
    roc_auc[i] = auc(fpr[i], tpr[i])


print('AUC in Decision tree is', roc_auc[1])
plt.figure()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Legend entries name the class so the two curves can be told apart.
for i, curve_colour in enumerate(('red', 'darkorange')):
    plt.plot(fpr[i], tpr[i], color=curve_colour, lw=2,
             label='ROC curve of class %s (area = %0.2f)'
                   % (DT_pline.classes_[i], roc_auc[i]))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of Decision Tree')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC analysis of the tuned logistic regression on the test split, one curve
# per class. Column i of predict_proba is the probability of classes_[i], so
# it has to be scored with that class as the positive label; with the default
# positive label the class-0 curve would be mirrored below the diagonal.
lr_proba = LR_pline.predict_proba(X_test)
for i in range(2):
    y_pred_proba = lr_proba[:, i]
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test, y_pred_proba,
                                          pos_label=LR_pline.classes_[i])
    roc_auc[i] = auc(fpr[i], tpr[i])


print('AUC in Logistic regression is', roc_auc[1])
plt.figure()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Legend entries name the class so the two curves can be told apart.
for i, curve_colour in enumerate(('red', 'darkorange')):
    plt.plot(fpr[i], tpr[i], color=curve_colour, lw=2,
             label='ROC curve of class %s (area = %0.2f)'
                   % (LR_pline.classes_[i], roc_auc[i]))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of Logistic Regression')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC analysis of the tuned k-nearest-neighbour model on the test split, one
# curve per class. Column i of predict_proba is the probability of
# classes_[i], so it has to be scored with that class as the positive label;
# with the default positive label the class-0 curve would be mirrored below
# the diagonal.
knn_proba = kNN_pline.predict_proba(X_test)
for i in range(2):
    y_pred_proba = knn_proba[:, i]
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test, y_pred_proba,
                                          pos_label=kNN_pline.classes_[i])
    roc_auc[i] = auc(fpr[i], tpr[i])


print('AUC in K_nearest neighbor is', roc_auc[1])
plt.figure()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Legend entries name the class so the two curves can be told apart.
for i, curve_colour in enumerate(('red', 'darkorange')):
    plt.plot(fpr[i], tpr[i], color=curve_colour, lw=2,
             label='ROC curve of class %s (area = %0.2f)'
                   % (kNN_pline.classes_[i], roc_auc[i]))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of K_nearest neighbor')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC analysis of the tuned random forest on the test split, one curve per
# class. Column i of predict_proba is the probability of classes_[i], so it
# has to be scored with that class as the positive label; with the default
# positive label the class-0 curve would be mirrored below the diagonal.
rf_proba = RF_pline.predict_proba(X_test)
for i in range(2):
    y_pred_proba = rf_proba[:, i]
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test, y_pred_proba,
                                          pos_label=RF_pline.classes_[i])
    roc_auc[i] = auc(fpr[i], tpr[i])


print('AUC in Random Forest is', roc_auc[1])
plt.figure()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Legend entries name the class so the two curves can be told apart.
for i, curve_colour in enumerate(('red', 'darkorange')):
    plt.plot(fpr[i], tpr[i], color=curve_colour, lw=2,
             label='ROC curve of class %s (area = %0.2f)'
                   % (RF_pline.classes_[i], roc_auc[i]))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of Random Forest')
plt.legend(loc="lower right")
plt.show()