In [2]:
# Loading Libraries

import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt 
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv("breast_cancer_data.csv")
data.drop(['id','Unnamed: 32'],axis=1,inplace=True)
# map function for mapping benign into 0 and malignant into 1
data['diagnosis'] = data['diagnosis'].map({'B':0,'M':1})

X=data.drop(['diagnosis'],axis=1)
y=data['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state=42)

In [4]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [5]:
# Random Forest

rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)
y_train_pred=rf_model.predict_proba(X_train)

roc_auc_score(y_train,y_train_pred[:,1])       # ROC curve only need one column from two columns as 1 - 2nd column = 1st column and vice versa.

print('Random Forest training ROC-AUC Accuracy is :', roc_auc_score(y_train,y_train_pred[:,1]))
y_test_pred=rf_model.predict_proba(X_test)
print('Random Forest testing ROC-AUC Accuracy is :', roc_auc_score(y_test,y_test_pred[:,1]))



Random Forest training ROC-AUC Accuracy is : 1.0
Random Forest testing ROC-AUC Accuracy is : 0.9963611693598124


In [5]:
y_train_pred   # 1st column gives probability of 0 and 2nd column gives probability of 1.

array([[0.07, 0.93],
       [0.98, 0.02],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.07, 0.93],
       [0.92, 0.08],
       [1.  , 0.  ],
       [0.01, 0.99],
       [0.99, 0.01],
       [0.97, 0.03],
       [0.95, 0.05],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.95, 0.05],
       [0.21, 0.79],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.07, 0.93],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.02, 0.98],
       [0.93, 0.07],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.97, 0.03],
       [0.99, 0.01],
       [0.95, 0.05],
       [0.98, 0.02],
       [1.  , 0.  ],
       [0.18, 0.82],
       [0.  ,

In [7]:
# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X_train,y_train)
y_train_pred=log_model.predict_proba(X_train)

roc_auc_score(y_train,y_train_pred[:,1]) 

print('Logistic Regression training ROC-AUC Accuracy is :', roc_auc_score(y_train,y_train_pred[:,1]))
y_test_pred=rf_model.predict_proba(X_test)
print('Logistic Regression testing ROC-AUC Accuracy is :', roc_auc_score(y_test,y_test_pred[:,1]))

Logistic Regression training ROC-AUC Accuracy is : 0.9933080070134425
Logistic Regression testing ROC-AUC Accuracy is : 0.9963611693598124


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Adaboost Classifier

ada_model = AdaBoostClassifier()
ada_model.fit(X_train,y_train)
y_train_pred=ada_model.predict_proba(X_train)

roc_auc_score(y_train,y_train_pred[:,1]) 

print('Adaboost Classifier training ROC-AUC Accuracy is :', roc_auc_score(y_train,y_train_pred[:,1]))
y_test_pred=rf_model.predict_proba(X_test)
print('Adaboost Classifier testing ROC-AUC Accuracy is :', roc_auc_score(y_test,y_test_pred[:,1]))

Adaboost Classifier training ROC-AUC Accuracy is : 1.0
Adaboost Classifier testing ROC-AUC Accuracy is : 0.9963611693598124


In [10]:
# KNN Classifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train,y_train)
y_train_pred=knn_model.predict_proba(X_train)

roc_auc_score(y_train,y_train_pred[:,1]) 

print('KNN Classifier training ROC-AUC Accuracy is :', roc_auc_score(y_train,y_train_pred[:,1]))
y_test_pred=rf_model.predict_proba(X_test)
print('KNN Classifier testing ROC-AUC Accuracy is :', roc_auc_score(y_test,y_test_pred[:,1]))

# Always take class 1 from predict proba, as when calculating roc_aus acuracy score, second column is given priority and it is theory defined.

KNN Classifier training ROC-AUC Accuracy is : 0.9851548801870251
KNN Classifier testing ROC-AUC Accuracy is : 0.9963611693598124


In [None]:
# The roc-auc Accuracy obtaibned above is the maximum, accuracy that can be reached in the graph, that is the toppest point in graph.
# For reference, look the graph in note book where p(0) have point 1.

In [11]:
pred=[]
for model in [rf_model,log_model,ada_model,knn_model]:
    pred.append(pd.Series(model.predict_proba(X_test)[:,1]))

final_prediction = pd.concat(pred,axis=1).mean(axis=1)

In [12]:
pd.concat(pred,axis=1)

Unnamed: 0,0,1,2,3
0,0.03,0.183247,0.429136,0.4
1,1.00,1.000000,0.824762,1.0
2,1.00,0.997621,0.668222,1.0
3,0.00,0.007297,0.369334,0.0
4,0.00,0.001729,0.206400,0.0
...,...,...,...,...
183,0.96,0.999752,0.600773,1.0
184,0.99,0.999999,0.774048,1.0
185,0.91,0.977299,0.622390,0.8
186,0.43,0.716766,0.447127,0.6


In [None]:
# The Final Prediction is the mean of all 4 prediction outputs from 4 models.
# This is done for getting the threshold for ensemble technique.
# We can also give individual probalbilities  of ech model along with Y_test when calculating Roc curve to get the threshold.

In [13]:
final_prediction

0      0.352264
1      0.956190
2      0.916583
3      0.093896
4      0.052002
         ...   
183    0.900144
184    0.943512
185    0.800475
186    0.585061
187    0.094966
Length: 188, dtype: float64

In [15]:
# Calculate the ROC Curve

fpr, tpr, thresholds = roc_curve(y_test, final_prediction)

In [16]:
thresholds

array([1.9617185 , 0.9617185 , 0.61484321, 0.58506107, 0.53437239,
       0.34752442, 0.33666533, 0.05200213])

In [None]:
# These the thresholds given by roc_curve
# With these threshlods, we need to find the class of the predict_proba from the ensembling result.
# After finding the class, we could find the accuracy_score with that predicted class.
# From this Accurascy score, we could understand which threshold gives good accuracy and choose that threshold for final submission.

In [17]:
accuracy_ls = []

for thres in thresholds:
    y_pred=np.where(final_prediction>thres,1,0)
    accuracy_ls.append(accuracy_score(y_test,y_pred))


In [23]:
acc_list = pd.concat([pd.Series(thresholds),pd.Series(accuracy_ls)],axis=1)
acc_list.columns = ['Threshold','Accuracy']
acc_list.sort_values(by='Accuracy', ascending=False,inplace=True)

In [24]:
acc_list

Unnamed: 0,Threshold,Accuracy
4,0.534372,0.984043
3,0.585061,0.973404
2,0.614843,0.968085
5,0.347524,0.946809
6,0.336665,0.941489
0,1.961719,0.643617
1,0.961719,0.643617
7,0.052002,0.361702
