In [117]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [118]:
def classificationMetrics(y, yhat, yproba=None):
    """Compute a dictionary of binary-classification metrics.

    Parameters
    ----------
    y : array-like
        True class labels.
    yhat : array-like
        Predicted class labels (hard predictions). Used for Accuracy,
        Precision, Recall and f1-score.
    yproba : array-like, optional
        Predicted probability of the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. A two-column ``predict_proba``
        matrix is also accepted; its second column is used. Log-loss and AUC
        are only meaningful when computed from probabilities, so pass this
        whenever the model can provide them. When omitted, ``yhat`` is used
        for both (the historical behaviour), which makes Log-loss a rescaled
        error rate and AUC a single-threshold value.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'f1-score'``,
        ``'Log-loss'`` and ``'AUC'``.
    """
    if yproba is None:
        # Backward-compatible fallback: score with the hard labels.
        scores = yhat
    else:
        scores = np.asarray(yproba)
        if scores.ndim == 2 and scores.shape[1] == 2:
            # predict_proba layout: column 1 holds the positive-class probability.
            scores = scores[:, 1]

    # Per-class arrays in sorted label order; index 1 selects the second
    # class (label 1 for 0/1 targets).
    prf1 = metrics.precision_recall_fscore_support(y, yhat)
    res = {'Accuracy': metrics.accuracy_score(y, yhat),
           'Precision': prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y, scores),
           'AUC': metrics.roc_auc_score(y, scores)
          }
    return res

In [119]:
# Data

In [120]:
# Load the previously saved data object from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'final_df.pkl' if it comes from a trusted source.
import pickle
with open('final_df.pkl', mode='rb') as pkl_handle:
    final_df = pickle.load(pkl_handle)

In [121]:
# Working frame: the loaded data without its 'id' column
# (drop returns a new frame, so final_df itself is left unchanged).
df = final_df.drop(columns=['id'])
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1.0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,1.0,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,1.0,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,1.0,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,1.0,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1.0,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,1.0,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,1.0,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,1.0,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [122]:
# Class balance of the target column.
df.loc[:, 'diagnosis'].value_counts()

0.0    357
1.0    212
Name: diagnosis, dtype: int64

In [123]:
# Separate the target column from the feature matrix.
y = df['diagnosis']
X = df.drop(columns='diagnosis')

In [124]:
from sklearn.model_selection import train_test_split

In [125]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [126]:
## Models

In [127]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import xgboost as xgb

In [128]:
# Empty accumulator table; later cells append one row of evaluation
# metrics per fitted model.
models_list = pd.DataFrame()

In [129]:
# Logistic regression: fit on the training split (fit returns the estimator),
# then predict class labels for the held-out rows.
mod1 = LogisticRegression(max_iter=1000).fit(X_train, y_train)
pred1 = mod1.predict(X_test)

In [130]:
# Show the raw coefficient array of the fitted logistic regression.
print("Coefficients:", mod1.coef_)

Coefficients: [[-2.36693637 -0.17693913  0.12778407  0.00700177  0.17985647  0.42204253
   0.78219161  0.47408197  0.27092339  0.0278306   0.1440861  -1.75663829
  -0.25650711  0.13168072  0.02305564 -0.03402154  0.02381798  0.05442257
   0.05815707 -0.00699278 -0.77732126  0.44084808  0.08678473  0.01570434
   0.34431868  1.07885602  1.79305498  0.81369561  0.9917471   0.10228631]]


In [131]:
# Pair each feature name with its logistic regression coefficient and list
# them from the largest (most positive) to the smallest (most negative).
feature_names = X_train.columns
coefficients = mod1.coef_[0]  # first (and, for a binary target, only) row of coef_
coeff_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': coefficients,
    }
)
coeff_df_sorted = coeff_df.sort_values('Coefficient', ascending=False)
print(coeff_df_sorted)

                    Feature  Coefficient
26          concavity_worst     1.793055
25        compactness_worst     1.078856
28           symmetry_worst     0.991747
27     concave points_worst     0.813696
6            concavity_mean     0.782192
7       concave points_mean     0.474082
21            texture_worst     0.440848
5          compactness_mean     0.422043
24         smoothness_worst     0.344319
8             symmetry_mean     0.270923
4           smoothness_mean     0.179856
10                radius_se     0.144086
13                  area_se     0.131681
2            perimeter_mean     0.127784
29  fractal_dimension_worst     0.102286
22          perimeter_worst     0.086785
18              symmetry_se     0.058157
17        concave points_se     0.054423
9    fractal_dimension_mean     0.027831
16             concavity_se     0.023818
14            smoothness_se     0.023056
23               area_worst     0.015704
3                 area_mean     0.007002
19     fractal_d

In [158]:
# Confusion matrix for the logistic regression predictions
# (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_true=y_test, y_pred=pred1)
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[106   2]
 [  4  59]]


In [133]:
# Append the logistic regression test-set metrics as a new row of
# models_list, then display the table.
model_dict = dict(model="Logistic Regression")
new_row = pd.DataFrame(
    [{**model_dict, **classificationMetrics(y_test, pred1)}]
)
models_list = pd.concat(objs=[models_list, new_row], ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995


In [134]:
# True labels (rows) against logistic regression predictions (columns).
pd.crosstab(index=y_test, columns=pred1)

col_0,0.0,1.0
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,106,2
1.0,4,59


In [135]:
# Decision tree: fit on the training split (seeded for reproducibility),
# then predict class labels for the held-out rows.
mod2 = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
pred2 = mod2.predict(X_test)

In [136]:
# Append the decision tree test-set metrics as a new row of models_list,
# then display the table.
model_dict = dict(model="Decision Tree")
new_row = pd.DataFrame(
    [{**model_dict, **classificationMetrics(y_test, pred2)}]
)
models_list = pd.concat(objs=[models_list, new_row], ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847


In [137]:
# True labels (rows) against decision tree predictions (columns).
pd.crosstab(index=y_test, columns=pred2)

col_0,0.0,1.0
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,101,7
1.0,4,59


In [138]:
# Random forest: fit on the training split (seeded for reproducibility),
# then predict class labels for the held-out rows.
mod3 = RandomForestClassifier(random_state=1).fit(X_train, y_train)
pred3 = mod3.predict(X_test)

In [139]:
# Append the random forest test-set metrics as a new row of models_list,
# then display the table.
model_dict = dict(model="RandomForest")
new_row = pd.DataFrame(
    [{**model_dict, **classificationMetrics(y_test, pred3)}]
)
models_list = pd.concat(objs=[models_list, new_row], ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847
2,RandomForest,0.97076,0.983333,0.936508,0.95935,1.053908,0.963624


In [140]:
# True labels (rows) against random forest predictions (columns).
pd.crosstab(index=y_test, columns=pred3)

col_0,0.0,1.0
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,107,1
1.0,4,59


In [141]:
# AdaBoost: fit on the training split (seeded for reproducibility),
# then predict class labels for the held-out rows.
mod4 = AdaBoostClassifier(random_state=1).fit(X_train, y_train)
pred4 = mod4.predict(X_test)

In [142]:
# Append the AdaBoost test-set metrics as a new row of models_list,
# then display the table.
model_dict = dict(model="ADABoost")
new_row = pd.DataFrame(
    [{**model_dict, **classificationMetrics(y_test, pred4)}]
)
models_list = pd.concat(objs=[models_list, new_row], ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847
2,RandomForest,0.97076,0.983333,0.936508,0.95935,1.053908,0.963624
3,ADABoost,0.976608,0.968254,0.968254,0.968254,0.843126,0.974868


In [143]:
# True labels (rows) against AdaBoost predictions (columns).
pd.crosstab(index=y_test, columns=pred4)

col_0,0.0,1.0
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,106,2
1.0,2,61


In [144]:
# Gradient boosting: fit on the training split (seeded for reproducibility),
# then predict class labels for the held-out rows.
mod5 = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
pred5 = mod5.predict(X_test)

In [145]:
# Append the gradient boosting test-set metrics as a new row of models_list,
# then display the table.
model_dict = dict(model="GBM")
new_row = pd.DataFrame(
    [{**model_dict, **classificationMetrics(y_test, pred5)}]
)
models_list = pd.concat(objs=[models_list, new_row], ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847
2,RandomForest,0.97076,0.983333,0.936508,0.95935,1.053908,0.963624
3,ADABoost,0.976608,0.968254,0.968254,0.968254,0.843126,0.974868
4,GBM,0.959064,0.951613,0.936508,0.944,1.475471,0.954365


In [146]:
# True labels (rows) against gradient boosting predictions (columns).
pd.crosstab(index=y_test, columns=pred5)

col_0,0.0,1.0
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,105,3
1.0,4,59


In [147]:
### Support Vector Machine (SVM)

In [148]:
# Support vector classifier with probability estimates enabled: fit on the
# training split, then predict class labels for the held-out rows.
mod6 = SVC(probability=True).fit(X_train, y_train)
pred6 = mod6.predict(X_test)

In [149]:
# Append the SVM test-set metrics as a new row of models_list,
# then display the table.
model_dict = dict(model="SVM")
new_row = pd.DataFrame(
    [{**model_dict, **classificationMetrics(y_test, pred6)}]
)
models_list = pd.concat(objs=[models_list, new_row], ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847
2,RandomForest,0.97076,0.983333,0.936508,0.95935,1.053908,0.963624
3,ADABoost,0.976608,0.968254,0.968254,0.968254,0.843126,0.974868
4,GBM,0.959064,0.951613,0.936508,0.944,1.475471,0.954365
5,SVM,0.935673,1.0,0.825397,0.904348,2.318598,0.912698


In [150]:
# True labels (rows) against SVM predictions (columns).
pd.crosstab(index=y_test, columns=pred6)

col_0,0.0,1.0
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,108,0
1.0,11,52


In [151]:
### XGBoost

In [152]:
# XGBoost classifier with default settings: fit on the training split,
# then predict class labels for the held-out rows.
mod7 = xgb.XGBClassifier().fit(X_train, y_train)
pred7 = mod7.predict(X_test)



In [153]:
# Append the XGBoost test-set metrics as a new row of models_list,
# then display the table.
model_dict = dict(model="XGB")
new_row = pd.DataFrame(
    [{**model_dict, **classificationMetrics(y_test, pred7)}]
)
models_list = pd.concat(objs=[models_list, new_row], ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847
2,RandomForest,0.97076,0.983333,0.936508,0.95935,1.053908,0.963624
3,ADABoost,0.976608,0.968254,0.968254,0.968254,0.843126,0.974868
4,GBM,0.959064,0.951613,0.936508,0.944,1.475471,0.954365
5,SVM,0.935673,1.0,0.825397,0.904348,2.318598,0.912698
6,XGB,0.982456,0.983871,0.968254,0.976,0.632345,0.979497


In [154]:
# True labels (rows) against XGBoost predictions (columns).
pd.crosstab(index=y_test, columns=pred7)

col_0,0.0,1.0
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,107,1
1.0,2,61


In [155]:
## Model Selection

In [156]:
# Rank the models from highest to lowest test accuracy.
models_list.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
6,XGB,0.982456,0.983871,0.968254,0.976,0.632345,0.979497
3,ADABoost,0.976608,0.968254,0.968254,0.968254,0.843126,0.974868
2,RandomForest,0.97076,0.983333,0.936508,0.95935,1.053908,0.963624
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
4,GBM,0.959064,0.951613,0.936508,0.944,1.475471,0.954365
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847
5,SVM,0.935673,1.0,0.825397,0.904348,2.318598,0.912698


In [157]:
# Rank the models from highest to lowest AUC.
models_list.sort_values(by='AUC', ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
6,XGB,0.982456,0.983871,0.968254,0.976,0.632345,0.979497
3,ADABoost,0.976608,0.968254,0.968254,0.968254,0.843126,0.974868
2,RandomForest,0.97076,0.983333,0.936508,0.95935,1.053908,0.963624
0,Logistic Regression,0.964912,0.967213,0.936508,0.951613,1.26469,0.958995
4,GBM,0.959064,0.951613,0.936508,0.944,1.475471,0.954365
1,Decision Tree,0.935673,0.893939,0.936508,0.914729,2.318598,0.935847
5,SVM,0.935673,1.0,0.825397,0.904348,2.318598,0.912698
