In [1]:
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling
import matplotlib.pyplot as pl
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier #used for creating decision tree
from sklearn.tree import plot_tree 
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image
from sklearn import tree  
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score, recall_score

# Load the raw dataset from the CSV next to the notebook.
data = pd.read_csv("breast-cancer.csv")

# Encode the diagnosis labels as integers. LabelEncoder assigns codes in
# sorted label order.
labelEncoder = LabelEncoder()
data["diagnosis"] = labelEncoder.fit_transform(data["diagnosis"])

# Build the working frame AFTER encoding so it is guaranteed to carry the
# numeric diagnosis column. Wrapping `data` in pd.DataFrame(...) first and
# encoding `data` afterwards only works when both objects share storage,
# which depends on the pandas version.
# `drop` returns a new frame, so `data` keeps its `id` column.
dframe = data.drop(columns="id")
display(dframe)

# Report how many fully duplicated rows exist instead of printing one
# True/False line per row.
print("Duplicate rows:", int(dframe.duplicated().sum()))


FileNotFoundError: [Errno 2] No such file or directory: 'breast-cancer.csv'

In [None]:
# Class balance of the target. A bare expression is only rendered when it is
# the last line of a cell, so the counts are displayed explicitly.
display(dframe['diagnosis'].value_counts())

# Pass the column as `x=`: the first positional parameter of countplot is
# `data` in current seaborn releases, so a positional Series is not
# interpreted as the variable to count.
sns.countplot(x=dframe['diagnosis'], label='count of B or M');

In [None]:
# Correlation of every column with the diagnosis column, sorted ascending.
diagnosis_corr = dframe.corr()[['diagnosis']].sort_values(by='diagnosis')

# Tall, narrow figure: the heatmap is a single column with one row per feature.
pl.figure(figsize=(8, 12))
heatmap = sns.heatmap(
    diagnosis_corr,
    annot=True,
    cmap='BrBG',
    vmin=-1,
    vmax=1,
)
heatmap.set_title(
    'Features Correlating with Diagnosis',
    fontdict={'fontsize':18},
    pad=16,
);
    

In [None]:
# Splitting dataset into 80% training and 20% testing

# Features excluded from modelling.
# NOTE(review): presumably chosen from the correlation heatmap - confirm.
dropped_features = [
    'smoothness_se',
    'fractal_dimension_mean',
    'texture_se',
    'symmetry_se',
    'fractal_dimension_se',
]

# Rebind instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
dframe = dframe.drop(columns=dropped_features, errors='ignore')

# Select features and target by name rather than by column position.
X = dframe.drop(columns='diagnosis')
# 1-D target: a single-column DataFrame makes scikit-learn estimators emit
# DataConversionWarning ("A column-vector y was passed...").
y = dframe['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)


In [None]:
# Decision tree using the entropy split criterion; the fixed random_state
# keeps the fitted tree reproducible between runs.
dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
# Train on the 80% split; fit() returns the estimator, which the cell displays.
dt.fit(X_train, y_train)


In [None]:
# Decision-tree predictions for the held-out test features.
y_pred = dt.predict(X_test)


In [None]:
# Feature names in the same column order the tree was trained on.
Names = X.columns.tolist()

# Export the fitted tree as Graphviz DOT source (returned as a string because
# out_file=None).
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=Names,
    class_names=['Benign','Malignant'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Build the graph from the DOT source
graph = pydotplus.graph_from_dot_data(dot_data)

# Render to PNG and display inline
Image(graph.create_png())

In [None]:
# Confusion matrix of the test labels against the current predictions
# (rows: actual classes, columns: predicted classes).
cf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
# Cell captions in row-major order, matching the flattened 2x2 matrix.
group_names = ['True Neg','False Pos','False Neg','True Pos']

# Raw counts and their share of all test samples, both as display strings.
flat_counts = cf_matrix.flatten()
group_counts = [format(value, "0.0f") for value in flat_counts]
group_percentages = [
    format(value, ".2%") for value in flat_counts / np.sum(cf_matrix)
]

# One three-line annotation per cell: caption, count, percentage.
labels = [
    "\n".join(parts)
    for parts in zip(group_names, group_counts, group_percentages)
]
labels = np.asarray(labels).reshape(2,2)

# fmt='' because the annotations are already formatted strings.
ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
ax.set_title('Decision Tree Confusion Matrix');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

# Tick labels - listed in the same order as the matrix rows/columns
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['False','True'])

# Display the visualization of the confusion matrix.
pl.show()

In [None]:
# Evaluation metrics for the decision tree on the test split.
tree_accuracy = accuracy_score(y_test, y_pred)
tree_precision = precision_score(y_test, y_pred)
tree_recall = recall_score(y_test, y_pred)
tree_error = 1 - accuracy_score(y_test, y_pred)
# Specificity is the recall of class 0.
tree_spec = recall_score(y_test, y_pred, pos_label=0)

tree_report = (
    ("Acuracy score: ", tree_accuracy),
    ("Precision score: ", tree_precision),  # not very useful
    ("Recall Score: ", tree_recall),
    ("Error Rate: ", tree_error),
    ('Specificity : ', tree_spec),
)
for caption, value in tree_report:
    print(caption, value)


In [None]:
# Recall (also called sensitivity or true positive rate, TPR) measures how many of the actual "true" (positive) values the model detected.
# To keep false positives to a minimum, we want to increase the precision of the model; to reduce false negatives, we want to increase its recall.

In [None]:
# Title is set first; the bar chart below is then drawn into the current axes.
pl.title('Feature importances obtained from coefficients', size=20)

# Importance of each feature as reported by the fitted tree, keyed by name.
feat_importances = pd.Series(dt.feature_importances_, index=X.columns)

# Horizontal bars for the 12 most important features.
top_features = feat_importances.nlargest(12)
top_features.plot.barh()
pl.show()

In [None]:
import math

# Same 80/20 split and seed as before, bound to lower-case names for the
# KNN and Naive Bayes cells.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Rule of thumb for k: square root of the test-set size, rounded to the
# nearest odd number.
math.sqrt(len(y_test))


In [None]:
# k-nearest-neighbours classifier with k = 9.
knn = KNeighborsClassifier(
    n_neighbors=9,
)
# Train the model; fit() returns the estimator, which the cell displays.
knn.fit(x_train, y_train)


In [None]:
# Mean accuracy of the KNN model on the held-out test split.
knn_accuracy = knn.score(x_test, y_test)

In [None]:
# Confusion matrix for the KNN model.
# `confusion_matrix` is already imported in the notebook's import cell, so it
# is not re-imported here.
# Note: `y_pred` is rebound to the KNN predictions from this point on,
# replacing the decision-tree predictions stored under the same name.
y_pred = knn.predict(x_test)
output = confusion_matrix(y_test, y_pred)
output

In [None]:
# Annotated heatmap of the KNN confusion matrix
# (columns: predicted class, rows: true class).
knn_ax = sns.heatmap(output, annot=True)
knn_ax.set_xlabel('Prediction')
knn_ax.set_ylabel('Truth')

In [None]:
# Evaluation metrics for the KNN model on the test split.
knn_precision = precision_score(y_test, y_pred)
knn_recall = recall_score(y_test, y_pred)
knn_error = 1 - accuracy_score(y_test, y_pred)
# Specificity is the recall of class 0.
knn_spec = recall_score(y_test, y_pred, pos_label=0)

knn_report = (
    ("Accuracy score: ", knn_accuracy),
    ("Precision score: ", knn_precision),  # not very useful
    ("Recall Score: ", knn_recall),
    ("Error Rate: ", knn_error),
    ('Specificity : ', knn_spec),
)
for caption, value in knn_report:
    print(caption, value)

In [None]:
# Gaussian Naive Bayes: fit on the training split and predict test labels.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

gnb = GaussianNB()
gnb.fit(x_train, y_train)

# Predicted class labels for the test split.
pred = gnb.predict(x_test)
print(pred)

In [None]:
# Evaluation metrics for the Naive Bayes model on the test split.
bayesian_accuracy = accuracy_score(y_test, pred)
bayesian_precision = precision_score(y_test, pred)
bayesian_recall = recall_score(y_test, pred)
bayesian_error = 1 - accuracy_score(y_test, pred)
# Specificity is the recall of class 0.
bayesian_spec = recall_score(y_test, pred, pos_label=0)

bayesian_report = (
    ("Acuracy score: ", bayesian_accuracy),
    ("Precision score: ", bayesian_precision),  # not very useful
    ("Recall Score: ", bayesian_recall),
    ("Error Rate: ", bayesian_error),
    ('Specificity : ', bayesian_spec),
)
for caption, value in bayesian_report:
    print(caption, value)

In [None]:
# Confusion matrix for the Naive Bayes predictions; displayed as cell output.
out = confusion_matrix(y_test, pred)
out

In [None]:
# Annotated heatmap of the Naive Bayes confusion matrix
# (columns: predicted class, rows: true class).
nb_ax = sns.heatmap(out, annot=True)
nb_ax.set_xlabel('Prediction')
nb_ax.set_ylabel('Truth')

In [None]:
# Mean accuracy of the Naive Bayes model on the test split.
nb_score = gnb.score(x_test, y_test)
print("Naive Bayes score: ", nb_score)

In [None]:
# Side-by-side comparison of the three classifiers.
# One entry per metric; values are ordered KNN, Naive Bayes, Decision Tree to
# match `model_names` below. A dedicated name is used so the loaded dataset
# bound to `data` is not overwritten by this dict.
metrics_by_model = {
    "Accuracy": [knn_accuracy, bayesian_accuracy, tree_accuracy],
    "Precision": [knn_precision, bayesian_precision, tree_precision],
    "Recall": [knn_recall, bayesian_recall, tree_recall],
    "Specificity": [knn_spec, bayesian_spec, tree_spec],
    "Error Rate": [knn_error, bayesian_error, tree_error],
}
model_names = ["KNN", " Naïve Bayes Classifier", "Decision Tree"]
dataframe = pd.DataFrame(data=metrics_by_model, index=model_names)

# The figure size is passed to the plot call itself: assigning
# rcParams["figure.figsize"] after the chart is drawn does not resize it.
dataframe.plot.bar(
    rot=15,
    title="Evaluation Metrics Comparison in Classification Models",
    color=['orchid','pink','lightblue','aqua','silver'],
    figsize=(15, 10),
)
pl.show(block=True)