In [1]:
# Step 1 : Load required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk

In [16]:
# Step 2 : Load the csv/excel file into pandas dataframe and clean the data
df = pd.read_csv("/content/drive/MyDrive/AIML_datasets/breast-cancer.csv")
df = df.iloc[:, :-1]
df.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364


In [4]:
# Step 3 : Create the Feature Matrix and Target Vector and check the first 5 rows
x = df.iloc[:, 2:].values
y = df.diagnosis.values
print(x[:2])
print(y[:5])


[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01
  5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01
  2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01
  2.750e-01]]
['M' 'M' 'M' 'M' 'M']


In [5]:
# Step 4 : Split the data into training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=500)

print(x_train.shape)  # (455, 30)
print(x_test.shape)   # (114, 30)
print(y_train.shape)
print(y_test.shape)

print((y_train == 'M').sum())
print((y_train == 'B').sum())


(455, 29)
(114, 29)
(455,)
(114,)
177
278


In [10]:
# Baseline model, accuracy, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

baseline_pred = ["B"] * len(y_train)
print("Baseline model of accuracy :", accuracy_score(y_train, baseline_pred))
print(confusion_matrix(y_train, baseline_pred))


Baseline model of accuracy : 0.610989010989011
[[278   0]
 [177   0]]


In [11]:
# Step 5 : Instantiate a Gaussian Naive Bayes model and train the model
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

print("Training accuracy:", nb_model.score(x_train, y_train))
print("Testing accuracy:", nb_model.score(x_test, y_test))


Training accuracy: 0.9340659340659341
Testing accuracy: 0.9736842105263158


In [12]:
# Confusion matrix for training data
print("Training Confusion Matrix:\n", confusion_matrix(y_train, nb_model.predict(x_train)))


Training Confusion Matrix:
 [[270   8]
 [ 22 155]]


In [13]:
# Confusion matrix for test data
print("Testing Confusion Matrix:\n", confusion_matrix(y_test, nb_model.predict(x_test)))


Testing Confusion Matrix:
 [[78  1]
 [ 2 33]]


In [14]:
# Classification reports for training and test data
print("Training Classification Report:\n", classification_report(y_train, nb_model.predict(x_train)))
print("Testing Classification Report:\n", classification_report(y_test, nb_model.predict(x_test)))


Training Classification Report:
               precision    recall  f1-score   support

           B       0.92      0.97      0.95       278
           M       0.95      0.88      0.91       177

    accuracy                           0.93       455
   macro avg       0.94      0.92      0.93       455
weighted avg       0.93      0.93      0.93       455

Testing Classification Report:
               precision    recall  f1-score   support

           B       0.97      0.99      0.98        79
           M       0.97      0.94      0.96        35

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

