<a href="https://colab.research.google.com/github/Aman-Gautam007/Breast-Cancer-Prediction-using-Machine-Learning/blob/main/Breast_Cancer_Detection_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries

In [285]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

## Importing the dataset

In [286]:
dataset = pd.read_csv('breast_cancer.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:,-1].values

#Data Pre-Processing

##Processing Data

In [287]:
dataset.duplicated().sum()

8

In [288]:
dataset.isnull().sum()

Sample code number             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

## Splitting the dataset into the Training set and Test set

In [289]:
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

##Feature Scaling

In [290]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#Logistic Regression

## Training the model on the Training set

In [291]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train,y_train)

## Predicting the Test set results

In [292]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [293]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[84  3]
 [ 3 47]]


In [294]:
(84+47)/(84+47+3+3)

0.9562043795620438

## Computing the accuracy with k-Fold Cross Validation

In [295]:
accuracies_l = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv =10)
print("Accuracy: {:.2f} % ".format(accuracies_l.mean()*100))
print("Standard Deviation : {:.2f} % ".format(accuracies_l.std()*100))

Accuracy: 96.70 % 
Standard Deviation : 1.97 % 


#Support Vector Machine

##Training the model on the Training set

In [296]:
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

##Predicting the Test set results

In [297]:
y_pred = classifier.predict(X_test)

##Making the Confusion Matrix

In [298]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[83  4]
 [ 2 48]]


In [299]:
(83+48)/(83+48+2+4)

0.9562043795620438

##Computing the accuracy with k-Fold Cross Validation

In [300]:
accuracies_svm = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv =10)
print("Accuracy: {:.2f} % ".format(accuracies_svm.mean()*100))
print("Standard Deviation : {:.2f} % ".format(accuracies_svm.std()*100))

Accuracy: 97.07 % 
Standard Deviation : 2.19 % 


#Kernel Support Vector Machine

##Training the model on the Training set

In [301]:
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

##Predicting the Test set results

In [302]:
y_pred = classifier.predict(X_test)

##Making the Confusion Matrix

In [303]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[82  5]
 [ 1 49]]


In [304]:
(82+49)/(1+5+82+49)

0.9562043795620438

##Computing the accuracy with k-Fold Cross Validation

In [305]:
accuracies_ksvm = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv =10)
print("Accuracy: {:.2f} % ".format(accuracies_ksvm.mean()*100))
print("Standard Deviation : {:.2f} % ".format(accuracies_ksvm.std()*100))

Accuracy: 96.89 % 
Standard Deviation : 2.17 % 


#K-Nearest Neighbours

##Training the model on the Training set

In [306]:
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

##Predicting the Test set results

In [307]:
y_pred = classifier.predict(X_test)

##Making the Confusion Matrix

In [308]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[83  4]
 [ 2 48]]


In [309]:
(83+48)/(83+48+2+4)

0.9562043795620438

##Computing the accuracy with k-Fold Cross Validation

In [310]:
accuracies_knn = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv =10)
print("Accuracy: {:.2f} % ".format(accuracies_knn.mean()*100))
print("Standard Deviation : {:.2f} % ".format(accuracies_knn.std()*100))

Accuracy: 96.70 % 
Standard Deviation : 1.79 % 


#Naive Bayes

##Training the model on the Training set

In [311]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

##Predicting the Test set results

In [312]:
y_pred = classifier.predict(X_test)

##Making the Confusion Matrix

In [313]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[80  7]
 [ 0 50]]


In [314]:
(80+50)/(80+50+0+7)

0.948905109489051

##Computing the accuracy with k-Fold Cross Validation

In [315]:
accuracies_nb = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv =10)
print("Accuracy: {:.2f} % ".format(accuracies_nb.mean()*100))
print("Standard Deviation : {:.2f} % ".format(accuracies_nb.std()*100))

Accuracy: 96.52 % 
Standard Deviation : 2.24 % 


#Decision Tree Classification

##Training the model on the Training set

In [316]:
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

##Predicting the Test set results

In [317]:
y_pred = classifier.predict(X_test)

##Making the Confusion Matrix

In [318]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[84  3]
 [ 3 47]]


In [319]:
(84+47)/(84+47+3+3)

0.9562043795620438

##Computing the accuracy with k-Fold Cross Validation

In [320]:
accuracies_dt = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv =10)
print("Accuracy: {:.2f} % ".format(accuracies_dt.mean()*100))
print("Standard Deviation : {:.2f} % ".format(accuracies_dt.std()*100))

Accuracy: 94.33 % 
Standard Deviation : 2.65 % 


#Random Forest Classification


##Training the model on the Training set

In [321]:
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

##Predicting the Test set results

In [322]:
y_pred = classifier.predict(X_test)

##Making the Confusion Matrix

In [323]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[83  4]
 [ 3 47]]


In [324]:
(84+47)/(84+47+4+3)

0.9492753623188406

##Computing the accuracy with k-Fold Cross Validation

In [325]:
accuracies_rf = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv =10)
print("Accuracy: {:.2f} % ".format(accuracies_rf.mean()*100))
print("Standard Deviation : {:.2f} % ".format(accuracies_rf.std()*100))

Accuracy: 96.34 % 
Standard Deviation : 2.16 % 


#Conclusion

The most accurate classification model for our dataset is the Support Vector Machine.
With the accuracy of 97.07%