<a href="https://colab.research.google.com/github/AbdalrahmanAliElnashar/Classification-Project/blob/main/02-Breast_Cancer_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Breast Cancer Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler



In [None]:
# Load the raw table; 'data.csv' is resolved relative to the working directory.
data = pd.read_csv('data.csv')
# Data Exploration
print(data.head(20))
print(data.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
data.info()
print(data.describe())
print(data.columns)
# drop id & unnamed column
# Re-binding instead of inplace=True keeps the step a plain expression and
# avoids mutating a frame that was already displayed above.
data = data.drop(columns=['id', 'Unnamed: 32'])
print('data shape: {}'.format(data.shape))


          id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0     842302         M        17.99         10.38          122.80     1001.0   
1     842517         M        20.57         17.77          132.90     1326.0   
2   84300903         M        19.69         21.25          130.00     1203.0   
3   84348301         M        11.42         20.38           77.58      386.1   
4   84358402         M        20.29         14.34          135.10     1297.0   
5     843786         M        12.45         15.70           82.57      477.1   
6     844359         M        18.25         19.98          119.60     1040.0   
7   84458202         M        13.71         20.83           90.20      577.9   
8     844981         M        13.00         21.82           87.50      519.8   
9   84501001         M        12.46         24.04           83.97      475.9   
10    845636         M        16.02         23.24          102.70      797.8   
11  84610002         M        15.78     

In [None]:
# Split the columns into the three measurement groups.
# After 'id' and 'Unnamed: 32' are dropped, column 0 is 'diagnosis' and the
# remaining columns are assumed to come in three consecutive groups of ten
# (mean, se, worst) -- TODO confirm against data.columns printed above.
mean_features = list(data.columns[1:11])
# Slice end is exclusive: [11:21] is needed to include the tenth se column
# (the previous [11:20] silently dropped 'fractal_dimension_se').
se_features = list(data.columns[11:21])
worst_features = list(data.columns[21:31])
print(mean_features)
print('-----------------------------------------')
print(se_features)
print('-----------------------------------------')
print(worst_features)
print('-----------------------------------------')


In [None]:
# mapping target feature: 'M' -> 1, 'B' -> 0
# Guard on the dtype so that re-running this cell is harmless: applying the
# map a second time to the already-encoded 0/1 column would match no key and
# turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['diagnosis']):
    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})
print(data['diagnosis'].head())


In [None]:
# diagnosis distribution
fig = plt.figure(figsize=(8, 8))
# Pass the column by keyword: a positional Series is deprecated in older
# seaborn releases and is interpreted as `data=` (not `x=`) in newer ones,
# which no longer yields one bar per class.
sns.countplot(x='diagnosis', data=data)
plt.show()


In [None]:
# Multicollinearity check -- mean features
# Compute the pairwise correlation matrix once and reuse it for both the
# printed table and the annotated heatmap.
mean_corr = data[mean_features].corr()
print('Correlation')
print(mean_corr)
fig = plt.figure(figsize=(10, 8))
sns.heatmap(mean_corr, annot=True, cmap="RdBu")

In [None]:
# Mean features to leave out of the model (chosen after the correlation
# check on the mean group).
excluded_mean = ['area_mean', 'perimeter_mean', 'concave points_mean', 'concavity_mean']
mean_columns = data.columns[1:11]
mean_features = list(mean_columns.drop(excluded_mean))
print(list(data[mean_features].columns))
print(list(data.columns))
# Heatmap of the remaining mean features.
fig = plt.figure(figsize=(10, 8))
sns.heatmap(data[mean_features].corr(), annot=True, cmap="RdBu")


In [None]:
# Standard error [se] features
# Compute the correlation matrix once; print it and draw it as a heatmap.
se_corr = data[se_features].corr()
print('Correlation')
print(se_corr)
fig = plt.figure(figsize=(10, 8))
sns.heatmap(se_corr, annot=True, cmap="RdBu")

In [None]:
# Reduce the se group by removing the columns listed below.
# The slice must be [11:21]: the end index is exclusive, so the previous
# [11:20] also removed the tenth se column ('fractal_dimension_se') even
# though it is not in the drop list. Assumes the se group occupies columns
# 11..20 of `data` -- TODO confirm against data.columns.
se_features = list(data.columns[11:21].drop(['area_se', 'perimeter_se', 'concave points_se', 'concavity_se']))
print(list(data[se_features].columns))
# Heatmap of the remaining se features.
fig = plt.figure(figsize=(10, 8))
sns.heatmap(data[se_features].corr(), annot=True, cmap="RdBu")

In [None]:
# Worst features
# Compute the correlation matrix once; print it and draw it as a heatmap.
worst_corr = data[worst_features].corr()
print('Correlation')
print(worst_corr)
fig = plt.figure(figsize=(10, 8))
sns.heatmap(worst_corr, annot=True, cmap="RdBu")

In [None]:
# Reduce the worst group by removing the columns listed below.
worst_features = list(data.columns[21:31].drop(['area_worst', 'perimeter_worst', 'concave points_worst', 'concavity_worst', 'fractal_dimension_worst']))
# Report the list that was just built (this previously printed se_features,
# a copy-paste slip from the se cell).
print(list(data[worst_features].columns))
# Heatmap of the remaining worst features.
fig = plt.figure(figsize=(10, 8))
sns.heatmap(data[worst_features].corr(), annot=True, cmap="RdBu")

In [None]:
# Features
# Show the three selected feature lists and the columns currently in `data`.
feature_groups = [mean_features, se_features, worst_features]
print(feature_groups)
print(list(data.columns))
# Rebuild `data` so it holds only the selected features followed by the
# target column, in that order.
selected_columns = mean_features + se_features + worst_features + ['diagnosis']
data = data[selected_columns]
print(list(data.columns))
print('Data shape {}'.format(data.shape))
print('-----------------------------------------------------')


In [None]:
# Separate the frame into the feature matrix (x) and the target vector (y),
# both as NumPy arrays.
target_column = 'diagnosis'
y = data[target_column].values
x = data.drop(columns=[target_column]).values
print(x.shape)
print(y.shape)


In [None]:
# train_test split: hold out 30% of the rows; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
# Standardization
# Fit the scaler on the training data only, then apply those same statistics
# to the test data. Calling fit_transform on the test set would re-estimate
# the mean/std from the test rows, so train and test would be scaled
# differently and test information would leak into preprocessing.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


In [None]:
# KNN model
# Sweep k over 1..14 and record the misclassification rate on the training
# and test sets for each value.
k_values = range(1, 15)
error1 = []  # training error per k
error2 = []  # test error per k
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    y_predicted1 = knn.predict(x_train)
    y_predicted2 = knn.predict(x_test)
    error1.append(np.mean(y_train != y_predicted1))
    error2.append(np.mean(y_test != y_predicted2))

# Plot both error curves against k.
plt.figure(figsize=(10, 6))
for curve, curve_label in ((error1, "train"), (error2, "test")):
    plt.plot(k_values, curve, label=curve_label)
plt.xlabel('k Value')
plt.ylabel('Error')
plt.legend()
plt.show()
# The best k value = 5


In [None]:
# Train the KNN classifier with k = 5 and evaluate it on the test set.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_predicted = knn.predict(x_test)
# Actual labels followed by predicted labels.
print(y_test)
print(y_predicted)
# Confusion matrix, accuracy in percent, and the per-class report.
knn_confusion = metrics.confusion_matrix(y_test, y_predicted)
knn_accuracy = metrics.accuracy_score(y_test, y_predicted)
print(knn_confusion)
print(knn_accuracy*100)
print(metrics.classification_report(y_test, y_predicted))


In [None]:
# Confusion matrix and F1 score for the KNN predictions in `y_predicted`.
# Note: with average='micro' on a single-label problem the F1 value equals
# plain accuracy.
cm_knn = metrics.confusion_matrix(y_test, y_predicted)
f1_score_knn = metrics.f1_score(y_test, y_predicted, average='micro')
heatmap_options = dict(annot=True, fmt=".0f", linewidths=3, square=True, cmap='Reds', color="#cd1076")
sns.heatmap(cm_knn, **heatmap_options)
plt.xlabel('predicted label')
plt.ylabel('actual label')
plt.title(f'F1 Score [KNN Algorithm]: {f1_score_knn:.2f}', size=14, color='red')
plt.show()
print('KNN F1 Score {}'.format(f1_score_knn))


In [None]:
# Naive Bayes Algorithm
# Fit a Gaussian Naive Bayes classifier; fit() returns the estimator itself,
# so `model` and `gnb` refer to the same fitted object.
gnb = GaussianNB()
model = gnb.fit(x_train, y_train)
y_predicted = model.predict(x_test)
# Actual labels followed by predicted labels.
print(y_test)
print(y_predicted)
# Confusion matrix and F1 score (average='micro' equals accuracy for a
# single-label problem).
cm_mnb = metrics.confusion_matrix(y_test, y_predicted)
f1_score_NB = metrics.f1_score(y_test, y_predicted, average='micro')
heatmap_options = dict(annot=True, fmt=".0f", linewidths=3, square=True, cmap='Reds', color="#cd1076")
sns.heatmap(cm_mnb, **heatmap_options)
plt.xlabel('predicted label')
plt.ylabel('actual label')
plt.title(f'F1 Score [Naive Bayes Algorithm]: {f1_score_NB:.2f}', size=14, color='red')
plt.show()
print('Naive Bayes F1 Score {}'.format(f1_score_NB))



In [None]:
# Logistic Regression
# Fit with default hyper-parameters; fit() returns the estimator itself, so
# `model` and `log` refer to the same fitted object.
log = LogisticRegression()
model = log.fit(x_train, y_train)
y_predicted = model.predict(x_test)
# Accuracy (in percent) and the per-class report.
log_accuracy = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy of Logistic Regression Algorithm: ', log_accuracy*100)
print(metrics.classification_report(y_test, y_predicted))

# Confusion matrix and F1 score (average='micro' equals accuracy for a
# single-label problem). The matrix keeps the `cm_mnb` name used by the
# original cell.
cm_mnb = metrics.confusion_matrix(y_test, y_predicted)
f1_score_log = metrics.f1_score(y_test, y_predicted, average='micro')
heatmap_options = dict(annot=True, fmt=".0f", linewidths=3, square=True, cmap='Reds', color="#cd1076")
sns.heatmap(cm_mnb, **heatmap_options)
plt.xlabel('predicted label')
plt.ylabel('actual label')
plt.title(f'F1 Score [Logistic Regression Algorithm]: {f1_score_log:.2f}', size=14, color='red')
plt.show()
print('Logistic Regression F1 Score {}'.format(f1_score_log))


In [None]:
# comparison between Algorithms
# Horizontal bar chart: one bar per model, bar length is the F1 score that
# was computed for that model in the cells above.
model_name = ['LogisticRegression', 'KNN', 'Naive Bayes']
model_acc = [f1_score_log, f1_score_knn, f1_score_NB]
plt.figure(figsize=(12, 6))
sns.barplot(x=model_acc, y=model_name, palette='magma')
plt.show()
