In [1]:
'''
Perform the following operations using Python on Breast Cancer data sets  
i. Data cleaning(Remove NA, ?, Negative values etc.)  
j. Error correcting(Outlier detection and removal)  
k. Data transformation   
l. Build Data model using regression and Naïve Bayes methods and compare 
accuracy of benign and malignant tumors in Breast Cancer Dataset.  
'''

'\nPerform the following operations using Python on Breast Cancer data sets  \ni. Data cleaning(Remove NA, ?, Negative values etc.)  \nj. Error correcting(Outlier detection and removal)  \nk. Data transformation   \nl. Build Data model using regression and Naïve Bayes methods and compare \naccuracy of benign and malignant tumors in Breast Cancer Dataset.  \n'

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [18]:
data = pd.read_csv('DSBDALExam DataSets\BreastCancer\BreastCancerWc.csv')

In [19]:
data.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [20]:
# Add proper column names
data.columns = [
    'ID', 'Clump_Thickness', 'Uniformity_Cell_Size', 'Uniformity_Cell_Shape',
    'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei',
    'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class'
]

In [21]:
# Data Cleaning

# Replace '?' with NaN and convert to numeric
data['Bare_Nuclei'] = pd.to_numeric(data['Bare_Nuclei'], errors='coerce')

# Drop rows with any missing values
data = data.dropna()

# Remove any negative values 
data = data[(data >= 0).all(axis=1)]

In [23]:
# Error Correction

# Z-score method to remove outliers
from scipy.stats import zscore
z_scores = np.abs(zscore(data.drop(columns=['Class'])))
data = data[(z_scores < 3).all(axis=1)]

In [24]:
# Data Transformation
X = data.drop('Class', axis=1)
y = data['Class']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [26]:
# Modell Building and Comparison

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_model_pred = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_model_pred)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
nb_acc = accuracy_score(y_test, nb_pred)

In [28]:
# Results
print("Logistic Regression Accuracy:", round(lr_acc * 100, 2), "%")
print("Naive Bayes Accuracy:", round(nb_acc * 100, 2), "%")

print("\nClassification Report (Logistic Regression):\n", classification_report(y_test, lr_model_pred))
print("\nClassification Report (Naive Bayes):\n", classification_report(y_test, nb_pred))

Logistic Regression Accuracy: 94.29 %
Naive Bayes Accuracy: 93.14 %

Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           2       0.97      0.95      0.96       130
           4       0.87      0.91      0.89        45

    accuracy                           0.94       175
   macro avg       0.92      0.93      0.93       175
weighted avg       0.94      0.94      0.94       175


Classification Report (Naive Bayes):
               precision    recall  f1-score   support

           2       0.99      0.92      0.95       130
           4       0.80      0.98      0.88        45

    accuracy                           0.93       175
   macro avg       0.90      0.95      0.92       175
weighted avg       0.94      0.93      0.93       175

