In [23]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer


In [33]:
# 1. Loading and Preprocessing
# Load the dataset
 #Load the dataset
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [27]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [35]:
# Check for missing values
if X.isnull().sum().sum() == 0:
    print("No missing values in the dataset.")
    df.fillna(df.mean(), inplace=True)


No missing values in the dataset.


In [37]:
# Feature scaling
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.drop(columns=['target']))


In [39]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df_scaled, df['target'], test_size=0.2, random_state=42)

In [49]:

# Classification Algorithm Implementation
# 2.1 Logistic Regression
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression:")
print(f"Accuracy = {accuracy_lr:.2f}")
print(classification_report(y_test, y_pred_lr))
print("-" * 40)



Logistic Regression:
Accuracy = 0.97
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

----------------------------------------


In [51]:
#2.2 Decision Tree Classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Classifier:")
print(f"Accuracy = {accuracy_dt:.2f}")
print(classification_report(y_test, y_pred_dt))
print("-" * 40)

Decision Tree Classifier:
Accuracy = 0.95
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

----------------------------------------


In [53]:
# 2.3 Random Forest Classifier
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Classifier:")
print(f"Accuracy = {accuracy_rf:.2f}")
print(classification_report(y_test, y_pred_rf))
print("-" * 40)

Random Forest Classifier:
Accuracy = 0.96
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

----------------------------------------


In [55]:
# 2.4 Support Vector Machine (SVM)
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Support Vector Machine (SVM):")
print(f"Accuracy = {accuracy_svm:.2f}")
print(classification_report(y_test, y_pred_svm))
print("-" * 40)

Support Vector Machine (SVM):
Accuracy = 0.97
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

----------------------------------------


In [57]:
#2.5 k-Nearest Neighbors (k-NN)
k_nn = KNeighborsClassifier()
k_nn.fit(X_train, y_train)
y_pred_knn = k_nn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("k-Nearest Neighbors (k-NN):")
print(f"Accuracy = {accuracy_knn:.2f}")
print(classification_report(y_test, y_pred_knn))
print("-" * 40)

k-Nearest Neighbors (k-NN):
Accuracy = 0.95
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

----------------------------------------


In [59]:
# Model Comparison
results = {
    "Logistic Regression": accuracy_lr,
    "Decision Tree": accuracy_dt,
    "Random Forest": accuracy_rf,
    "SVM": accuracy_svm,
    "k-NN": accuracy_knn
}

results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])
print("Model Performance Comparison:")
print(results_df.sort_values(by='Accuracy', ascending=False))


Model Performance Comparison:
                     Accuracy
Logistic Regression  0.973684
SVM                  0.973684
Random Forest        0.964912
Decision Tree        0.947368
k-NN                 0.947368


In [61]:
# Identify the best and worst models
best_model = results_df['Accuracy'].idxmax()
worst_model = results_df['Accuracy'].idxmin()

print(f"Best performing model: {best_model} with accuracy {results_df.loc[best_model, 'Accuracy']:.2f}")
print(f"Worst performing model: {worst_model} with accuracy {results_df.loc[worst_model, 'Accuracy']:.2f}")


Best performing model: Logistic Regression with accuracy 0.97
Worst performing model: Decision Tree with accuracy 0.95
