In [None]:
#1.Loading and preprocessing

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [21]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [51]:
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [53]:
missing_values = X.isnull().sum().sum()

In [55]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"Missing values in dataset: {missing_values}")

Missing values in dataset: 0


In [None]:
#2.Classification algorithm implementation

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [61]:
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "SVM": SVC(kernel='linear', random_state=42),
    "k-NN": KNeighborsClassifier(n_neighbors=5)
}

In [63]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")

Logistic Regression: 0.9737
Decision Tree: 0.9474
Random Forest: 0.9649
SVM: 0.9561
k-NN: 0.9474


In [None]:
#3.Model comparison

In [65]:
best_model = max(results, key=results.get)
worst_model = min(results, key=results.get)

print(f"Best performing model: {best_model} with accuracy {results[best_model]:.4f}")
print(f"Worstperforming model: {worst_model} with accuracy {results[worst_model]:.4f}")


Best performing model: Logistic Regression with accuracy 0.9737
Worstperforming model: Decision Tree with accuracy 0.9474


In [None]:
##key observations:
#Best performing model:Typically, Random Forest or SVM performs best due to their ability to handle complex patterns.
#Worst performing model: Decisiom Tree or k-NN might perform the worst depending on overfitting or sensitivity to feature scaling.
