In [6]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

# Train a 5-nearest-neighbours classifier on UCI dataset id 15 and persist the
# fitted model together with the preprocessing objects it depends on.

# Fetch dataset
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)
X = breast_cancer_wisconsin_original.data.features
y = breast_cancer_wisconsin_original.data.targets

# Flatten the targets into a 1-D Series and recode the labels (2 → 0, 4 → 1)
y = pd.Series(y.values.ravel())
y = y.replace({2: 0, 4: 1})

# Split Data
# The split happens BEFORE any preprocessing is fitted so that rows held out
# for testing cannot influence the imputation means or the scaling statistics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Handle Missing Values: replace NaNs with the column mean of the TRAINING rows
imputer = SimpleImputer(strategy="mean")
feature_names = X.columns
# Re-wrap the imputed arrays as DataFrames so the scaler is still fitted on
# named columns, as it was before this step was reordered.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=feature_names, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=feature_names, index=X_test.index
)
# Keep `X` as a NaN-free DataFrame for any code that runs after this block.
X = pd.DataFrame(imputer.transform(X), columns=feature_names, index=X.index)

# Feature Scaling (statistics learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Model
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Ensure 'models/' folder exists
os.makedirs("models", exist_ok=True)

# Save Model, Imputer and Scaler
# The imputer is saved as well: new inputs must go through the same
# impute -> scale sequence before being passed to the model.
joblib.dump(model, "models/breast_cancer_knn.pkl")
joblib.dump(imputer, "models/imputer.pkl")
joblib.dump(scaler, "models/scaler.pkl")

['models/scaler.pkl']