In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

In [None]:
# Quick look at the raw file. It is read with header=None, so pandas labels
# the columns with integers (0, 1, 2, ...) instead of names.
data = pd.read_csv('wdbc.data', header=None)
print(data.head(n=5))

         0  1      2      3       4       5        6        7       8   \
0    842302  M  17.99  10.38  122.80  1001.0  0.11840  0.27760  0.3001   
1    842517  M  20.57  17.77  132.90  1326.0  0.08474  0.07864  0.0869   
2  84300903  M  19.69  21.25  130.00  1203.0  0.10960  0.15990  0.1974   
3  84348301  M  11.42  20.38   77.58   386.1  0.14250  0.28390  0.2414   
4  84358402  M  20.29  14.34  135.10  1297.0  0.10030  0.13280  0.1980   

        9   ...     22     23      24      25      26      27      28      29  \
0  0.14710  ...  25.38  17.33  184.60  2019.0  0.1622  0.6656  0.7119  0.2654   
1  0.07017  ...  24.99  23.41  158.80  1956.0  0.1238  0.1866  0.2416  0.1860   
2  0.12790  ...  23.57  25.53  152.50  1709.0  0.1444  0.4245  0.4504  0.2430   
3  0.10520  ...  14.91  26.50   98.87   567.7  0.2098  0.8663  0.6869  0.2575   
4  0.10430  ...  22.54  16.67  152.20  1575.0  0.1374  0.2050  0.4000  0.1625   

       30       31  
0  0.4601  0.11890  
1  0.2750  0.08902  
2  0.

In [None]:

# Column labels: the identifier, the diagnosis label, ten "*_mean"
# measurements, then twenty generic placeholders (feat_0 .. feat_19).
columns = [
    'id', 'diagnosis',
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean',
    'symmetry_mean', 'fractal_dimension_mean',
    *(f'feat_{i}' for i in range(20)),
]

# Read the file, applying the labels above as column names.
df = pd.read_csv('wdbc.data', names=columns)

# Features are every column except the identifier and the label.
X = df.drop(columns=['id', 'diagnosis'])
# Encode the diagnosis as integers: M -> 1, B -> 0
y = LabelEncoder().fit_transform(df['diagnosis'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# 1. Define Descriptive Column Names
# The dataset contains 10 base features measured as Mean, SE, and Worst (30 total)
base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dimension'
]

# Construct the full list of 30 feature names: all "_mean" columns first,
# then all "_se" columns, then all "_worst" columns.
feature_names = [
    f"{name}_{stat}"
    for stat in ("mean", "se", "worst")
    for name in base_features
]

# Full column list for the .data file (ID + Diagnosis + 30 Features)
columns = ['id', 'diagnosis'] + feature_names

# 2. Load Data
df = pd.read_csv('wdbc.data', names=columns)

# 3. Preprocess
# We keep X as a DataFrame to preserve the feature names during training
X = df.drop(['id', 'diagnosis'], axis=1)
y = LabelEncoder().fit_transform(df['diagnosis']) # M -> 1, B -> 0

# 4. Split and Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Important: StandardScaler returns a numpy array, so we convert it back
# to a DataFrame to keep the feature names attached. The original row index
# is passed along as well; without it the scaled frames would be renumbered
# from 0 and could no longer be aligned with the rows of `df`.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=feature_names, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=feature_names, index=X_test.index
)



In [None]:
def evaluate_model(name, model, X_test, y_test):
    """Score an already-fitted binary classifier on held-out data.

    Parameters
    ----------
    name : str
        Label stored under the "Model" key of the result.
    model : estimator
        Fitted classifier exposing ``predict`` and either ``predict_proba``
        or ``decision_function``.
    X_test : array-like
        Feature rows to score.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    dict
        Keys "Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC".
    """
    y_pred = model.predict(X_test)

    # AUC needs a continuous score, not hard labels. Prefer the positive-class
    # probability; fall back to the decision function for classifiers that do
    # not expose predict_proba (roc_auc_score accepts either kind of score).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_score),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

# Initialize Models
# The randomized estimators get a fixed random_state so that re-running the
# notebook reproduces the same metrics table.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbor": KNeighborsClassifier(n_neighbors=5),
    "Gaussian Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # use_label_encoder is intentionally not passed: xgboost reports it as an
    # unused parameter and emits a warning when it is supplied.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# Train and Evaluate
# Each model is fitted on the training split and scored on the test split;
# one result dict per model is collected.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    results.append(evaluate_model(name, model, X_test, y_test))

# Display Results
# Build the table once from the list of dicts (one row per model).
perf_df = pd.DataFrame(results)
print(perf_df)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                  Model  Accuracy       AUC  Precision    Recall        F1  \
0   Logistic Regression  0.973684  0.997380   0.976190  0.953488  0.964706   
1         Decision Tree  0.947368  0.943990   0.930233  0.930233  0.930233   
2    K-Nearest Neighbor  0.947368  0.981985   0.930233  0.930233  0.930233   
3  Gaussian Naive Bayes  0.964912  0.997380   0.975610  0.930233  0.952381   
4         Random Forest  0.964912  0.995742   0.975610  0.930233  0.952381   
5               XGBoost  0.956140  0.990829   0.952381  0.930233  0.941176   

        MCC  
0  0.943898  
1  0.887979  
2  0.887979  
3  0.925285  
4  0.925285  
5  0.906379  


In [None]:
# After training, persist three of the models from the dictionary,
# one pickle file per model.
model_files = {
    "Logistic Regression": 'breast_cancer_model_lr.pkl',
    "Decision Tree": 'breast_cancer_model_dt.pkl',
    "XGBoost": 'breast_cancer_model_xg.pkl',
}
for model_key, pickle_path in model_files.items():
    joblib.dump(models[model_key], pickle_path)

# The scaler is saved alongside the models. This stays the last expression
# of the cell, so its return value is what the cell displays.
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [None]:
# Show the column labels currently attached to df (rendered as the cell output).
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave_points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')