In [1]:
import os, pickle
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef


# 1. Load the raw data
# wdbc.data is comma-separated and has no header, so the column names are
# supplied here: an id, the diagnosis label, then the same ten measurements
# repeated three times with the suffixes _mean, _se and _worst.
_MEASUREMENTS = (
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry',
    'fractal_dimension',
)
_SUFFIXES = ('mean', 'se', 'worst')

# The suffix is the outer loop: all *_mean columns come first, then *_se,
# then *_worst, matching the column order of the file.
column_names = ['id', 'diagnosis'] + [
    f'{measurement}_{suffix}'
    for suffix in _SUFFIXES
    for measurement in _MEASUREMENTS
]

cancer_df = pd.read_csv('wdbc.data', names=column_names)

# 2. Preprocessing
# Drop the ID column as it's not a feature
cancer_df = cancer_df.drop(columns='id')

# Encode Diagnosis: M (Malignant) = 0, B (Benign) = 1
# (Matching sklearn's load_breast_cancer convention)
diagnosis_codes = cancer_df['diagnosis'].map({'M': 0, 'B': 1})

# Series.map turns every label that is missing from the dict into NaN.
# Fail here with a clear message instead of writing NaN targets to
# master_data.csv and hitting an opaque error later during model fitting.
unmapped = diagnosis_codes.isna()
if unmapped.any():
    bad_labels = sorted(cancer_df.loc[unmapped, 'diagnosis'].astype(str).unique())
    raise ValueError(
        f"Unexpected diagnosis label(s) in wdbc.data: {bad_labels}; "
        "expected only 'M' or 'B'"
    )
cancer_df['diagnosis'] = diagnosis_codes

# Persist the cleaned, encoded dataset.
cancer_df.to_csv('master_data.csv', index=False)

# Hold out 10% of the rows as the test set (fixed seed for reproducibility)
# and save that hold-out split to its own file.
train_df, test_df = train_test_split(cancer_df, test_size=0.1, random_state=42)
test_df.to_csv('test_data.csv', index=False)


# Features and target: every column except 'diagnosis' is a feature,
# and 'diagnosis' itself is the label.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']


# Scaling: the scaler is fitted on the training features only, and the
# same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
# Seed shared by every estimator that accepts random_state, so repeated
# runs give the same results.
RANDOM_STATE = 42

# Keyed by display name; insertion order is the row order of the
# comparison table built from this dict.
models = {
    # max_iter=2000 gives the solver room to converge
    "Logistic Regression": LogisticRegression(
        max_iter=2000,
        random_state=RANDOM_STATE,
    ),
    # max_depth caps the tree so it does not become too complex
    "Decision Tree": DecisionTreeClassifier(
        max_depth=5,
        random_state=RANDOM_STATE,
    ),
    # Votes among the 7 nearest neighbours
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "Naive Bayes": GaussianNB(),
    # Forest of 150 trees
    "Random Forest": RandomForestClassifier(
        n_estimators=150,
        random_state=RANDOM_STATE,
    ),
    # XGBoost specific parameters
    "XGBoost": XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=4,
        random_state=RANDOM_STATE,
    ),
}

# Train each model, save it, and collect its test-set metrics for the
# README table.
os.makedirs('model', exist_ok=True)
comparison_data = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # One pickle per model; the file name is the display name lower-cased
    # with spaces replaced by underscores.
    with open(f'model/{name.lower().replace(" ", "_")}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    # Hard labels feed the label-based metrics; column 1 of predict_proba
    # feeds the AUC.
    y_pred = model.predict(X_test_scaled)
    y_score = model.predict_proba(X_test_scaled)[:, 1]

    # NOTE(review): precision/recall/F1 are called without pos_label, so
    # they score whichever class scikit-learn treats as positive by
    # default - presumably label 1 (benign under the M=0/B=1 encoding).
    # Confirm that this is the class the README should report on.
    metrics = {
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_score),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred),
    }
    comparison_data.append(metrics)

# The fitted scaler is stored next to the models for the app to load.
with open('model/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# One row per model: printed so it can be copied into README.md, and also
# written to CSV.
df_metrics = pd.DataFrame(comparison_data)
print(df_metrics)
df_metrics.to_csv('model/comparison_metrics.csv', index=False)

         ML Model Name  Accuracy       AUC  Precision  Recall        F1  \
0  Logistic Regression  0.982456  0.995588   0.975610   1.000  0.987654   
1        Decision Tree  0.947368  0.972794   0.974359   0.950  0.962025   
2                  KNN  0.929825  0.989706   0.950000   0.950  0.950000   
3          Naive Bayes  0.964912  0.994118   0.975000   0.975  0.975000   
4        Random Forest  0.964912  0.997059   0.975000   0.975  0.975000   
5              XGBoost  0.964912  0.994118   0.975000   0.975  0.975000   

        MCC  
0  0.958238  
1  0.877101  
2  0.832353  
3  0.916176  
4  0.916176  
5  0.916176  
