In [1]:
# ml/train_model.py or inside train_model.ipynb

from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split


In [3]:
# Load dataset.
# The CSV is re-read on every execution, so re-running this cell always starts
# from the raw file rather than from a frame a previous run already modified.
df = pd.read_csv("breast_cancer.csv")

# Optional: drop the ID column if it exists (it is an identifier, not a feature).
if 'id' in df.columns:
    df = df.drop(columns=['id'])

# Build the feature matrix X and label vector y.
# Two label layouts are accepted:
#   * 'diagnosis' -- either 'M'/'B' strings (encoded here as M -> 1, B -> 0)
#                    or values that are already numeric (used as-is);
#   * 'target'    -- used as-is.
# NOTE(review): the 'target' branch does not re-encode anything. If that CSV
# uses the opposite convention (e.g. 1 = benign), the saved model's class 1
# will mean something different from the 'diagnosis' branch -- confirm which
# encoding the consumer of the model expects.
if 'diagnosis' in df.columns:
    # Only map when the column really holds 'M'/'B' strings. Series.map turns
    # every value missing from the dict into NaN, so mapping an already-numeric
    # column would silently wipe out all labels.
    if df['diagnosis'].isin(['M', 'B']).any():
        df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
        if df['diagnosis'].isna().any():
            raise ValueError(
                "Column 'diagnosis' contains values other than 'M' and 'B'; "
                "clean the labels before training."
            )
    y = df['diagnosis']
    X = df.drop(columns=['diagnosis'])
elif 'target' in df.columns:
    y = df['target']
    X = df.drop(columns=['target'])
else:
    # Fail here with a clear message instead of leaving X / y undefined (or,
    # worse, left over from an earlier kernel run) for the training cell.
    raise ValueError(
        "Expected a 'diagnosis' or 'target' label column in breast_cancer.csv; "
        f"found columns: {list(df.columns)}"
    )



In [5]:
# Train-test split: hold out 20% of the rows for evaluation. The fixed
# random_state makes the split, and therefore the metrics below, repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model (random_state fixed so the fitted forest is repeatable too).
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows only.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model.
# The path is relative to the notebook's working directory. Create the target
# directory first: joblib.dump does not create missing parents, and failing
# here would throw away a model that has already been trained.
model_path = Path("../backend/app/model/model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
print("✅ Model saved to backend/app/model/model.joblib")


Accuracy: 0.9649122807017544
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

✅ Model saved to backend/app/model/model.joblib
