In [2]:
import pandas as pd
import numpy as np

# Step 1: Load the dataset
# The CSV has no header row. '?' is the file's missing-value marker, so it is
# parsed as NaN directly at read time instead of being patched afterwards.
df = pd.read_csv("dataset/breast_cancer_data.csv", header=None, na_values="?")

# Step 2: Assign proper column names
# Plain assignment raises if the file does not have exactly these 11 columns.
df.columns = [
    "Sample code number", "Clump Thickness", "Uniformity of Cell Size",
    "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size",
    "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"
]

# Step 3: Drop the ID column (an identifier, not a feature)
df = df.drop(columns="Sample code number")

# Step 4: Convert feature columns (everything except the target label) to float
cols_to_convert = df.columns.drop("Class")
df = df.astype({col: float for col in cols_to_convert})

# Step 5: Fill missing feature values with the column median
# The medians are indexed by feature name only, so fillna never touches the
# "Class" column: a missing label must be caught below, not imputed.
# NOTE(review): the medians are computed over every row, so a train/test split
# done afterwards uses statistics that include the held-out rows.
feature_medians = df[cols_to_convert].median()
df = df.fillna(feature_medians)

# Step 6: Convert target labels (2 → 0, 4 → 1)
# Series.map yields NaN for any value missing from the mapping; fail loudly
# instead of passing NaN labels on to later cells.
class_encoded = df["Class"].map({2.0: 0, 4.0: 1})
unmapped = class_encoded.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, "Class"].unique().tolist()
    raise ValueError(f"'Class' contains values other than 2 and 4: {unexpected}")
df["Class"] = class_encoded

# Final check
print(" Dataset is ready!")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()


 Dataset is ready!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Clump Thickness              699 non-null    float64
 1   Uniformity of Cell Size      699 non-null    float64
 2   Uniformity of Cell Shape     699 non-null    float64
 3   Marginal Adhesion            699 non-null    float64
 4   Single Epithelial Cell Size  699 non-null    float64
 5   Bare Nuclei                  699 non-null    float64
 6   Bland Chromatin              699 non-null    float64
 7   Normal Nucleoli              699 non-null    float64
 8   Mitoses                      699 non-null    float64
 9   Class                        699 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 54.7 KB
None


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import os

# 1. Separate the label from the inputs
# "Class" is the target; every other column of df is used as a feature.
y = df["Class"]
X = df.drop(columns="Class")

# 2. Hold out 20% of the rows for testing (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 3. Fit a random forest on the training portion
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 4. Score the fitted model on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f" Model Accuracy: {accuracy * 100:.2f}%")

# 5. Persist the fitted model (the output directory is created if missing)
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/breast_cancer_model.pkl")
print(" Model saved as models/breast_cancer_model.pkl")


 Model Accuracy: 96.43%
 Model saved as models/breast_cancer_model.pkl
