In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# --- 1. Load and Initial Cleaning ---

# Read the dataset from the current working directory. A missing file is
# fatal for everything that follows, so stop with a non-zero exit status.
try:
    df = pd.read_csv("data.csv")
except FileNotFoundError:
    print("Error: 'data.csv' not found. Please ensure it is in the same folder.")
    # `exit()` is a convenience injected by the `site` module for interactive
    # sessions; it is not guaranteed to exist and, called without an argument,
    # reports success (status 0). Raise SystemExit explicitly instead.
    raise SystemExit(1) from None

print("--- Initial Data Info ---")
df.info()

# Drop columns that carry no predictive information: the record identifier
# 'id' and 'Unnamed: 32' (the empty column). errors='ignore' keeps the script
# working if the CSV in use does not contain one of these columns.
df.drop(columns=['id', 'Unnamed: 32'], inplace=True, errors='ignore')
print("\n--- Data Cleaning Complete ---")


# --- 2. Feature Engineering for Multi-Class Target ('issue_priority') ---

# 'diagnosis' (M/B) is turned into a three-level priority label:
#   2 -> High   : malignant
#   1 -> Medium : benign, 'area_worst' at or above the benign median
#   0 -> Low    : benign, 'area_worst' below the benign median

# Replace the textual diagnosis with a temporary numeric flag (M=1, B=0).
df['diagnosis_encoded'] = df.pop('diagnosis').map({'M': 1, 'B': 0})

# Threshold used to split benign rows into Medium / Low.
benign_cases = df.loc[df['diagnosis_encoded'].eq(0)].copy()
median_area_worst_benign = benign_cases['area_worst'].median()

is_malignant = df['diagnosis_encoded'] == 1
is_benign = df['diagnosis_encoded'] == 0
is_large = df['area_worst'] >= median_area_worst_benign
is_small = df['area_worst'] < median_area_worst_benign

# Rows matching none of the three rules stay NaN, so the integer cast below
# fails loudly instead of silently assigning them a class.
df['issue_priority'] = np.select(
    [is_malignant, is_benign & is_large, is_benign & is_small],
    [2, 1, 0],
    default=np.nan,
)
df['issue_priority'] = df['issue_priority'].astype(int)

# The temporary flag has served its purpose.
del df['diagnosis_encoded']

print("\n--- New Target Distribution ('issue_priority') ---")
print(df['issue_priority'].value_counts())


# --- 3. Data Splitting and Scaling ---

# Target is the engineered priority label; every remaining column is a feature.
# NOTE(review): 'area_worst' is still among the features although it is the
# column thresholded to separate classes 0 and 1 -- the reported scores may be
# optimistic because of that; confirm this is intended.
y = df['issue_priority']
X = df.drop(columns='issue_priority')

# 80/20 split, stratified so each priority level keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training part only, then apply the
# same transformation to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n--- Data Splitting and Scaling Complete ---")


# --- 4. Train a Random Forest Model ---

# Build and fit in one step; fit() returns the estimator itself.
# class_weight='balanced' is passed so the classes are weighted during fitting.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(X_train_scaled, y_train)

print("\n--- Random Forest Model Training Complete ---")


# --- 5. Evaluate the Model and Print Performance Metrics ---

# Predict on the held-out (scaled) test features.
y_pred = rf_model.predict(X_test_scaled)

# Summary scores plus the per-class breakdown.
class_report = classification_report(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

rule = "======================================================="
print("\n" + rule)
print("           FINAL PERFORMANCE METRICS                 ")
print(rule)
print(f"Overall Accuracy: {accuracy:.4f}")
print(f"F1-Score (Macro Average): {f1_macro:.4f}")
print("\nDetailed Classification Report:")
print(class_report)

--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  