In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the processed tabular dataset and separate the features from the target.
#
# The CSV location can be overridden with the BRAIN_TUMOR_CSV environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset or empty, the original local path is used.
DEFAULT_DATA_PATH = r"C:\Users\temisola\DMML-Dubai-UG-Group-2\Datasets\processed\tabular_data\brain_tumor_dataset.csv"
DATA_PATH = os.environ.get("BRAIN_TUMOR_CSV") or DEFAULT_DATA_PATH
df = pd.read_csv(DATA_PATH)

# "Histology" is the prediction target. "Patient_ID" is removed from the
# features as well (presumably a per-row identifier -- confirm against the data).
X = df.drop(columns=["Patient_ID", "Histology"])
y = df["Histology"]

In [4]:
# Partition the feature columns by dtype.
#
# Numeric and boolean columns are fed to the model unchanged; every other
# dtype (object, pandas string dtype, category, ...) is treated as categorical
# and one-hot encoded. Selecting on "number"/"bool" instead of "object" keeps
# text columns out of the passthrough branch even when pandas does not store
# them as object dtype. For a frame holding only numeric and object columns
# the two lists are the same as with an "object"-based split.
NUMERIC_DTYPES = ["number", "bool"]
num_cols = X.select_dtypes(include=NUMERIC_DTYPES).columns.tolist()
cat_cols = X.select_dtypes(exclude=NUMERIC_DTYPES).columns.tolist()

# Transformer order matters: the output columns are the "num" block followed
# by the "cat" block, which is what downstream feature-name lookups rely on.
preprocess = ColumnTransformer([
    ("num", "passthrough", num_cols),
    # handle_unknown="ignore": a category not seen during fit is encoded as
    # all zeros at predict time instead of raising.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

In [None]:
# Define the baseline estimator and the end-to-end pipeline.
# Nothing is fitted in this cell; it only builds the objects.

# 300 trees, a fixed seed for repeatable runs, and all available CPU cores.
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)

# Preprocessing is a pipeline step, so it is fitted on whatever data
# pipe.fit() receives rather than on the full dataset up front.
pipe = Pipeline(steps=[("prep", preprocess), ("clf", rf)])

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target so both parts keep the same class proportions, and seeded so the
# same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the baseline pipeline on the training rows, then predict the held-out rows.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [7]:
# Baseline evaluation on the held-out test split.
# NOTE: in the output recorded with this notebook, accuracy is ~0.25 on four
# classes of nearly equal support, i.e. about what uniform guessing would score.
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Baseline accuracy:", baseline_accuracy)

# Per-class precision / recall / F1
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

# Rows are true classes and columns are predicted classes, both in sorted label order
class_labels = sorted(y.unique())
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred, labels=class_labels))

Baseline accuracy: 0.24675
                 precision    recall  f1-score   support

    Astrocytoma       0.25      0.25      0.25       987
   Glioblastoma       0.26      0.28      0.27      1014
Medulloblastoma       0.23      0.23      0.23       996
     Meningioma       0.25      0.23      0.24      1003

       accuracy                           0.25      4000
      macro avg       0.25      0.25      0.25      4000
   weighted avg       0.25      0.25      0.25      4000

Confusion matrix:
 [[247 264 263 213]
 [248 279 252 235]
 [243 265 228 260]
 [237 274 259 233]]


In [8]:
# Quick hyperparameter search over the random-forest step of the pipeline.
# The grid is deliberately small (3 x 3 x 2 = 18 candidates, each scored with
# 3-fold cross-validation) to keep the runtime reasonable.
# The "clf__" prefix routes each parameter to the pipeline step named "clf".
param_grid = {
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_leaf": [1, 5, 15],
    "clf__max_features": ["sqrt", "log2"],
}

# Only the training split is searched; the test split stays untouched until
# the final score below.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("CV accuracy:", grid.best_score_)

# grid.predict delegates to the best candidate refitted on the full training split
y_pred_opt = grid.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_opt)
print("Tuned test accuracy:", tuned_accuracy)

Best params: {'clf__max_depth': None, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 1}
CV accuracy: 0.24931192385875822
Tuned test accuracy: 0.2495


In [9]:
# Rank the features of the tuned model by the forest's feature_importances_.
best_model = grid.best_estimator_
fitted_prep = best_model.named_steps["prep"]
fitted_forest = best_model.named_steps["clf"]

# Rebuild the column names in the order the ColumnTransformer emits them:
# the passthrough numeric columns first, then the one-hot encoded columns.
cat_names = fitted_prep.named_transformers_["cat"].get_feature_names_out(cat_cols)
feature_names = [*num_cols, *cat_names]
importances = fitted_forest.feature_importances_

# Sort (importance, name) pairs in descending order and keep the first 15;
# equal importances are ordered by name because the pairs compare as tuples.
ranked = sorted(zip(importances, feature_names), reverse=True)
top = ranked[:15]

print("Top features:")
for score, name in top:
    print(f"{name}: {score:.4f}")

Top features:
Survival_Rate: 0.1059
Tumor_Growth_Rate: 0.1056
Tumor_Size: 0.1050
Age: 0.0973
Follow_Up_Required_Yes: 0.0172
Surgery_Performed_Yes: 0.0172
Stage_II: 0.0171
Stage_I: 0.0171
Surgery_Performed_No: 0.0170
Follow_Up_Required_No: 0.0170
Stage_IV: 0.0170
Symptom_2_Headache: 0.0169
Location_Occipital: 0.0169
Location_Temporal: 0.0168
Tumor_Type_Benign: 0.0168
