# Decision Tree Classifier


In [76]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
import shap
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn import tree 

Load datasets manuseados


In [77]:
df_train = pd.read_csv('../datasets_manuseados/train_radiomics_hipocamp_treated.csv')
df_test = pd.read_csv('../datasets_manuseados/test_radiomics_hipocamp_treated.csv')

Float64/Int64 to Float32/Int32

In [78]:
float_features = df_train.select_dtypes(include='float')
int_features = df_train.select_dtypes(include='int')
df_train[float_features.columns] = df_train[float_features.columns].astype(np.float32)
df_train[int_features.columns] = df_train[int_features.columns].astype(np.int32)
df_train.info()
print("--------------------")
float_features = df_test.select_dtypes(include='float')
int_features = df_test.select_dtypes(include='int')
df_test[float_features.columns] = df_test[float_features.columns].astype(np.float32)
df_test[int_features.columns] = df_test[int_features.columns].astype(np.int32)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 1898 entries, RowId to Transition
dtypes: float32(1896), int32(1), object(1)
memory usage: 2.2+ MB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 1897 entries, RowId to Age
dtypes: float32(1896), int32(1)
memory usage: 741.1 KB


MDI SHAP Permutation

In [79]:
X = df_train.drop(columns=['Transition'])
y = df_train['Transition']
X_test = df_test

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

dt = DecisionTreeClassifier(random_state=2024)
dt.fit(X_train, y_train)

# MDI - Mean Decrease Impurity - feature importance

mdi_importances = dt.feature_importances_
mdi_features = X_train.columns[mdi_importances > mdi_importances.mean()]

# SHAP feature importance
explainer = shap.TreeExplainer(dt)
shap_values = explainer.shap_values(X_train)

# Aggregating SHAP values for binary or multiclass classification
if len(shap_values.shape) == 3:  # Multiclass
    # Mean over samples (axis=0), then aggregate across classes (axis=1)
    shap_importances = np.mean(np.abs(shap_values), axis=0).sum(axis=1)
else:  # Binary
    shap_importances = np.mean(np.abs(shap_values), axis=0)

# Ensure shap_importances aligns with X_train columns
if len(shap_importances) != len(X_train.columns):
    raise ValueError(f"Mismatch: shap_importances ({len(shap_importances)}) and X_train columns ({len(X_train.columns)})")
shap_features = X_train.columns[shap_importances > shap_importances.mean()]


# Permutation feature importance

perm_importances = permutation_importance(dt, X_val, y_val, n_repeats=10, random_state=2024)
perm_features = X_train.columns[perm_importances.importances_mean > perm_importances.importances_mean.mean()]

# Selected features
num_mdi_features = len(mdi_features)
num_shap_features = len(shap_features)
num_perm_features = len(perm_features)
print(f"MDI features: {num_mdi_features}")
print(f"SHAP features: {num_shap_features}")
print(f"Permutation features: {num_perm_features}")

selected_features = set(mdi_features) | set(shap_features) | set(perm_features)
selected_features = list(selected_features)
print(f"Selected features: {selected_features}")

num_selected_features = len(selected_features)
print(f"Number of selected features: {num_selected_features}")

X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]
X_test_selected = X_test[selected_features]

MDI features: 48
SHAP features: 48
Permutation features: 20
Selected features: ['squareroot_glrlm_ShortRunLowGrayLevelEmphasis', 'gradient_firstorder_Range', 'logarithm_glrlm_GrayLevelNonUniformity', 'wavelet-HHH_glcm_ClusterTendency', 'Age', 'wavelet-HLH_firstorder_Maximum', 'wavelet-HLL_firstorder_Range', 'lbp-3D-k_glszm_LowGrayLevelZoneEmphasis', 'original_glszm_LowGrayLevelZoneEmphasis', 'wavelet-HLH_gldm_SmallDependenceEmphasis', 'original_glcm_MaximumProbability', 'lbp-3D-m2_firstorder_MeanAbsoluteDeviation', 'wavelet-LHH_glszm_SmallAreaLowGrayLevelEmphasis', 'wavelet-LHL_glcm_DifferenceEntropy', 'wavelet-HLH_firstorder_Uniformity', 'wavelet-HHH_glcm_Idn', 'gradient_glcm_Idn', 'logarithm_glcm_JointAverage', 'wavelet-LHL_firstorder_RobustMeanAbsoluteDeviation', 'wavelet-HLH_firstorder_InterquartileRange', 'wavelet-LHH_gldm_DependenceNonUniformityNormalized', 'gradient_glszm_ZonePercentage', 'squareroot_ngtdm_Coarseness', 'wavelet-HLL_glcm_ClusterShade', 'wavelet-LLL_glszm_SmallAre

GridSearch


In [80]:
param_grid = {
    'criterion': ['gini', 'entropy'],      # Splitting criterion
    'max_depth': [None, 5, 10, 20, 30],   # Maximum tree depth
    'min_samples_split': [2, 5, 10],      # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],        # Minimum samples per leaf node
    'max_features': [None, 'sqrt', 'log2']  # Maximum features considered for splitting
}

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

best_dt = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

feature_importances = best_dt.feature_importances_
important_features = pd.DataFrame({
    'Feature': X_train_selected.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Save important features
important_features.to_csv('../check_files_hip/selected_important_features.csv', index=False)


Fitting 5 folds for each of 270 candidates, totalling 1350 fits
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s

[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_spli

Model training and predict

In [None]:
best_dt.fit(X_train_selected, y_train)

val_accuracy = best_dt.score(X_val_selected, y_val)
print(f"Validation accuracy: {val_accuracy:.4f}")

y_pred = best_dt.predict(X_val_selected)
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_pred))

predictions = best_dt.predict(X_test_selected)

df_test['Transition'] = predictions

df_test[['RowId', 'Transition']].to_csv('../submissions/dtc.csv', index=False)

Validation accuracy: 0.2951
Classification Report on Validation Set:
              precision    recall  f1-score   support

       AD-AD       0.38      0.50      0.43        12
       CN-CN       0.42      0.26      0.32        19
      CN-MCI       0.00      0.00      0.00         2
      MCI-AD       0.24      0.29      0.26        14
     MCI-MCI       0.23      0.21      0.22        14

    accuracy                           0.30        61
   macro avg       0.25      0.25      0.25        61
weighted avg       0.31      0.30      0.30        61

