# Decision Tree Classifier


In [14]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
import shap
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn import tree 

Load datasets manuseados


In [15]:
df_train = pd.read_csv('../datasets_manuseados/train_radiomics_hipocamp_treated.csv')
df_test = pd.read_csv('../datasets_manuseados/test_radiomics_hipocamp_treated.csv')

Float64/Int64 to Float32/Int32

In [16]:
float_features = df_train.select_dtypes(include='float')
int_features = df_train.select_dtypes(include='int')
df_train[float_features.columns] = df_train[float_features.columns].astype(np.float32)
df_train[int_features.columns] = df_train[int_features.columns].astype(np.int32)
df_train.info()
print("--------------------")
float_features = df_test.select_dtypes(include='float')
int_features = df_test.select_dtypes(include='int')
df_test[float_features.columns] = df_test[float_features.columns].astype(np.float32)
df_test[int_features.columns] = df_test[int_features.columns].astype(np.int32)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 1898 entries, RowId to Transition
dtypes: float32(1896), int32(1), object(1)
memory usage: 2.2+ MB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 1897 entries, RowId to Age
dtypes: float32(1896), int32(1)
memory usage: 741.1 KB


MDI SHAP Permutation

In [17]:
X = df_train.drop(columns=['Transition'])
y = df_train['Transition']
X_test = df_test

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2024)

dt = DecisionTreeClassifier(random_state=2024)
dt.fit(X_train, y_train)

# MDI - Mean Decrease Impurity - feature importance

mdi_importances = dt.feature_importances_
mdi_features = X_train.columns[mdi_importances > mdi_importances.mean()]

# SHAP feature importance
explainer = shap.TreeExplainer(dt)
shap_values = explainer.shap_values(X_train)

# Aggregating SHAP values for binary or multiclass classification
if len(shap_values.shape) == 3:  # Multiclass
    # Mean over samples (axis=0), then aggregate across classes (axis=1)
    shap_importances = np.mean(np.abs(shap_values), axis=0).sum(axis=1)
else:  # Binary
    shap_importances = np.mean(np.abs(shap_values), axis=0)

# Ensure shap_importances aligns with X_train columns
if len(shap_importances) != len(X_train.columns):
    raise ValueError(f"Mismatch: shap_importances ({len(shap_importances)}) and X_train columns ({len(X_train.columns)})")
shap_features = X_train.columns[shap_importances > 0]


# Permutation feature importance

perm_importances = permutation_importance(dt, X_val, y_val, n_repeats=10, random_state=2024)
perm_features = X_train.columns[perm_importances.importances_mean > perm_importances.importances_mean.mean()]

# Selected features
num_mdi_features = len(mdi_features)
num_shap_features = len(shap_features)
num_perm_features = len(perm_features)
print(f"MDI features: {num_mdi_features}")
print(f"SHAP features: {num_shap_features}")
print(f"Permutation features: {num_perm_features}")

selected_features = set(mdi_features) | set(shap_features) | set(perm_features)
selected_features = list(selected_features)
print(f"Selected features: {selected_features}")

num_selected_features = len(selected_features)
print(f"Number of selected features: {num_selected_features}")

X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]
X_test_selected = X_test[selected_features]
X_train_selected.shape, X_val_selected.shape, X_test_selected.shape

MDI features: 44
SHAP features: 44
Permutation features: 16
Selected features: ['wavelet-LLH_glrlm_RunLengthNonUniformityNormalized', 'log-sigma-5-0-mm-3D_glszm_SmallAreaLowGrayLevelEmphasis', 'log-sigma-4-0-mm-3D_ngtdm_Complexity', 'wavelet-LLL_glszm_GrayLevelNonUniformity', 'square_firstorder_Kurtosis', 'log-sigma-3-0-mm-3D_glszm_SmallAreaLowGrayLevelEmphasis', 'wavelet-HLH_glrlm_LowGrayLevelRunEmphasis', 'log-sigma-4-0-mm-3D_gldm_SmallDependenceLowGrayLevelEmphasis', 'wavelet-HLL_gldm_DependenceVariance', 'gradient_glcm_SumSquares', 'Age', 'wavelet-LLH_glszm_HighGrayLevelZoneEmphasis', 'wavelet-HLH_firstorder_Kurtosis', 'log-sigma-5-0-mm-3D_firstorder_Range', 'squareroot_gldm_LargeDependenceEmphasis', 'wavelet-HHL_ngtdm_Busyness', 'log-sigma-1-0-mm-3D_glrlm_RunLengthNonUniformityNormalized', 'log-sigma-5-0-mm-3D_gldm_GrayLevelNonUniformity', 'squareroot_firstorder_Range', 'gradient_glrlm_ShortRunEmphasis', 'square_glcm_JointEnergy', 'squareroot_glcm_JointAverage', 'wavelet-LLH_gldm_

((244, 44), (61, 44), (100, 44))

GridSearch


In [18]:
dt = DecisionTreeClassifier(random_state=2024)

param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [5,10, 15],
    'min_samples_leaf': [5,10,15],
    'max_leaf_nodes': [5,10,15],
}

gridsearchDT = GridSearchCV(dt, param_grid, cv=3, refit=True,verbose=3, scoring='f1_macro')
gridsearchDT.fit(X_train, y_train)

gridpredictions = gridsearchDT.predict(X_val)

print("Best estimator found by grid search:", gridsearchDT.best_estimator_)

print(classification_report(y_val, gridpredictions))



Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV 1/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=5;, score=0.264 total time=   0.2s
[CV 2/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=5;, score=0.370 total time=   0.1s
[CV 3/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=5;, score=0.281 total time=   0.1s
[CV 1/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=10;, score=0.264 total time=   0.1s
[CV 2/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=10;, score=0.370 total time=   0.1s
[CV 3/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=10;, score=0.281 total time=   0.2s
[CV 1/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=15;, score=0.264 total time=   0.1s
[CV 2/3] END max_depth=None, max_leaf_nodes=5, min_samples_leaf=5, min_samples_split=15;, sco

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model training and predict

In [19]:
dt = DecisionTreeClassifier(max_depth=5, max_leaf_nodes= 15, min_samples_leaf=5, min_samples_split=15, random_state=2024)

dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

df_test['Transition'] = predictions
df_test[['RowId', 'Transition']].to_csv('../submissions/dtc.csv', index=False)

  df_test['Transition'] = predictions
