# XGBoost com RFECV

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder

Tratamento da Random Seed

In [2]:
def set_seed(seed: int):
    """Seed the global random number generators used by this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module and to
            NumPy's global generator, and exported (as text) through the
            ``PYTHONHASHSEED`` environment variable.
    """
    # Python's generator first, then NumPy's global generator
    # (the original note says this is the generator scikit-learn uses).
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # Operating-system level: export the seed as an environment variable.
    os.environ["PYTHONHASHSEED"] = str(seed)


# Fix the seed once for the whole notebook
set_seed(2023)

Load CSVs

In [3]:
# Locations of the pre-processed ("treated") train and test CSV files,
# relative to the notebook's working directory.
train_csv_path = '../../datasets_manuseados/train_radiomics_hipocamp_treated.csv'
test_csv_path = '../../datasets_manuseados/test_radiomics_hipocamp_treated.csv'

# Read the train file first, then the test file, each into its own DataFrame.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

Float64/Int64 to Float32/Int32

In [None]:
# Downcast the numeric columns of both frames in place: columns selected as 'float'
# become float32 and columns selected as 'int' become int32.
# The train frame is handled first, then a separator line is printed, then the test frame.
for frame_position, frame in enumerate((df_train, df_test)):
    if frame_position > 0:
        print("--------------------")
    float_features = frame.select_dtypes(include='float')
    int_features = frame.select_dtypes(include='int')
    float_columns = float_features.columns
    int_columns = int_features.columns
    frame[float_columns] = frame[float_columns].astype(np.float32)
    frame[int_columns] = frame[int_columns].astype(np.int32)
    # Print the resulting dtypes and memory usage so the conversion can be checked.
    frame.info()

Train Test Split

In [5]:
# Separate the target column from the predictors of the labelled set.
y = df_train['Transition']
X = df_train.drop(columns=['Transition'])

# The test frame is used as the test feature matrix as-is.
X_test = df_test

# Hold out 20% of the labelled rows for validation, stratified on the target
# so both parts keep the same class proportions; the split is seeded.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
    stratify=y,
)
X_train, X_val, y_train, y_val = split_parts

### Feature Selection using RFECV and PCA

In [None]:
# Estimator used by RFECV to rank and recursively eliminate features.
model = RandomForestClassifier(random_state=2023)

### 1. RFECV - Recursive Feature Elimination with Cross-Validation
# Fit the selector on the training split only. X_val is used afterwards to evaluate
# the tuned model, so letting its rows (and labels) take part in the feature selection
# would leak validation information and make the validation report optimistic.
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(3), scoring='f1_macro', n_jobs=-1, verbose=2)
rfecv.fit(X_train, y_train)

# Boolean support mask -> names of the columns that were kept.
rfecv_features = X_train.columns[rfecv.support_]
print(f"Number of selected features by RFECV: {len(rfecv_features)}")
print(f"Eliminated features by RFECV: {len(X.columns) - len(rfecv_features)}")

# Mean cross-validated score for each candidate number of features.
# The scorer is 'f1_macro', so the axis is labelled accordingly.
mean_cv_scores = rfecv.cv_results_['mean_test_score']
plt.figure()
plt.xlabel("Number of selected features")
plt.ylabel("Cross validation score (macro F1)")
plt.title("RFECV - Score vs. Number of Features")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

# Apply the same column selection to every dataset.
X_rfecv = X[rfecv_features]
X_test_rfecv = X_test[rfecv_features]
X_train_rfecv = X_train[rfecv_features]
X_val_rfecv = X_val[rfecv_features]

### 2. PCA - Principal Component Analysis
# n_components=0.95 keeps as many components as needed to explain 95% of the variance.
# As with RFECV, the projection is learned from the training split only and then
# applied unchanged to the validation, full-train and test matrices.
pca = PCA(n_components=0.95)
X_train_final = pca.fit_transform(X_train_rfecv)
X_val_final = pca.transform(X_val_rfecv)
X_final = pca.transform(X_rfecv)
X_test_final = pca.transform(X_test_rfecv)
print(f"Original number of features (after RFECV): {X_rfecv.shape[1]}")
print(f"Number of components selected by PCA: {pca.n_components_}")

# Share of the variance explained by each retained component.
plt.figure()
plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, color='blue', alpha=0.7)
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
plt.title("PCA - Principal Components Explained Variance Ratio")
plt.show()

# Final datasets for training and testing
print(f"Final shape of the train dataset: {X_final.shape}")
print(f"Final shape of the test dataset: {X_test_final.shape}")
print(f"Final shape of the training part of the train dataset: {X_train_final.shape}")
print(f"Final shape of the testing part of the train dataset: {X_val_final.shape}")

### Grid Search

In [None]:
# Encode the class labels of the training split as integers for XGBoost.
le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train)

# Base classifier; only the seed is fixed here, everything else comes from the grid.
xgb = XGBClassifier(random_state=2023)

# Hyper-parameter values explored exhaustively by the grid search.
param_grid_xgb = {
    'n_estimators': [100, 200],       # boosting rounds
    'max_depth': [3, 5, 7],           # maximum tree depth
    'learning_rate': [0.01, 0.1],     # shrinkage applied at each boosting step
    'subsample': [0.8, 1.0],          # fraction of rows sampled per tree
    'colsample_bytree': [0.8, 1.0],   # fraction of columns sampled per tree
    'gamma': [0, 0.1, 0.2],           # minimum loss reduction required to split
    'reg_alpha': [0, 0.1],            # L1 regularisation on the weights
    'reg_lambda': [0, 0.1],           # L2 regularisation on the weights
    'min_child_weight': [1, 3, 5],    # minimum summed instance weight in a child
}

# 3-fold cross-validated search scored with macro F1; refit=True retrains the
# best configuration on the whole training split once the search is done.
grid_searcgXGB = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='f1_macro',
    cv=3,
    refit=True,
    verbose=3,
)
grid_searcgXGB.fit(X_train_final, y_train_xgb)

# Predict the held-out validation split with the refitted best model.
grid_predictionGBC = grid_searcgXGB.predict(X_val_final)

print("Best estimator: ", grid_searcgXGB.best_estimator_)
print("Best parameters: ", grid_searcgXGB.best_params_)

# Encode the validation labels once and reuse them for both the report and the matrix.
y_val_xgb = le.transform(y_val)
print(classification_report(y_val_xgb, grid_predictionGBC, target_names=le.classes_))

ConfusionMatrixDisplay.from_predictions(y_val_xgb, grid_predictionGBC)
plt.show()

### XGBoost Prediction

In [23]:
# Fit the encoder on the complete label column and keep the encoded labels under a
# new name. `y` itself is left untouched, so it keeps holding the original class
# labels and this cell gives the same result every time it is run.
y_full_xgb = le.fit_transform(y)

# Derive the label <-> integer mapping from the fitted encoder instead of hard-coding
# it, so it can never drift from the encoding actually used to train the model.
label_mapping = {label: code for code, label in enumerate(le.classes_)}
inverted_label_mapping = {code: label for label, code in label_mapping.items()}

XGB_best = grid_searcgXGB.best_estimator_

# Retrain the best configuration on every labelled row before predicting the test set.
# NOTE: XGB_best is the same object as grid_searcgXGB.best_estimator_, so this refits it in place.
XGB_best.fit(X_final, y_full_xgb)

# Decode the integer predictions back to class labels with the encoder itself;
# unlike a dict lookup, this raises on an unknown class index instead of yielding None.
predictionsXGB = le.inverse_transform(XGB_best.predict(X_test_final))

# Submission file: 1-based row id followed by the predicted class label.
with open('../../predictions/xgb.csv', 'w') as results:
    results.write('RowId,Result\n')
    for row_id, predicted_label in enumerate(predictionsXGB, start=1):
        results.write(f'{row_id},{predicted_label}\n')

In [19]:
# Collect the parameters of the final model, skipping those whose value is None.
params = {}
for param_name, param_setting in XGB_best.get_params().items():
    if param_setting is not None:
        params[param_name] = param_setting

# Persist them as one "name: value" line per parameter.
with open('../../info/xgb_params.txt', 'w') as f:
    f.writelines(f"{param}: {value}\n" for param, value in params.items())