# Stacking Classifier

In [5]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import tree

Load the treated datasets (from `datasets_manuseados`)

In [6]:
# Read the already-treated radiomics tables: one CSV for training, one for the
# rows that have to be predicted.
train_csv_path = '../../datasets_manuseados/train_radiomics_hipocamp_treated.csv'
test_csv_path = '../../datasets_manuseados/test_radiomics_hipocamp_treated.csv'
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

Downcast Float64/Int64 columns to Float32/Int32

In [7]:
# Downcast every 64-bit numeric column to its 32-bit counterpart, in place,
# and print each frame's schema afterwards (train first, then test).
#
# The source dtypes are spelled out as 'float64' / 'int64' on purpose: the bare
# alias 'int' resolves to the platform's default integer, which is 32-bit on
# Windows with NumPy < 2, so the int64 columns this step is meant to convert
# would be silently skipped there.
for frame_index, frame in enumerate((df_train, df_test)):
    if frame_index > 0:
        # Visual separator between the two .info() reports.
        print("--------------------")
    # Select both column groups before assigning anything back.
    float_features = frame.select_dtypes(include='float64')
    int_features = frame.select_dtypes(include='int64')
    frame[float_features.columns] = frame[float_features.columns].astype(np.float32)
    frame[int_features.columns] = frame[int_features.columns].astype(np.int32)
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 1898 entries, RowId to Transition
dtypes: float32(1896), int32(1), object(1)
memory usage: 2.2+ MB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 1897 entries, RowId to Age
dtypes: float32(1896), int32(1)
memory usage: 741.1 KB


Model training and prediction

In [None]:
# ---------------------------------------------------------------------------
# Features / target
# ---------------------------------------------------------------------------
# 'Transition' is the label. 'RowId' is removed from the features of both
# frames and only re-attached to the predictions when the CSV is written.
X = df_train.drop(columns=['Transition', 'RowId'])
y = df_train['Transition']
X_teste = df_test.drop(columns=['RowId'])

# Hold out 20% of the labelled rows so the ensemble can be scored before the
# submission file is produced.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2024)

# ---------------------------------------------------------------------------
# Base learners
# ---------------------------------------------------------------------------
# These are intentionally left unfitted: StackingClassifier clones each
# estimator and fits the clones itself, so fitting them here beforehand would
# only repeat the work without influencing the stacked model. The stacker also
# receives the raw labels directly (as it always did), so no manual
# LabelEncoder step is needed for XGBoost.

# Decision Tree
dt = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=15, min_samples_leaf=5, min_samples_split=15, random_state=2024)

# SVM
svm = SVC(C=1, kernel='rbf', random_state=2024)

# Random Forest
rf = RandomForestClassifier(bootstrap=False, max_depth=10, max_features=10, n_estimators=100, random_state=2024)

# Gradient Boosting
gbc = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, max_features=10, n_estimators=100, random_state=2024)

# XGBoost -- bound to `xgb_clf` rather than `xgb` so the `import xgboost as xgb`
# module alias from the import cell is not overwritten.
xgb_clf = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=100, random_state=2024)

# ---------------------------------------------------------------------------
# Stacking ensemble
# ---------------------------------------------------------------------------
# Out-of-fold predictions of the base learners (5 stratified, shuffled folds)
# are the inputs of the logistic-regression meta-learner.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2024)
final_estimator = LogisticRegression(max_iter=500, random_state=2024)

estimators = [('dt', dt), ('svm', svm), ('rf', rf), ('gbc', gbc), ('xgb', xgb_clf)]
stc_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=cv)
stc_model.fit(X_train, y_train)

# Score the fitted ensemble on the hold-out split, which was previously created
# but never used. zero_division=0 reports 0 instead of warning when a class is
# never predicted.
print(classification_report(y_val, stc_model.predict(X_val), zero_division=0))

# ---------------------------------------------------------------------------
# Predict the unlabelled rows and write the submission file
# ---------------------------------------------------------------------------
stc_predictions = stc_model.predict(X_teste)

op_stc = pd.DataFrame({'RowId': df_test['RowId'], 'Transition': stc_predictions})
op_stc.to_csv("../../predictions/stc.csv", index=False)
