# Dados e Aprendizagem Automática
### Tratamento dos Datasets do Hipocamp

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

Load CSVs

In [19]:
# Locations of the hippocampus radiomics tables, relative to the notebook folder
train_csv_path = '../datasets/train_radiomics_hipocamp.csv'
test_csv_path = '../datasets/test_radiomics_hipocamp.csv'

# Read both splits into DataFrames
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

How many columns

In [20]:
# (rows, columns) of each split
shape_train, shape_test = df_train.shape, df_test.shape

print(f'Train shape: {shape_train[0]} rows and {shape_train[1]} columns')
print(f'Test shape: {shape_test[0]} rows and {shape_test[1]} columns')

Train shape: 305 rows and 2181 columns
Test shape: 100 rows and 2180 columns


Analisar os tipos de cada coluna

In [21]:
# Print the pandas summary (index range, column count, dtype counts, memory usage)
# for the train frame, a separator line, then the same summary for the test frame.
df_train.info()
print("--------------------")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2181 entries, ID to Transition
dtypes: float64(2014), int64(147), object(20)
memory usage: 5.1+ MB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 2180 entries, ID to Age
dtypes: float64(2011), int64(150), object(19)
memory usage: 1.7+ MB


There are no missing values

In [22]:
# For each split, list the columns that contain at least one NaN together with
# their NaN counts. Nothing is printed for a split without missing values.
# The separator line is only emitted after a non-empty train report.
for frame, separator in ((df_train, "--------------------"), (df_test, None)):
    na_per_column = frame.isna().sum()
    missing_values = na_per_column.loc[na_per_column > 0]
    if len(missing_values) > 0:
        print(missing_values)
        if separator is not None:
            print(separator)

Dropping constant columns (columns that hold a single value in every row)

In [23]:
# Find columns where every row has the same value in TRAIN. The decision is taken on
# the training data only and then applied to both splits, so train and test keep
# exactly the same feature columns (a column that happens to be constant only in the
# smaller test set must stay, because the models are fitted on it).
same_value_columns_train = df_train.columns[df_train.nunique() == 1]
df_train = df_train.drop(columns=same_value_columns_train)

# Drop the same columns from TEST (only those that actually exist there)
same_value_columns_test = same_value_columns_train.intersection(df_test.columns)
df_test = df_test.drop(columns=same_value_columns_test)

Remove all the duplicated columns except one

In [24]:
def remove_duplicated_columns(df_train, df_test):
    """Drop numeric columns of ``df_train`` whose values repeat an earlier column.

    Duplicates are detected on the training frame only. For every group of
    identical numeric columns the first one (in column order) is kept and the
    others are removed from ``df_train`` and, when present, from ``df_test``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data; its numeric columns are compared value by value.
    df_test : pandas.DataFrame
        Test data; loses the same columns as ``df_train`` where they exist.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same two frame objects that were passed in. Both are modified
        in place, so callers that ignore the return value still see the result.
    """
    # All numeric dtypes take part in the comparison (ints as well as floats)
    numeric_train = df_train.select_dtypes(include=[np.number])

    # Transposing turns columns into rows, so duplicated(keep='first') flags every
    # column that is an exact copy of a column appearing before it.
    is_repeat = numeric_train.T.duplicated(keep='first')
    columns_to_remove = numeric_train.columns[is_repeat.to_numpy()].tolist()

    # Remove the duplicated columns from both train and test datasets
    df_train.drop(columns=columns_to_remove, inplace=True)
    df_test.drop(columns=[col for col in columns_to_remove if col in df_test.columns], inplace=True)

    return df_train, df_test

# Apply the function to both datasets and keep the returned frames
df_train, df_test = remove_duplicated_columns(df_train, df_test)

Remove all the object columns except the target column `Transition`

In [25]:
# Names of the object-typed columns in each split
train_text_cols = df_train.select_dtypes(include=['object']).columns
test_text_cols = df_test.select_dtypes(include=['object']).columns

# Drop them all, sparing only the 'Transition' target label
df_train = df_train.drop(columns=train_text_cols[train_text_cols != 'Transition'])
df_test = df_test.drop(columns=test_text_cols[test_text_cols != 'Transition'])

Tratamento de outliers


In [26]:
# Disabled experiment: the whole cell is one triple-quoted string, so none of the code
# below runs; the cell only evaluates to (and echoes) that text.
# The quoted code caps every numeric column at the 1.5 * IQR fences.
# NOTE(review): if re-enabled as written, the fences are recomputed separately on the
# test frame instead of reusing the ones derived from train - confirm that is intended.
'''def cap_outliers(df):
    for column in df.select_dtypes(include=[np.number]).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

# Aplicar a função ao dataframe df_train e depois ao df_test
df_train = cap_outliers(df_train)
df_test = cap_outliers(df_test)'''

'def cap_outliers(df):\n    for column in df.select_dtypes(include=[np.number]).columns:\n        Q1 = df[column].quantile(0.25)\n        Q3 = df[column].quantile(0.75)\n        IQR = Q3 - Q1\n        lower_bound = Q1 - 1.5 * IQR\n        upper_bound = Q3 + 1.5 * IQR\n        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])\n        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])\n    return df\n\n# Aplicar a função ao dataframe df_train e depois ao df_test\ndf_train = cap_outliers(df_train)\ndf_test = cap_outliers(df_test)'

Normalizar valores

In [27]:
# Disabled experiment: the whole cell is one triple-quoted string, so none of the code
# below runs; the cell only evaluates to (and echoes) that text.
# The quoted code min-max scales every numeric column of a frame.
# NOTE(review): if re-enabled as written, a new scaler is fitted on the test frame
# (fit_transform is called per frame) rather than reusing the scaler fitted on train.
'''from sklearn.preprocessing import MinMaxScaler

def normalize_dataframe(df):
    scaler = MinMaxScaler()
    float_columns = df.select_dtypes(include=[np.number]).columns
    df[float_columns] = scaler.fit_transform(df[float_columns])
    return df

# Aplicar a função ao dataframe df_train e depois ao df_test
df_train = normalize_dataframe(df_train)
df_test = normalize_dataframe(df_test)'''

'from sklearn.preprocessing import MinMaxScaler\n\ndef normalize_dataframe(df):\n    scaler = MinMaxScaler()\n    float_columns = df.select_dtypes(include=[np.number]).columns\n    df[float_columns] = scaler.fit_transform(df[float_columns])\n    return df\n\n# Aplicar a função ao dataframe df_train e depois ao df_test\ndf_train = normalize_dataframe(df_train)\ndf_test = normalize_dataframe(df_test)'

Check how many different transitions TRAIN

In [28]:
# Report the number of distinct target classes and how many rows fall in each one.
# Skipped silently when the frame has no 'Transition' column.
if 'Transition' in df_train:
    target_labels = df_train['Transition']
    transition_counts = target_labels.value_counts()
    unique_transitions = target_labels.nunique()
    print(f"There are {unique_transitions} different transitions.")
    print(transition_counts)

There are 5 different transitions.
Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


Checking correlation

In [29]:
# Work on a copy so the string labels in df_train stay untouched
df_train_numeric = df_train.copy()

# Encode each Transition class as an integer 1..k, in order of first appearance.
# NOTE(review): the classes are nominal, so the resulting Pearson values depend on
# this arbitrary ordering - treat the ranking as a rough screening aid only.
transition_mapping = {transition: idx + 1 for idx, transition in enumerate(df_train['Transition'].unique())}
df_train_numeric['Transition'] = df_train_numeric['Transition'].map(transition_mapping)

# Only the feature-vs-target correlations are needed, so correlate every column with
# the encoded target directly instead of building the full column-by-column matrix.
correlation_with_target = (
    df_train_numeric
    .corrwith(df_train_numeric['Transition'])
    .sort_values(ascending=False)
)

# Persist the ranked correlations for manual inspection
with open('../check_files_hip/correlation_with_target.txt', 'w') as file:
    file.write("Correlation with target (Transition):\n")
    file.write(correlation_with_target.to_string())

Add a 1-based `RowId` column to the test set (used as the identifier in the submission files)

In [30]:
# Give the test frame a clean 0..n-1 index and expose it as a 1-based 'RowId' column.
# The insert is guarded so that re-running this cell does not fail with
# "cannot insert RowId, already exists".
df_test = df_test.reset_index(drop=True)
if 'RowId' not in df_test.columns:
    df_test.insert(0, 'RowId', df_test.index + 1)

### Decision Tree Classifier


Running the model

In [31]:
# Predictors and target for the decision tree
y_train_dtc = df_train['Transition']
X_train_dtc = df_train.drop(columns='Transition')

# Depth-limited tree with a fixed seed
dt_model = DecisionTreeClassifier(random_state=2022, max_depth=5)
dt_model.fit(X_train_dtc, y_train_dtc)

# Predict on the test features (everything except the row identifier)
X_test_dtc = df_test.drop(columns='RowId')
dt_score = dt_model.predict(X_test_dtc)

# Submission table: RowId + predicted Transition (export currently disabled)
op_dtc = df_test[['RowId']].assign(Transition=dt_score)
#op_dtc.to_csv("../submissions/submission_dtc.csv", index=False)

### Support Vector Machine

In [32]:
# Predictors and target for the support vector classifier
y_train_svc = df_train['Transition']
X_train_svc = df_train.drop(columns='Transition')

# SVC with library-default kernel and regularisation, fixed seed
svm_model = SVC(random_state=2022)
svm_model.fit(X_train_svc, y_train_svc)

# Predict on the test features (everything except the row identifier)
X_test_svc = df_test.drop(columns='RowId')
svc_score = svm_model.predict(X_test_svc)

# Submission table: RowId + predicted Transition, written to disk
op_svc = df_test[['RowId']].assign(Transition=svc_score)
op_svc.to_csv("../submissions/submission_svc.csv", index=False)

### Bagging (Bootstrap Aggregating)

In [33]:
# Predictors and target for the bagging ensemble
X_train_sss = df_train.drop('Transition', axis=1)
y_train_sss = df_train['Transition']

# 10 stratified random splits, each holding out 20 samples for validation
sss = StratifiedShuffleSplit(n_splits=10, test_size=20, random_state=2022)

# Bagged decision trees. The seed is fixed like for every other model in this
# notebook; without it the bootstrap samples (and hence the submission) change on
# every run.
bg_model = BaggingClassifier(estimator=dt_model, bootstrap=True, random_state=2022)

# Candidate ensemble sizes explored by the grid search
n_estimators = [160]
parameters = {'n_estimators': n_estimators}

grid_bg = GridSearchCV(estimator=bg_model, param_grid=parameters, cv=sss)
grid_bg.fit(X_train_sss, y_train_sss)

# GridSearchCV refits the best configuration on the full training data by default
# (refit=True), so best_estimator_ is ready to predict without another fit call.
bst_bg_model = grid_bg.best_estimator_
print(bst_bg_model)

# Predict on the test features (everything except the row identifier)
X_test_sss = df_test.drop('RowId', axis=1)
bg_predictions = bst_bg_model.predict(X_test_sss)

# Submission table: RowId + predicted Transition, written to disk
op_bg = pd.DataFrame(df_test['RowId'])
op_bg['Transition'] = bg_predictions
op_bg.to_csv("../submissions/submission_bg.csv", index=False)

BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=5,
                                                   random_state=2022),
                  n_estimators=160)


### Random Forest Classifier


Running the model


In [34]:
# Separate the predictors from the target label
y_train_rfc = df_train['Transition']
X_train_rfc = df_train.drop(columns='Transition')

# Random forest: 100 trees, depth <= 10, 10 candidate features per split,
# bootstrap resampling switched off, fixed seed
rf_params = dict(n_estimators=100, max_depth=10, max_features=10, bootstrap=False, random_state=2022)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_rfc, y_train_rfc)

# Predict on the test features; 'RowId' is an identifier, not a feature
X_test_rfc = df_test.drop(columns='RowId')
rf_score = rf_model.predict(X_test_rfc)

# Submission table: RowId + predicted Transition (export currently disabled)
op_rf = df_test[['RowId']].assign(Transition=rf_score)
#op_rf.to_csv("../submissions/submission_rfc.csv", index=False)


### Gradient Boosting

In [35]:
# Predictors and target for gradient boosting
y_train_gbc = df_train['Transition']
X_train_gbc = df_train.drop(columns='Transition')

# 100 boosting stages of depth-5 trees, learning rate 1.0, fixed seed
gbc_params = dict(n_estimators=100, learning_rate=1.0, max_depth=5, random_state=2022)
gbc_model = GradientBoostingClassifier(**gbc_params)
gbc_model.fit(X_train_gbc, y_train_gbc)

# Predict on the test features (everything except the row identifier)
X_test_gbc = df_test.drop(columns='RowId')
gbc_score = gbc_model.predict(X_test_gbc)

# Submission table: RowId + predicted Transition, written to disk
op_gbc = df_test[['RowId']].assign(Transition=gbc_score)
op_gbc.to_csv("../submissions/submission_gbc.csv", index=False)

### XGBoost

In [39]:
# Step 1: encode the string class labels as integers; the model is trained on the codes
# (LabelEncoder is imported in the notebook's top import cell)
le = LabelEncoder()
y_train_xgb = le.fit_transform(df_train['Transition'])

# Define training features
X_train_xgb = df_train.drop('Transition', axis=1)

# Step 2: define and fit the model
xgb_model = XGBClassifier(n_estimators=100, learning_rate=1.0, max_depth=5, random_state=2022, objective='multi:softmax')
xgb_model.fit(X_train_xgb, y_train_xgb)

# Prepare the test set features (everything except the row identifier)
X_test_xgb = df_test.drop('RowId', axis=1)

# Predict on the test set (integer class codes)
xgb_score = xgb_model.predict(X_test_xgb)

# Step 3: decode the predicted codes back to the original class names
xgb_score = le.inverse_transform(xgb_score)

# Submission table: RowId + predicted Transition, written to disk
op_xgb = pd.DataFrame(df_test['RowId'])
op_xgb['Transition'] = xgb_score
op_xgb.to_csv("../submissions/submission_xgb.csv", index=False)


### Stacking

In [None]:
# 5-fold stratified CV used to build the out-of-fold predictions that feed the
# meta-learner (StratifiedKFold is imported in the notebook's top import cell)
cv = StratifiedKFold(n_splits=5)

# Meta-learner that combines the base models' outputs
final_estimator = LogisticRegression(max_iter=500)

# Predictors and target for the stacked ensemble
X_train_stc = df_train.drop('Transition', axis=1)
y_train_stc = df_train['Transition']

# Base learners as (name, model) pairs; this list is reused by the Max Voting cell
estimators = [("dt", dt_model), ("svm", svm_model), ("rf", rf_model), ("gbc", gbc_model), ("xgb", xgb_model)]
stc_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=cv)
stc_model.fit(X_train_stc, y_train_stc)

# Predict on the test features (everything except the row identifier)
X_test_stc = df_test.drop('RowId', axis=1)
stc_predictions = stc_model.predict(X_test_stc)

# Submission table: RowId + predicted Transition, written to disk
op_stc = pd.DataFrame(df_test['RowId'])
op_stc['Transition'] = stc_predictions
op_stc.to_csv("../submissions/submission_stc.csv", index=False)

### Max Voting

In [45]:
# Predictors and target for the voting ensemble
y_train_hvt = df_train['Transition']
X_train_hvt = df_train.drop(columns='Transition')

# Hard (majority) vote over the (name, model) pairs in `estimators`, which is defined
# in the Stacking cell. The weights follow that list's order: dt and svm count once,
# rf, gbc and xgb count twice.
vote_weights = [1, 1, 2, 2, 2]
hvt_model = VotingClassifier(estimators=estimators, voting='hard', weights=vote_weights)
hvt_model.fit(X_train_hvt, y_train_hvt)

# Predict on the test features (everything except the row identifier)
X_test_hvt = df_test.drop(columns='RowId')
hvt_predictions = hvt_model.predict(X_test_hvt)

# Submission table: RowId + predicted Transition, written to disk
op_hvt = df_test[['RowId']].assign(Transition=hvt_predictions)
op_hvt.to_csv("../submissions/submission_hvt.csv", index=False)