# Dados e Aprendizagem Automática
### Tratamento dos Datasets do Hipocamp

In [96]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

Load CSVs

In [97]:
# Locations of the train and test radiomics tables, relative to the notebook.
train_csv_path = '../datasets/train_radiomics_hipocamp.csv'
test_csv_path = '../datasets/test_radiomics_hipocamp.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

Number of rows and columns in each dataset

In [98]:
# Capture the (rows, columns) pair of each frame and report both.
shape_train, shape_test = df_train.shape, df_test.shape

train_rows, train_cols = shape_train
test_rows, test_cols = shape_test
print(f'Train shape: {train_rows} rows and {train_cols} columns')
print(f'Test shape: {test_rows} rows and {test_cols} columns')

Train shape: 305 rows and 2181 columns
Test shape: 100 rows and 2180 columns


Analisar os tipos de cada coluna

In [99]:
# Print the dtype summary of the train frame, a separator line, then the test frame.
for position, frame in enumerate((df_train, df_test)):
    if position > 0:
        print("--------------------")
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2181 entries, ID to Transition
dtypes: float64(2014), int64(147), object(20)
memory usage: 5.1+ MB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 2180 entries, ID to Age
dtypes: float64(2011), int64(150), object(19)
memory usage: 1.7+ MB


There are no missing values

In [100]:
# For each frame, list only the columns that contain at least one missing value.
# Nothing is printed for a frame without missing values; the separator line is
# emitted only after a non-empty train report, as before.
for position, frame in enumerate((df_train, df_test)):
    missing_values = frame.isna().sum()
    missing_values = missing_values[missing_values > 0]
    if missing_values.empty:
        continue
    print(missing_values)
    if position == 0:
        print("--------------------")

Drop constant columns (columns that hold a single unique value)

In [101]:
# Find the constant columns (a single distinct value in every row) using the
# TRAIN frame only, then drop that same set from both frames. Deciding
# independently per frame can remove different columns from train and test,
# which leaves the two feature sets misaligned.
same_value_columns_train = df_train.columns[df_train.nunique() == 1]
df_train = df_train.drop(columns=same_value_columns_train)

# Restrict to the train-constant columns that exist in the test frame, so the
# drop never fails on a label that is absent there.
same_value_columns_test = same_value_columns_train.intersection(df_test.columns)
df_test = df_test.drop(columns=same_value_columns_test)

Remove duplicated columns, keeping one column from each group of duplicates

In [102]:
def remove_duplicated_columns(df_train, df_test):
    """Drop numeric columns of ``df_train`` whose values repeat an earlier column.

    Duplicates are detected on the numeric columns of ``df_train`` only. For
    every set of identical columns the first one (in column order) is kept and
    the others are removed from ``df_train``; the same labels are removed from
    ``df_test`` when they exist there.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame used to detect duplicated columns. Modified in place.
    df_test : pandas.DataFrame
        Frame from which the same columns are removed. Modified in place.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``df_train`` and ``df_test`` objects, after the drop.
    """
    # All numeric dtypes take part in the comparison, not only float64.
    numeric_train = df_train.select_dtypes(include=[np.number])

    # Transposing turns columns into rows, so ``duplicated(keep='first')``
    # flags every column that repeats an earlier one. Unlike grouping on the
    # row values, this also matches duplicates that contain NaN and does not
    # fail when the frame has no rows.
    is_repeat = numeric_train.T.duplicated(keep='first')
    columns_to_remove = is_repeat.index[is_repeat].tolist()

    # In-place drops are kept so callers that rely on the mutation still work.
    df_train.drop(columns=columns_to_remove, inplace=True)
    # errors='ignore' skips labels that the test frame does not have.
    df_test.drop(columns=columns_to_remove, errors='ignore', inplace=True)

    return df_train, df_test

# Apply the function to the train and test frames
df_train, df_test = remove_duplicated_columns(df_train, df_test)

Remove all the object columns (except the target `Transition`)

In [103]:
# Drop every object-dtype column from each frame, sparing the 'Transition' target.
object_columns_train = df_train.select_dtypes(include=['object']).columns
df_train = df_train.drop(columns=[name for name in object_columns_train if name != 'Transition'])

object_columns_test = df_test.select_dtypes(include=['object']).columns
df_test = df_test.drop(columns=[name for name in object_columns_test if name != 'Transition'])

Tratamento de outliers


In [104]:
# Outlier capping is currently disabled: the whole cell is a bare string
# literal, so none of the code below is executed and the cell only echoes the
# text as its output.
# NOTE(review): as written, cap_outliers derives the IQR bounds from whichever
# frame it receives, so re-enabling it would cap df_test using df_test's own
# quartiles rather than df_train's — confirm that is intended before enabling.
'''def cap_outliers(df):
    for column in df.select_dtypes(include=[np.number]).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

# Aplicar a função ao dataframe df_train e depois ao df_test
df_train = cap_outliers(df_train)
df_test = cap_outliers(df_test)'''

'def cap_outliers(df):\n    for column in df.select_dtypes(include=[np.number]).columns:\n        Q1 = df[column].quantile(0.25)\n        Q3 = df[column].quantile(0.75)\n        IQR = Q3 - Q1\n        lower_bound = Q1 - 1.5 * IQR\n        upper_bound = Q3 + 1.5 * IQR\n        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])\n        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])\n    return df\n\n# Aplicar a função ao dataframe df_train e depois ao df_test\ndf_train = cap_outliers(df_train)\ndf_test = cap_outliers(df_test)'

Normalizar valores

In [105]:
# Min-max normalisation is currently disabled: the whole cell is a bare string
# literal, so none of the code below is executed and the cell only echoes the
# text as its output.
# NOTE(review): as written, normalize_dataframe creates and fits a new
# MinMaxScaler on whichever frame it receives, so re-enabling it would scale
# df_test with df_test's own min/max rather than df_train's — confirm that is
# intended before enabling.
'''from sklearn.preprocessing import MinMaxScaler

def normalize_dataframe(df):
    scaler = MinMaxScaler()
    float_columns = df.select_dtypes(include=[np.number]).columns
    df[float_columns] = scaler.fit_transform(df[float_columns])
    return df

# Aplicar a função ao dataframe df_train e depois ao df_test
df_train = normalize_dataframe(df_train)
df_test = normalize_dataframe(df_test)'''

'from sklearn.preprocessing import MinMaxScaler\n\ndef normalize_dataframe(df):\n    scaler = MinMaxScaler()\n    float_columns = df.select_dtypes(include=[np.number]).columns\n    df[float_columns] = scaler.fit_transform(df[float_columns])\n    return df\n\n# Aplicar a função ao dataframe df_train e depois ao df_test\ndf_train = normalize_dataframe(df_train)\ndf_test = normalize_dataframe(df_test)'

Check how many different transitions there are in the training set

In [106]:
# Summarise the class balance of the target, when the column is present.
if 'Transition' in df_train.columns:
    transition_counts = df_train['Transition'].value_counts()
    unique_transitions = df_train['Transition'].nunique()
    print(f"There are {unique_transitions} different transitions.")
    print(transition_counts)

There are 5 different transitions.
Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


Checking correlation

In [107]:
# Encode the target as integers (1..k, in order of first appearance) on a copy,
# so df_train keeps its original string labels.
# NOTE(review): these codes are arbitrary for a nominal target, so the sign and
# magnitude of the correlations depend on that ordering — treat the report as a
# rough screen only.
df_train_numeric = df_train.copy()
transition_mapping = {transition: idx + 1 for idx, transition in enumerate(df_train['Transition'].unique())}
df_train_numeric['Transition'] = df_train_numeric['Transition'].map(transition_mapping)

# Only the target's column of the correlation matrix is used, so correlate each
# column with the target directly instead of building the full
# columns x columns matrix.
correlation_with_target = (
    df_train_numeric
    .corrwith(df_train_numeric['Transition'])
    .sort_values(ascending=False)
)

# Create the report folder first: open(..., 'w') does not create directories.
report_path = Path('../check_files_hip/correlation_with_target.txt')
report_path.parent.mkdir(parents=True, exist_ok=True)
with open(report_path, 'w', encoding='utf-8') as file:
    file.write("Correlation with target (Transition):\n")
    file.write(correlation_with_target.to_string())

# Random Forest Classifier


Add an ID column to the test set

In [108]:
# Give each test row a 1-based sequential identifier in the first column.
# Building a fresh frame and discarding any previous 'ID' keeps the cell
# re-runnable: DataFrame.insert raises ValueError when the column already exists.
# NOTE(review): these IDs are positional (row index + 1), not read from the
# CSV — confirm that this is what the submission format expects.
df_test = df_test.reset_index(drop=True).drop(columns='ID', errors='ignore')
df_test.insert(0, 'ID', df_test.index + 1)

In [109]:
# Seed for the forest, so re-running the notebook reproduces the same model and
# therefore the same submission file.
RANDOM_STATE = 42

# Split the features and the target from the training dataset
X_train = df_train.drop('Transition', axis=1)
y_train = df_train['Transition']

# Initialize and train the RandomForest model
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

# Build the test matrix from exactly the columns the model was fitted on, in
# the same order. This leaves out 'ID' and any test-only column. Fail early,
# naming the offending columns, if the test set lacks a training feature.
missing_features = X_train.columns.difference(df_test.columns)
if len(missing_features) > 0:
    raise ValueError(f"Test set is missing training features: {list(missing_features)}")
X_test = df_test[X_train.columns]
op_rf = rf_model.predict(X_test)

# Prepare the output file: one row per test sample with its ID and predicted class
op = pd.DataFrame({'ID': df_test['ID'], 'Transition': op_rf})
op.to_csv("submission.csv", index=False)
