# Dados e Aprendizagem Automática
### Tratamento dos Datasets do Hipocamp

In [37]:
import pandas as pd
import numpy as np

Load CSVs

In [38]:
# Locations of the original (untouched) radiomics CSV files, relative to this notebook.
train_csv_path = '../datasets_originais/train_radiomics_hipocamp.csv'
test_csv_path = '../datasets_originais/test_radiomics_hipocamp.csv'

# Read both splits into DataFrames.
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

Dataset dimensions (number of rows and columns)

In [39]:
# Keep the (rows, columns) tuples around and unpack them for the report.
shape_train, shape_test = df_train.shape, df_test.shape
train_rows, train_cols = shape_train
test_rows, test_cols = shape_test

print(f'Train shape: {train_rows} rows and {train_cols} columns')
print(f'Test shape: {test_rows} rows and {test_cols} columns')

Train shape: 305 rows and 2181 columns
Test shape: 100 rows and 2180 columns


Analisar os tipos de cada coluna

In [40]:
# Print the dtype summary of the train set, then of the test set,
# with a separator line between the two reports.
for position, frame in enumerate((df_train, df_test)):
    if position > 0:
        print("--------------------")
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2181 entries, ID to Transition
dtypes: float64(2014), int64(147), object(20)
memory usage: 5.1+ MB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 2180 entries, ID to Age
dtypes: float64(2011), int64(150), object(19)
memory usage: 1.7+ MB


Checking missing values

In [41]:
# Per-column count of missing cells in the train set, keeping only columns that have any.
missing_values = df_train.isna().sum().loc[lambda counts: counts > 0]
if len(missing_values) > 0:
    print(missing_values)
    print("--------------------")

# Same check for the test set.
missing_values = df_test.isna().sum().loc[lambda counts: counts > 0]
if len(missing_values) > 0:
    print(missing_values)

# Nothing is printed when neither dataset has missing values.

Remove constant columns (columns that hold a single unique value in every row)

In [42]:
# Columns where every row has the same value carry no information for a model.
# They are selected on the TRAIN set only and the same columns are then removed
# from both datasets, so train and test keep an identical feature set (this mirrors
# how remove_duplicated_columns decides on train and applies to both).
same_value_columns_train = df_train.columns[df_train.nunique() == 1]
print(f'Number of columns with the same value in all rows (Train): {len(same_value_columns_train)}')

# The TEST count is reported for comparison only; it does not drive the removal.
same_value_columns_test = df_test.columns[df_test.nunique() == 1]
print(f'Number of columns with the same value in all rows (Test): {len(same_value_columns_test)}')

df_train = df_train.drop(columns=same_value_columns_train)
# Only drop the train-constant columns that actually exist in the test set.
df_test = df_test.drop(columns=same_value_columns_train.intersection(df_test.columns))

Number of columns with the same value in all rows (Train): 159
Number of columns with the same value in all rows (Train): 159


Remove all the duplicated columns except one

In [43]:
def remove_duplicated_columns(df_train, df_test):
    """Drop numeric columns of the train set that repeat an earlier column.

    Duplicates are detected on ``df_train`` only, among all numeric columns
    (integers and floats). For each group of identical columns the first one
    is kept and the others are removed. The same column names are removed
    from ``df_test`` when they exist there.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data; used to decide which columns are duplicates.
    df_test : pandas.DataFrame
        Test data; receives the same column removal.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New train and test frames without the duplicated columns.
        The frames passed in are left unmodified.
    """
    numeric_train = df_train.select_dtypes(include=[np.number])

    if numeric_train.empty:
        # No numeric columns or no rows: there is nothing to compare.
        columns_to_remove = []
    else:
        # Transposing turns columns into rows, so DataFrame.duplicated flags
        # every column that equals an earlier one (the first is kept).
        is_repeat = numeric_train.T.duplicated(keep='first').to_numpy()
        columns_to_remove = numeric_train.columns[is_repeat].tolist()

    print(f'Number of duplicated columns: {len(columns_to_remove)}')

    # drop() without inplace returns new frames, so the callers' objects are not mutated.
    df_train = df_train.drop(columns=columns_to_remove)
    df_test = df_test.drop(columns=[col for col in columns_to_remove if col in df_test.columns])

    return df_train, df_test

# Uso da função
df_train, df_test = remove_duplicated_columns(df_train, df_test)


Number of duplicated columns: 112


Remove all the object columns except the target (Transition)

In [44]:
# Drop every object-typed column from the train set, keeping only the target 'Transition'.
train_object_columns = df_train.select_dtypes(include=['object']).columns
df_train = df_train.drop(columns=train_object_columns[train_object_columns != 'Transition'])

# Same rule for the test set.
test_object_columns = df_test.select_dtypes(include=['object']).columns
df_test = df_test.drop(columns=test_object_columns[test_object_columns != 'Transition'])

Tratamento de outliers (TODO: not implemented yet — no outlier handling is applied in this notebook)


Normalizar valores

In [45]:
from sklearn.preprocessing import MinMaxScaler

def normalize_dataframe(df, scaler=None):
    """Min-max scale the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalize. Non-numeric columns are passed through unchanged.
    scaler : fitted scaler, optional
        An already fitted scaler (e.g. ``MinMaxScaler`` fitted on the training
        set). When given, it is only used to ``transform`` the numeric columns
        of ``df``. When omitted, a new ``MinMaxScaler`` is fitted on ``df``
        itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with its numeric columns scaled. ``df`` is not modified.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    normalized = df.copy()
    if scaler is None:
        normalized[numeric_columns] = MinMaxScaler().fit_transform(df[numeric_columns])
    else:
        normalized[numeric_columns] = scaler.transform(df[numeric_columns])
    return normalized

# Fit the scaler on the TRAIN set only and reuse it for the test set, so both are
# mapped with the same per-column min/max. Fitting a second scaler on the test set
# would put the two datasets on different scales.
# NOTE: transform() requires the test set to have the same numeric columns as the
# train set; scikit-learn raises a ValueError if the feature names do not match.
train_numeric_columns = df_train.select_dtypes(include=[np.number]).columns
feature_scaler = MinMaxScaler().fit(df_train[train_numeric_columns])
df_train = normalize_dataframe(df_train, scaler=feature_scaler)
df_test = normalize_dataframe(df_test, scaler=feature_scaler)

Check how many distinct Transition classes the TRAIN set contains and how many samples each class has

In [46]:
# Summarise the target column: number of distinct classes and samples per class.
if 'Transition' in df_train.columns:
    target = df_train['Transition']
    unique_transitions = target.nunique()
    transition_counts = target.value_counts()
    print(f"There are {unique_transitions} different transitions.")
    print(transition_counts)

There are 5 different transitions.
Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


Checking correlation

In [47]:
# Encode each Transition label as an integer (1, 2, ...) in order of first appearance,
# so the target can take part in a numeric correlation matrix.
transition_mapping = {
    transition: code
    for code, transition in enumerate(df_train['Transition'].unique(), start=1)
}
df_train_numeric = df_train.assign(Transition=df_train['Transition'].map(transition_mapping))

# Correlation of every column with the encoded target, strongest positive first.
correlation_matrix = df_train_numeric.corr()
correlation_with_target = correlation_matrix['Transition'].sort_values(ascending=False)

# Write the ranking to a text file for inspection.
with open('../check_files_hip/correlation_with_target.txt', 'w') as report:
    report.writelines([
        "Correlation with target (Transition):\n",
        correlation_with_target.to_string(),
    ])

Save new dataset

In [48]:
# Show the final dimensions (test first, then train) before writing to disk.
for frame in (df_test, df_train):
    print(frame.shape)

# Save the treated train and test datasets, without the index column.
train_output_path = '../datasets_manuseados/train_radiomics_hipocamp_treated.csv'
test_output_path = '../datasets_manuseados/test_radiomics_hipocamp_treated.csv'
df_train.to_csv(train_output_path, index=False)
df_test.to_csv(test_output_path, index=False)

(100, 1901)
(305, 1902)
