# Dados e Aprendizagem Automática
### Tratamento do Dataset de Controlo Occipital



In [57]:
import pandas as pd
import numpy as np


Load CSV


In [58]:
# Read the occipital CONTROL radiomics training set from the original-datasets folder.
control_csv_path = '../datasets_originais/train_radiomics_occipital_CONTROL.csv'
df_control = pd.read_csv(control_csv_path)

Count the rows and columns


In [59]:
# Report the dimensions of the control dataset as (rows, columns).
shape_control = df_control.shape
n_rows, n_cols = shape_control

print(f"Control dataset: {n_rows} rows, {n_cols} columns")

Control dataset: 305 rows, 2181 columns


Analisar os tipos de cada coluna

In [60]:
# Print the frame summary: index range, number of columns, dtype breakdown and memory usage.
df_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2181 entries, ID to Transition
dtypes: float64(2014), int64(147), object(20)
memory usage: 5.1+ MB


Checking missing values


In [61]:
# Per-column NaN counts, restricted to the columns that actually contain NaNs.
nan_counts = df_control.isna().sum()
missing_values = nan_counts[nan_counts > 0]
if len(missing_values) > 0:
    print(missing_values)

# Nothing is printed for this dataset: THERE ARE NO MISSING VALUES

Dropping constant columns (columns holding a single value in every row)

In [62]:
# Constant columns: exactly one distinct value across all rows.
distinct_counts = df_control.nunique()
same_value_columns = df_control.columns[distinct_counts == 1]
print(f"Columns with the same value in every row: {len(same_value_columns)}")

# Keep a record of the constant column names in a text file before dropping them.
report_lines = ["Columns with the same value in every row:\n"]
report_lines.extend(f"{column}\n" for column in same_value_columns)
with open('../check_files_occ/same_value_columns.txt', 'w') as file:
    file.writelines(report_lines)

df_control = df_control.drop(columns=same_value_columns)


Columns with the same value in every row: 147


Check for duplicated columns (different columns holding identical values)



In [63]:
# Work only on the numeric columns.
float_columns = df_control.select_dtypes(include=[np.number])

# Flag every column whose values are repeated by another column
# (keep=False marks all copies, including the first one).
duplicated_columns1 = float_columns.T.duplicated(keep=False)

# Subset holding just the flagged columns and their contents.
duplicated_data1 = float_columns.loc[:, duplicated_columns1]

# Write the groups of identical columns to a text report.
# The encoding is pinned to UTF-8 because the header below contains a non-ASCII
# character; with the platform default the bytes written would depend on the OS locale.
with open('../check_files_occ/duplicated_columns.txt', 'w', encoding='utf-8') as file:
    if duplicated_data1.empty:
        file.write("Nenhuma coluna duplicada encontrada.\n")
    else:
        file.write("Colunas duplicadas e seus conteúdos:\n")
        # One row per flagged column; grouping on every value puts identical columns together.
        transposed = duplicated_data1.T
        grouped_duplicates1 = transposed.groupby(list(transposed)).groups
        for columns in grouped_duplicates1.values():
            file.write("Colunas duplicadas:\n")
            file.writelines(f"- {col}\n" for col in columns)
            file.write("\n")

Remove all the duplicated columns except one



In [64]:
def remove_duplicated_columns(df):
    """Drop numeric columns whose values duplicate an earlier numeric column.

    Within each set of identical numeric columns the first one (in column
    order) is kept and the others are removed. Non-numeric columns are never
    touched. The number of removed columns is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # Only numeric columns take part in the comparison.
    numeric = df.select_dtypes(include=[np.number])

    # Transposing gives one row per original column, so row-wise duplicate
    # detection compares whole columns. keep='first' flags every repeat after
    # the first occurrence, which is exactly the set of columns to drop.
    # Unlike grouping on the values, this also covers columns containing NaN
    # and does not fail when the frame has no rows.
    is_repeat = numeric.T.duplicated(keep='first')
    columns_to_remove = is_repeat[is_repeat].index.tolist()

    print(f'Number of duplicated columns: {len(columns_to_remove)}')
    # In-place drop is kept so callers that ignore the return value still work.
    df.drop(columns=columns_to_remove, inplace=True)

    return df

# Apply the helper to the control dataset
df_control = df_control.pipe(remove_duplicated_columns)

Number of duplicated columns: 115


Remove all the object columns except the target column `Transition`


In [65]:
# Discard every object-dtype column except 'Transition', which is used later as the target.
object_columns = df_control.select_dtypes(include=['object']).columns
non_target_object_columns = [col for col in object_columns if col != 'Transition']
df_control = df_control.drop(columns=non_target_object_columns)

Verificação de outliers


In [66]:
def detect_outliers(df):
    """Count IQR-rule outliers in every numeric column of ``df``.

    A value is counted as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` of its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one outlier, with the columns
        ``'Column'`` (column name) and ``'Outliers'`` (count). Both columns
        are present even when nothing is flagged, so callers can safely sort
        or filter on them.
    """
    outliers = []
    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outlier_count = ((values < lower_bound) | (values > upper_bound)).sum()
        if outlier_count > 0:
            outliers.append({'Column': column, 'Outliers': outlier_count})
    # Fix the schema explicitly: an empty list would otherwise give a frame with
    # no columns, and a later sort_values(by='Outliers') would raise KeyError.
    return pd.DataFrame(outliers, columns=['Column', 'Outliers'])

# Count outliers per column, listing the most affected columns first.
outliers = detect_outliers(df_control).sort_values(by='Outliers', ascending=False)

# Save the results to a text file
with open('../check_files_occ/outliers.txt', 'w') as file:
    report_text = "No outliers detected.\n" if outliers.empty else outliers.to_string(index=False)
    file.write(report_text)

Tratamento de outliers


Normalizar valores

In [67]:
from sklearn.preprocessing import MinMaxScaler

def normalize_dataframe(df):
    """Min-max scale every numeric column of ``df``.

    A fresh ``MinMaxScaler`` is fitted on the numeric columns of ``df`` and the
    scaled values are written back into the same columns. Non-numeric columns
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned unchanged when it has no
        numeric columns or no rows.
    """
    float_columns = df.select_dtypes(include=[np.number]).columns
    # The scaler raises on input with zero features or zero samples, so there
    # is nothing to scale in those cases.
    if len(float_columns) == 0 or len(df) == 0:
        return df
    scaler = MinMaxScaler()
    df[float_columns] = scaler.fit_transform(df[float_columns])
    return df

# Apply the function to the df_control dataframe
df_control = df_control.pipe(normalize_dataframe)

Check how many different transitions

In [68]:
# Summarise the 'Transition' column: number of distinct values and rows per value.
if 'Transition' in df_control.columns:
    transition_column = df_control['Transition']
    unique_transitions = transition_column.nunique()
    print(f"There are {unique_transitions} different transitions.")
    transition_counts = transition_column.value_counts()
    print(transition_counts)

There are 5 different transitions.
Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


Checking correlation

In [69]:
# Encode each transition label as an integer (1, 2, ...) in order of first appearance,
# so the target can take part in the correlation matrix.
# NOTE(review): the codes follow row order, not any ordering of the classes themselves.
transition_mapping = {
    transition: code
    for code, transition in enumerate(df_control['Transition'].unique(), start=1)
}
df_control_numeric = df_control.copy()
df_control_numeric['Transition'] = df_control_numeric['Transition'].map(transition_mapping)

# Correlation of every column with the encoded target, highest value first.
correlation_matrix = df_control_numeric.corr()
correlation_with_target = correlation_matrix['Transition'].sort_values(ascending=False)

# Save the ranking to a text file.
report_text = "Correlation with target (Transition):\n" + correlation_with_target.to_string()
with open('../check_files_occ/correlation_with_target.txt', 'w') as file:
    file.write(report_text)

Save new dataset

In [70]:
# Final dimensions of the treated dataset.
print(df_control.shape)

# Write the treated dataset to a new CSV file, without the row index.
treated_csv_path = '../datasets_manuseados/train_radiomics_occipital_CONTROL_treated.csv'
df_control.to_csv(treated_csv_path, index=False)

(305, 1911)
