# Dados e Aprendizagem Automática
### Tratamento do Dataset de Controlo Occipital



In [19]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score

Load CSV


In [20]:
# Read the control training CSV into df_control.
# The path is relative, so it resolves against the notebook's working directory.
df_control = pd.read_csv('../datasets/train_radiomics_occipital_CONTROL.csv')

How many rows and columns


In [21]:
# Report the dataset dimensions; DataFrame.shape is (rows, columns)
control_shape = df_control.shape
row_count, column_count = control_shape

print(f"Control dataset: {row_count} rows, {column_count} columns")

Control dataset: 305 rows, 2181 columns


Analisar os tipos de cada coluna

In [22]:
# Print the dtype / memory summary of the frame.
df_control.info()
# Visual separator between the info() text and the preview table.
print("------------------------------------")
# Last expression of the cell, so the notebook renders the first five rows.
df_control.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2181 entries, ID to Transition
dtypes: float64(2014), int64(147), object(20)
memory usage: 5.1+ MB
------------------------------------


Unnamed: 0,ID,Image,Mask,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,006_S_0681,/notebooks/disk2/DS2_FreeSurfer/ADNI_006_S_068...,/notebooks/disk2/DS2_FreeSurfer/ADNI_006_S_068...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.004785,1727126.0,98.443622,0.005262,0.021912,0.000244,0.005402,0,77.1,CN-CN
1,941_S_1203,/notebooks/disk2/DS2_FreeSurfer/ADNI_941_S_120...,/notebooks/disk2/DS2_FreeSurfer/ADNI_941_S_120...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.00844,1012217.0,190.711701,0.0028,0.039662,0.000799,0.002887,1,83.4,CN-CN
2,011_S_0003,/notebooks/disk2/DS2_FreeSurfer/ADNI_011_S_000...,/notebooks/disk2/DS2_FreeSurfer/ADNI_011_S_000...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.006291,1646099.0,285.07863,0.001925,0.047025,0.001358,0.001915,1,81.3,AD-AD
3,057_S_0779,/notebooks/disk2/DS2_FreeSurfer/ADNI_057_S_077...,/notebooks/disk2/DS2_FreeSurfer/ADNI_057_S_077...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.005281,2169425.0,172.000383,0.003052,0.027462,0.000426,0.003078,1,79.6,CN-MCI
4,033_S_0920,/notebooks/disk2/DS2_FreeSurfer/ADNI_033_S_092...,/notebooks/disk2/DS2_FreeSurfer/ADNI_033_S_092...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.004082,2073170.0,75.795004,0.006768,0.016754,0.000135,0.007042,0,80.1,CN-CN


There are no missing values


In [23]:
# Per-column NaN counts, narrowed to the columns that actually contain some
missing_values = df_control.isna().sum()
missing_values = missing_values.loc[missing_values.gt(0)]
# Nothing is printed when the dataset is complete
if len(missing_values) > 0:
    print(missing_values)

Dropping constant columns (columns that hold a single distinct value in every row)

In [24]:
# Constant columns: exactly one distinct value across all rows
distinct_counts = df_control.nunique()
same_value_columns = df_control.columns[distinct_counts == 1]

# Keep a written record of the columns that are about to be removed
report_lines = ["Columns with the same value in every row:\n"]
report_lines.extend(f"{name}\n" for name in same_value_columns)
with open('../check_files_occ/same_value_columns.txt', 'w') as report:
    report.writelines(report_lines)

df_control = df_control.drop(columns=same_value_columns)


Check for duplicated columns (numeric columns whose values are identical to those of another column)



In [25]:
# Numeric columns only (np.number covers ints as well as floats, despite the name)
float_columns = df_control.select_dtypes(include=[np.number])

# Transpose so each column becomes a row; keep=False flags every member of a duplicate group
duplicated_columns1 = float_columns.T.duplicated(keep=False)

# Restrict the frame to the flagged columns
duplicated_data1 = float_columns.loc[:, duplicated_columns1]

# Write one section per group of identical columns to the check file
with open('../check_files_occ/duplicated_columns.txt', 'w') as report:
    if duplicated_data1.empty:
        report.write("Nenhuma coluna duplicada encontrada.\n")
    else:
        report.write("Colunas duplicadas e seus conteúdos:\n")
        # Grouping on every value of the transposed frame puts identical columns together
        grouped_duplicates1 = duplicated_data1.T.groupby(list(duplicated_data1.T)).groups
        for group_columns in grouped_duplicates1.values():
            report.write("Colunas duplicadas:\n")
            report.writelines(f"- {name}\n" for name in group_columns)
            report.write("\n")

Remove the duplicated columns, keeping only the first column of each duplicate group



In [26]:
def remove_duplicated_columns(df, report_path='../check_files_occ/duplicated_columns_details.txt'):
    """Drop numeric columns whose values repeat another numeric column.

    Numeric columns are compared by content. Within each group of identical
    columns the first one is kept and the others are removed. A text report
    listing the removed columns is written to ``report_path``; the file is
    always (re)created, and is left empty when nothing is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    report_path : str or path-like, optional
        Destination of the text report. Defaults to the location the
        function has always written to.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the duplicated columns were dropped.
    """
    # One row per numeric column, so duplicated columns show up as duplicated rows
    numeric_by_column = df.select_dtypes(include=[np.number]).T

    columns_to_remove = []
    with open(report_path, 'w') as file:
        # A frame with no rows or no numeric columns has no content to compare
        if not numeric_by_column.empty:
            # keep=False flags every member of a duplicate group, including the first
            duplicate_mask = numeric_by_column.duplicated(keep=False)
            duplicated_rows = numeric_by_column.loc[duplicate_mask]

            if not duplicated_rows.empty:
                # Grouping on every value puts identical columns in the same group
                grouped_duplicates = duplicated_rows.groupby(list(duplicated_rows)).groups
                for content, columns in grouped_duplicates.items():
                    # Keep the first column of the group, schedule the others for removal
                    extra_columns = list(columns[1:])
                    file.write(f"Group with content {content}:\n")
                    file.writelines(f"Dropping column: {col}\n" for col in extra_columns)
                    file.write("\n")
                    columns_to_remove.extend(extra_columns)

    # Drop once, after the report is closed, instead of once per group
    if columns_to_remove:
        df.drop(columns=columns_to_remove, inplace=True)

    return df

# Apply the function to the control dataset and rebind the result
df_control = remove_duplicated_columns(df_control)

Remove all the object columns except the target column `Transition`


In [27]:
# Discard every object-dtype column except 'Transition'
object_columns = df_control.select_dtypes(include=['object']).columns
df_control = df_control.drop(columns=[name for name in object_columns if name != 'Transition'])

Verificação de outliers


In [28]:
def detect_outliers(df, iqr_multiplier=1.5):
    """Count IQR-rule outliers in every numeric column of ``df``.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR`` or
    above ``Q3 + iqr_multiplier * IQR``, where Q1/Q3 are the column's 25th and
    75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored.
    iqr_multiplier : float, optional
        Width of the accepted band in IQR units (default 1.5).

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one outlier, with the columns
        ``'Column'`` and ``'Outliers'``. Both columns are present even when
        no outlier is found, so callers can sort or filter the result
        without special-casing the empty case.
    """
    outliers = []
    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = iqr_multiplier * (q3 - q1)
        lower_bound = q1 - margin
        upper_bound = q3 + margin
        outlier_count = ((values < lower_bound) | (values > upper_bound)).sum()
        if outlier_count > 0:
            outliers.append({'Column': column, 'Outliers': outlier_count})
    # Passing the schema explicitly keeps the columns when the list is empty
    return pd.DataFrame(outliers, columns=['Column', 'Outliers'])

# Rank the columns from most to fewest flagged values
outliers = detect_outliers(df_control).sort_values(by='Outliers', ascending=False)

# Save the ranking, or a placeholder line when nothing was flagged
if outliers.empty:
    outlier_report = "No outliers detected.\n"
else:
    outlier_report = outliers.to_string(index=False)
with open('../check_files_occ/outliers.txt', 'w') as report:
    report.write(outlier_report)

'def cap_outliers(df):\n    for column in df.select_dtypes(include=[np.number]).columns:\n        Q1 = df[column].quantile(0.25)\n        Q3 = df[column].quantile(0.75)\n        IQR = Q3 - Q1\n        lower_bound = Q1 - 1.5 * IQR\n        upper_bound = Q3 + 1.5 * IQR\n        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])\n        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])\n    return df\n\n# Aplicar a função ao dataframe df_control\ndf_control = cap_outliers(df_control)'

Tratamento de outliers (desativado: o código abaixo está dentro de uma string e não é executado)


In [29]:
# NOTE(review): this step is disabled. The whole cell is a single string literal,
# so cap_outliers is never defined or applied; the notebook only echoes the text.
# If re-enabled, the code inside would clip every numeric column to the
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] range and rebind df_control.
'''def cap_outliers(df):
    for column in df.select_dtypes(include=[np.number]).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

# Aplicar a função ao dataframe df_control
df_control = cap_outliers(df_control)'''

'from sklearn.preprocessing import MinMaxScaler\n\ndef normalize_dataframe(df):\n    scaler = MinMaxScaler()\n    float_columns = df.select_dtypes(include=[np.number]).columns\n    df[float_columns] = scaler.fit_transform(df[float_columns])\n    return df\n\n# Aplicar a função ao dataframe df_control\ndf_control = normalize_dataframe(df_control)'

Normalizar valores (desativado: o código abaixo está dentro de uma string e não é executado)

In [30]:
# NOTE(review): this step is disabled. The whole cell is a single string literal,
# so MinMaxScaler is never imported and normalize_dataframe is never defined or run.
# If re-enabled, the code inside would fit the scaler on every numeric column of the
# full dataframe — confirm that is intended before any train/test split is made.
'''from sklearn.preprocessing import MinMaxScaler

def normalize_dataframe(df):
    scaler = MinMaxScaler()
    float_columns = df.select_dtypes(include=[np.number]).columns
    df[float_columns] = scaler.fit_transform(df[float_columns])
    return df

# Aplicar a função ao dataframe df_control
df_control = normalize_dataframe(df_control)'''

There are 5 different transitions.
Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


Check how many different transitions

In [31]:
# Summarise the distribution of the 'Transition' labels, when the column is present
if 'Transition' in df_control.columns:
    transition_counts = df_control['Transition'].value_counts()
    unique_transitions = df_control['Transition'].nunique()
    print(f"There are {unique_transitions} different transitions.")
    print(transition_counts)

There are 5 different transitions.
Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


Checking correlation

In [32]:
# Encode each transition label as an integer code, numbered from 1 in order of
# first appearance in the column (the codes carry no other ordering)
transition_mapping = {
    label: code
    for code, label in enumerate(df_control['Transition'].unique(), start=1)
}
# Work on a separate frame so df_control keeps the original string labels
df_control_numeric = df_control.assign(
    Transition=df_control['Transition'].map(transition_mapping)
)

# Full pairwise correlation matrix, then the column for the encoded target
correlation_matrix = df_control_numeric.corr()
correlation_with_target = correlation_matrix['Transition'].sort_values(ascending=False)

with open('../check_files_occ/correlation_with_target.txt', 'w') as report:
    report.write("Correlation with target (Transition):\n" + correlation_with_target.to_string())

In [33]:
# Renumber the rows 0..n-1, then expose a 1-based sequential ID as the first column.
df_control = df_control.reset_index(drop=True)
# DataFrame.insert raises ValueError when the column already exists, so drop any
# previous 'ID' first; this keeps the cell re-runnable without restarting the kernel.
if 'ID' in df_control.columns:
    df_control = df_control.drop(columns='ID')
df_control.insert(0, 'ID', df_control.index + 1)
df_control.head()

Unnamed: 0,ID,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,1,5.848123,178,17346,2,0.315029,0.267306,34.411652,128.734925,105.171289,...,0.004785,1727126.0,98.443622,0.005262,0.021912,0.000244,0.005402,0,77.1,CN-CN
1,2,5.238834,190,18010,2,0.361175,0.272491,32.329625,118.644809,97.082439,...,0.00844,1012217.0,190.711701,0.0028,0.039662,0.000799,0.002887,1,83.4,CN-CN
2,3,6.816667,155,22096,4,0.374464,0.288734,37.469777,129.772712,108.295891,...,0.006291,1646099.0,285.07863,0.001925,0.047025,0.001358,0.001915,1,81.3,AD-AD
3,4,6.445162,171,23859,2,0.355133,0.288648,37.694946,130.59162,111.157546,...,0.005281,2169425.0,172.000383,0.003052,0.027462,0.000426,0.003078,1,79.6,CN-MCI
4,5,5.568269,150,17637,2,0.320548,0.275708,36.716529,133.171648,108.295891,...,0.004082,2073170.0,75.795004,0.006768,0.016754,0.000135,0.007042,0,80.1,CN-CN
