In [3]:
import os
import csv
import glob
import pandas as pd

#### Dataset original

In [4]:
train_path = '/workspace/Alzheimer/alzheimer_dataset/train/'
val_path = '/workspace/Alzheimer/alzheimer_dataset/val/'

data = []
image_counts = {}

def extract_images_from_directory(directory_path, label, dataset_type):
    images = glob.glob(os.path.join(directory_path, '*.*'))  
    
    for image_path in images:
        image_name = os.path.basename(image_path)

        image_dir = f"{dataset_type}/{label}"
        
        label_list = [label]  
        
        data.append([image_name, label_list, image_dir])  

        if label in image_counts:
            image_counts[label] += 1
        else:
            image_counts[label] = 1

for main_path, dataset_type in [(train_path, 'train'), (val_path, 'val')]:
    for label in os.listdir(main_path):
        folder_path = os.path.join(main_path, label)
        
        if os.path.isdir(folder_path):
            extract_images_from_directory(folder_path, label, dataset_type)

alzheimer_dataset = pd.DataFrame(data, columns=['ImageID', 'Labels', 'ImageDir'])

alzheimer_dataset['Labels'] = alzheimer_dataset['Labels'].apply(lambda x: ast.literal_eval(str(x)) if isinstance(x, str) else x)

In [9]:
print("Conteo de imágenes por tipo:")
for label, count in image_counts.items():
    print(f"{label}: {count} imágenes")

Conteo de imágenes por tipo:
ModerateDemented: 6528 imágenes
NonDemented: 12800 imágenes
VeryMildDemented: 11200 imágenes
MildDemented: 9856 imágenes


In [10]:
alzheimer_dataset

Unnamed: 0,ImageID,Labels,ImageDir
0,804a7a1d-771e-4ba4-8d42-426c46ebfb2c.jpg,[ModerateDemented],train/ModerateDemented
1,54565cfb-4c4c-4644-845a-2895f94483bb.jpg,[ModerateDemented],train/ModerateDemented
2,4cd73bda-4773-464d-87be-f619c5790ffe.jpg,[ModerateDemented],train/ModerateDemented
3,0ad3376f-69f0-4603-946a-c8095214cbcb.jpg,[ModerateDemented],train/ModerateDemented
4,e40c2582-742a-4770-9969-808c25c5a5c9.jpg,[ModerateDemented],train/ModerateDemented
...,...,...,...
40379,mildDem315.jpg,[MildDemented],val/MildDemented
40380,mildDem510.jpg,[MildDemented],val/MildDemented
40381,mildDem5.jpg,[MildDemented],val/MildDemented
40382,26 (22).jpg,[MildDemented],val/MildDemented


In [11]:
alzheimer_dataset.to_csv('/workspace/Alzheimer/alzheimer_dataset/alzheimer_dataset.csv', index=False)

print("CSV creado con éxito.")

CSV creado con éxito.


#### Dataset undersampling

In [14]:
# Subconjunto del 25% del total
subset_fraction = 0.25  

total_images = len(alzheimer_dataset)
subset_size = int(total_images * subset_fraction)

# Número de imágenes por etiqueta en el dataset original
label_counts = {
    "ModerateDemented": 6528,
    "NonDemented": 12800,
    "VeryMildDemented": 11200,
    "MildDemented": 9856
}

subset_counts = {label: int(count * subset_fraction) for label, count in label_counts.items()}

alzheimer_dataset_undersampling = pd.DataFrame(columns=alzheimer_dataset.columns)

for label, count in subset_counts.items():
    subset = alzheimer_dataset[alzheimer_dataset["Labels"].apply(lambda x: label in x)].sample(n=count, random_state=42)
    alzheimer_dataset_undersampling = pd.concat([alzheimer_dataset_undersampling, subset])
    
alzheimer_dataset_undersampling = alzheimer_dataset_undersampling.reset_index(drop=True)

In [15]:
print(alzheimer_dataset_undersampling["Labels"].value_counts())

[NonDemented]         3200
[VeryMildDemented]    2800
[MildDemented]        2464
[ModerateDemented]    1632
Name: Labels, dtype: int64


In [16]:
alzheimer_dataset_undersampling

Unnamed: 0,ImageID,Labels,ImageDir
0,74383376-cacd-45da-bfb8-6f0913de0c40.jpg,[ModerateDemented],train/ModerateDemented
1,deb6815b-5e5c-4810-81d3-251ac7d37101.jpg,[ModerateDemented],train/ModerateDemented
2,2a3bd8e5-7038-4592-94e0-dd92794ce31f.jpg,[ModerateDemented],train/ModerateDemented
3,3a642c50-2581-46a5-935f-319c3b979e15.jpg,[ModerateDemented],train/ModerateDemented
4,08dd5364-9105-415b-b943-998d0e733382.jpg,[ModerateDemented],train/ModerateDemented
...,...,...,...
10091,8bd67dbe-d394-48a1-bf56-e77cd79f7730.jpg,[MildDemented],train/MildDemented
10092,3af9baae-c868-42a4-95e7-654ac304623c.jpg,[MildDemented],train/MildDemented
10093,4a8b1ded-6d39-4f93-a85f-ff6be8377f7f.jpg,[MildDemented],train/MildDemented
10094,6d18e54b-480f-490e-a407-a6849ec0587c.jpg,[MildDemented],train/MildDemented


In [17]:
alzheimer_dataset_undersampling.to_csv('/workspace/Alzheimer/alzheimer_dataset/alzheimer_dataset_undersampling.csv', index=False)

print("CSV creado con éxito.")

CSV creado con éxito.
