In [2]:
import os
import json
import numpy as np
import cv2
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from PIL import Image
from sklearn.model_selection import train_test_split


In [3]:
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedKFold

In [136]:
# Import OS libraries
import os
import itertools

# Data handling tools
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , classification_report


# Deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense,  BatchNormalization, Activation, Dropout  
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam , Adamax
from tensorflow.keras import regularizers

# from googleapiclient.discovery import build
# from google.oauth2.credentials import Credentials
# from googleapiclient.http import MediaFileUpload

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")


# Check GPU availability
print("GPU is available" if tf.config.list_physical_devices('GPU') else "GPU is not available")

sns.set_style('whitegrid')

GPU is not available


In [87]:
# Diretório onde estão armazenadas as imagens sem ser separadas por pastas
image_dir = r'C:\Users\bruna\OneDrive - Universidade do Minho\Tese Mestrado em Bioinformática\AGAR_dataset\AGAR_dataset\dataset'


In [70]:
def load_image(image_path, target_size=(224, 224)):
    try:
        image = Image.open(image_path)
        image = image.resize(target_size)  
        return np.array(image) / 255.0  
    except Exception as e:
        print(f"Erro ao carregar a imagem {image_path}: {e}")
        return None

In [71]:
# Carregar IDs do grupo de treinamento de um arquivo de texto
train_ids_file = r'C:\Users\bruna\OneDrive - Universidade do Minho\Tese Mestrado em Bioinformática\AGAR_dataset\AGAR_dataset\training_lists\lower_resolution_train.txt'
with open(train_ids_file, 'r') as file:
    train_ids = [str(id) for id in json.loads(file.read())]

In [72]:
# Carregar IDs do grupo de validação de um arquivo de texto
val_ids_file = r'C:\Users\bruna\OneDrive - Universidade do Minho\Tese Mestrado em Bioinformática\AGAR_dataset\AGAR_dataset\training_lists\lower_resolution_val.txt'
with open(val_ids_file, 'r') as file:
    val_ids = [str(id) for id in json.loads(file.read())]

In [73]:
# Verificar se os IDs foram carregados corretamente
print(f"IDs de treinamento carregados: {train_ids[:5]} ... ({len(train_ids)} no total)")
print(f"IDs de validação carregados: {val_ids[:5]} ... ({len(val_ids)} no total)")

IDs de treinamento carregados: ['16078', '16831', '16073', '16072', '16830'] ... (3319 no total)
IDs de validação carregados: ['14175', '14176', '15540', '14172', '14678'] ... (1107 no total)


In [74]:
%%capture
# Lista para armazenar caminho da imagem e anotações correspondentes
train_data = []
val_data = []

# Iterar sobre todos os arquivos no diretório
for filename in os.listdir(image_dir):
    if filename.endswith('.json'):
        # Construir o caminho completo do arquivo JSON
        json_path = os.path.join(image_dir, filename)
        
        # Carregar o conteúdo do arquivo JSON
        with open(json_path, 'r') as file:
            try:
                json_data = json.load(file)
            except json.JSONDecodeError as e:
                print(f"Erro ao carregar JSON {json_path}: {e}")
                continue
        
        # Verificar se o sample_id está presente no JSON
        sample_id = str(json_data.get('sample_id', None))
        if sample_id is None:
            print(f"ID da amostra ausente no JSON: {json_path}")
            continue
        
        # Construir o caminho completo do arquivo de imagem
        image_filename = f"{sample_id}.jpg"
        image_path = os.path.join(image_dir, image_filename)
        
        # Verificar se o arquivo de imagem existe
        if not os.path.exists(image_path):
            print(f"Arquivo de imagem não encontrado: {image_path}")
            continue
        
        # Adicionar os dados à lista apropriada
        data_tuple = (image_path, json_data)
        if sample_id in train_ids:
            train_data.append(data_tuple)
            print(f"Imagem de treinamento encontrada e carregada: {image_path}")
        elif sample_id in val_ids:
            val_data.append(data_tuple)
            print(f"Imagem de validação encontrada e carregada: {image_path}")

In [75]:
# Verificar o número de imagens carregadas
print(f"Número de imagens de treinamento carregadas: {len(train_data)}")
print(f"Número de imagens de validação carregadas: {len(val_data)}")

Número de imagens de treinamento carregadas: 3318
Número de imagens de validação carregadas: 1106


In [76]:
# Separar as imagens e os rótulos (labels) para treinamento
train_images = [data[0] for data in train_data]
train_labels = [data[1]['classes'][0] for data in train_data]

In [77]:
# Separar as imagens e os rótulos (labels) para validação
val_images = [data[0] for data in val_data]
val_labels = [data[1]['classes'][0] for data in val_data]

In [78]:
# Verificar se as listas estão vazias
if len(train_images) == 0:
    print("Nenhuma imagem de treinamento foi carregada.")
if len(val_images) == 0:
    print("Nenhuma imagem de validação foi carregada.")

In [79]:
# Filtrar caminhos de imagem válidos
train_images = [img for img in train_images if img is not None]
val_images = [img for img in val_images if img is not None]

In [80]:
# Converter as imagens em tensores e normalizar, se necessário
train_images = [load_image(image_path, target_size=(224, 224)) for image_path in train_images]
val_images = [load_image(image_path, target_size=(224, 224)) for image_path in val_images]

In [81]:
# Verificar se alguma imagem falhou ao carregar
train_images = [img for img in train_images if img is not None]
val_images = [img for img in val_images if img is not None]

In [82]:
# Verificar o shape das imagens e dos rótulos antes da codificação
print("Shape de train_images antes da codificação:", np.array(train_images).shape)
print("Shape de train_labels antes da codificação:", np.array(train_labels).shape)
print("Shape de val_images antes da codificação:", np.array(val_images).shape)
print("Shape de val_labels antes da codificação:", np.array(val_labels).shape)

Shape de train_images antes da codificação: (3318, 224, 224, 3)
Shape de train_labels antes da codificação: (3318,)
Shape de val_images antes da codificação: (1106, 224, 224, 3)
Shape de val_labels antes da codificação: (1106,)


In [83]:
# Converter os rótulos em formato adequado (numérico), se necessário
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)

In [84]:
# Verificar o número de classes
num_classes = len(label_encoder.classes_)
print(f"Número de classes: {num_classes}")


Número de classes: 5


In [178]:
# Compilar o modelo
#model.compile(optimizer='adam',
#              loss='sparse_categorical_crossentropy',
#              metrics=['accuracy'])

In [88]:
# Dividir os dados em conjuntos de treinamento e validação
train_images, val_images, train_labels, val_labels = train_test_split(train_images, train_labels, test_size=0.2, random_state=42)

# Verificar os shapes dos dados após a divisão
print("Shape de train_images após a divisão:", np.array(train_images).shape)
print("Shape de train_labels após a divisão:", np.array(train_labels).shape)
print("Shape de val_images após a divisão:", np.array(val_images).shape)
print("Shape de val_labels após a divisão:", np.array(val_labels).shape)

Shape de train_images após a divisão: (2654, 224, 224, 3)
Shape de train_labels após a divisão: (2654,)
Shape de val_images após a divisão: (664, 224, 224, 3)
Shape de val_labels após a divisão: (664,)


In [118]:
# Directory to save the .npy files
save_dir = r'C:\Users\bruna\OneDrive\Tese mestrado\lower_resolution'  
os.makedirs(save_dir, exist_ok=True)

In [119]:
# Function to save numpy array as .npy file and return the path
def save_image(image_array, save_dir, idx):
    save_path = os.path.join(save_dir, f"image_{idx}.npy")
    np.save(save_path, image_array)
    return save_path

In [120]:
# Save train images and get paths
train_image_paths = [save_image(image, save_dir, idx) for idx, image in enumerate(train_images)]

# Save val images and get paths
val_image_paths = [save_image(image, save_dir, idx) for idx, image in enumerate(val_images)]

In [121]:
# Create DataFrames
train_df = pd.DataFrame({'image_path': train_image_paths, 'label': train_labels})
val_df = pd.DataFrame({'image_path': val_image_paths, 'label': val_labels})

In [122]:
train_df.head()

Unnamed: 0,image_path,label
0,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,3
1,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,1
2,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,1
3,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,2
4,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,2


In [123]:
val_df.head()

Unnamed: 0,image_path,label
0,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,4
1,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,2
2,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,3
3,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,2
4,C:\Users\bruna\OneDrive\Tese mestrado\lower_re...,3


In [124]:
import os

# Function to check if the image paths in the DataFrame are valid
def check_image_paths(df, column_name):
    for path in df[column_name]:
        if not os.path.isfile(path):
            print(f"Invalid path: {path}")
        else:
            print(f"Valid path: {path}")
            break

# Check some paths in the training and validation DataFrames
print("Checking training image paths:")
check_image_paths(train_df, 'image_path')

print("\nChecking validation image paths:")
check_image_paths(val_df, 'image_path')


Checking training image paths:
Valid path: C:\Users\bruna\OneDrive\Tese mestrado\lower_resolution\image_0.npy

Checking validation image paths:
Valid path: C:\Users\bruna\OneDrive\Tese mestrado\lower_resolution\image_0.npy


In [149]:
def npy_file_generator(dataframe, batch_size, input_size, mode='train'):
    while True:
        for start in range(0, len(dataframe), batch_size):
            end = min(start + batch_size, len(dataframe))
            batch_df = dataframe[start:end]
            batch_images = []
            for img_path in batch_df['image_path']:
                img_array = np.load(img_path)
                batch_images.append(img_array)
            batch_images = np.array(batch_images)
            if mode == 'train':
                yield batch_images, batch_df['label'].values
            elif mode == 'val':
                yield batch_images, batch_df['label'].values
            else:
                print(f"Unknown mode: {mode}")

            # Adicionar logs para depuração
            print(f"Processed batch from {start} to {end}")

# Verifique se os caminhos estão corretos no gerador personalizado
for x, y in train_generator:
    print(x.shape, y.shape)
    print(f"First image path in batch: {train_df.iloc[0]['image_path']}")
    break


(0, 224, 224, 3) (0, 0)
First image path in batch: C:\Users\bruna\OneDrive\Tese mestrado\lower_resolution\image_0.npy


In [150]:
# Create the custom data generators
train_generator = npy_file_generator(train_df, batch_size=16, input_size=input_size, mode='train')
val_generator = npy_file_generator(val_df, batch_size=16, input_size=input_size, mode='val')

In [151]:
# Test the generators
for x, y in train_generator:
    print(x.shape, y.shape)
    break

for x, y in val_generator:
    print(x.shape, y.shape)
    break

(16, 224, 224, 3) (16,)
(16, 224, 224, 3) (16,)


In [152]:
# Check some paths in the training and validation DataFrames
print("Checking training image paths:")
check_image_paths(train_df, 'image_path')

print("\nChecking validation image paths:")
check_image_paths(val_df, 'image_path')

# Verify the content of one .npy file
npy_file = train_df['image_path'].iloc[0]
image_data = np.load(npy_file)
print(f"Shape of the loaded image data from {npy_file}: {image_data.shape}")

Checking training image paths:
Valid path: C:\Users\bruna\OneDrive\Tese mestrado\lower_resolution\image_0.npy

Checking validation image paths:
Valid path: C:\Users\bruna\OneDrive\Tese mestrado\lower_resolution\image_0.npy
Shape of the loaded image data from C:\Users\bruna\OneDrive\Tese mestrado\lower_resolution\image_0.npy: (224, 224, 3)


In [153]:
train_df['image_path'] = train_df['image_path'].apply(lambda x: os.path.abspath(x))
val_df['image_path'] = val_df['image_path'].apply(lambda x: os.path.abspath(x))

In [154]:
train_df['label'] = train_df['label'].astype(str)
val_df['label'] = val_df['label'].astype(str)

In [156]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Criar um ImageDataGenerator para dados de treinamento com aumento de dados
train_datagen = ImageDataGenerator(rescale=1./255)

# Criar um ImageDataGenerator para dados de validação (sem aumento de dados)
valid_test_datagen = ImageDataGenerator(rescale=1./255)

# Especificar o tamanho de entrada esperado pelo modelo
input_size = (224, 224)

# Criar geradores de dados para treinamento e validação
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='image_path',
    y_col='label',
    target_size=input_size,
    shuffle=True,
    batch_size=16,
    class_mode='categorical',
    color_mode='rgb'
)

val_generator = valid_test_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='image_path',
    y_col='label',
    target_size=input_size,
    shuffle=True,
    batch_size=16,
    class_mode='categorical',
    color_mode='rgb'
)

Found 0 validated image filenames belonging to 0 classes.
Found 0 validated image filenames belonging to 0 classes.


In [1]:
sample_images, sample_labels = next(train_generator)

# Display the images and labels
plt.figure(figsize=(12, 12))
for i in range(16):
    image = sample_images[i]
    label_index = np.argmax(sample_labels[i]) 
    label = list(train_generator.class_indices.keys())[label_index]  

    plt.subplot(4, 4, i+1)
    plt.imshow(image)
    plt.title(label, color='k', fontsize=12)
    plt.axis("off")

plt.show()


NameError: name 'train_generator' is not defined