In [16]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.data import random_split
from PIL import Image
import cv2
import numpy as np
import json
import glob
import re

In [3]:
# Preprocessing applied to every image, both for training and for the
# prediction helpers on the CNN class (which read this global).
transform = transforms.Compose([
        transforms.Resize((128, 128)),  # Resize every image to 128x128; adjust as needed
        transforms.ToTensor(),  # Convert the image to a tensor
])
# ImageFolder derives one class per sub-directory of ./images.
img_data = ImageFolder(root="./images", transform=transform)

# Hold out 15% of the samples for testing.
test_size = 0.15
num_train = int((1 - test_size) * len(img_data))
num_test = len(img_data) - num_train

# Seed the split so the same samples land in train/test on every run;
# without this the reported test accuracy is not reproducible.
split_generator = torch.Generator().manual_seed(42)
train_data, test_data = random_split(img_data, [num_train, num_test], generator=split_generator)

train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = DataLoader(test_data, batch_size=8, shuffle=False)
# Number of batches per loader
print(len(train_loader), len(test_loader))

50 9


In [4]:
# Define the CNN model architecture
class CNN(nn.Module):
    """Convolutional classifier for 3-channel 128x128 images.

    Four convolution + batch-norm + ReLU stages are followed by three fully
    connected layers; ``forward`` returns one raw logit per class.

    The prediction helpers read two notebook-level globals: ``transform``
    (the preprocessing pipeline) and, for ``predict_image``,
    ``img_data.classes`` (the class names). Both must be defined before
    those helpers are called.
    """

    def __init__(self, num_classes, seed = 66):
        """Create the layers.

        Args:
            num_classes: Number of output logits produced by the last layer.
            seed: Passed to ``torch.manual_seed`` before the layers are built
                so weight initialisation is repeatable. This reseeds
                PyTorch's global RNG; pass ``None`` to leave it untouched.
        """
        super(CNN, self).__init__()
        # The seed argument used to be ignored; apply it before any layer
        # draws its initial weights.
        if seed is not None:
            torch.manual_seed(seed)

        # Spatial size per stage for a 128x128 input: 128 -> 31 -> 14 -> 12 -> 10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 128, kernel_size=3, stride=1)
        self.bn4 = nn.BatchNorm2d(128)

        # 10x10 feature map with 128 channels coming out of conv4
        self.fc1 = nn.Linear(10*10*128, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return raw class logits for a batch ``x`` of shape (N, 3, 128, 128)."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))

        # Flatten everything except the batch dimension
        x = x.view(x.size(0), -1)

        # Fully connected head; ReLU on the hidden layers, raw logits out
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def _class_probabilities(self, pil_image):
        """Return the softmax class probabilities (1-D tensor) for one PIL image."""
        # Force three channels so RGBA / grayscale inputs match conv1.
        batch = transform(pil_image.convert("RGB")).unsqueeze(0)
        batch = batch.to(next(self.parameters()).device)

        # Inference must run in eval mode: in training mode BatchNorm would
        # normalise with (and update its running stats from) this single image.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logits = self(batch)
        finally:
            # Leave the module in whatever mode the caller had it in.
            self.train(was_training)

        return torch.softmax(logits, dim=1)[0]

    def predict_image(self, image_path):
        """Classify the image at ``image_path`` and print the per-class probabilities.

        Args:
            image_path: Path of an image file readable by PIL.

        Returns:
            None. The predicted class and all class probabilities are printed.
        """
        # Context manager closes the underlying file once the pixels are read.
        with Image.open(image_path) as opened:
            probabilities = self._class_probabilities(opened)
        predicted_label = torch.argmax(probabilities).item()

        class_names = img_data.classes
        predicted_class = class_names[predicted_label]

        print(f'Predicted Class for {image_path}: {predicted_class}')
        for i, prob in enumerate(probabilities.tolist()):
            print(f'{class_names[i]}: {prob:.4f}')

    def predict_region(self, region_of_interest):
        """Classify an image region and return the predicted class index.

        Args:
            region_of_interest: A PIL image, or a NumPy array that
                ``Image.fromarray`` can convert.
                NOTE(review): arrays are interpreted in PIL's channel order
                (RGB); crops taken from ``cv2.imread`` output are BGR, so
                callers probably need to convert first -- confirm.

        Returns:
            int: Index of the most probable class.
        """
        if isinstance(region_of_interest, np.ndarray):
            region_of_interest = Image.fromarray(region_of_interest)

        probabilities = self._class_probabilities(region_of_interest)
        return torch.argmax(probabilities).item()

# Instantiate the classifier with one output per class found by ImageFolder.
model = CNN(num_classes=len(img_data.classes))

In [5]:
# Optimisation setup: cross-entropy loss minimised with Adam over all model parameters.
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()  # applied to the raw outputs of model(inputs) in the training loop
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [6]:
# Train the model
epochs = 16  # number of full passes over train_loader
for epoch in range(epochs):
    model.train()  # training mode (re-set every epoch in case a later cell called eval())
    running_loss = 0.0  # sum of the per-batch losses of this epoch
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean per-batch loss for the epoch
    print(f"Epoch {epoch+1}, Training Loss: {running_loss/len(train_loader)}")

print('Finished Training')

Epoch 1, Training Loss: 1.0825598680973052
Epoch 2, Training Loss: 0.7298293957114219
Epoch 3, Training Loss: 0.6778105515241623
Epoch 4, Training Loss: 0.750079015493393
Epoch 5, Training Loss: 0.6899474114179611
Epoch 6, Training Loss: 0.5152839502692222
Epoch 7, Training Loss: 0.46968883961439134
Epoch 8, Training Loss: 0.4331125044822693
Epoch 9, Training Loss: 0.44701874420046805
Epoch 10, Training Loss: 0.37252117313444616
Epoch 11, Training Loss: 0.3382280695438385
Epoch 12, Training Loss: 0.35827051371335983
Epoch 13, Training Loss: 0.3136134836636484
Epoch 14, Training Loss: 0.1574496181681752
Epoch 15, Training Loss: 0.11455472772242502
Epoch 16, Training Loss: 0.10226235596288462
Finished Training


In [7]:
# Evaluate the model on the test set
model.eval()  # use BatchNorm running statistics instead of batch statistics
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # argmax over the class dimension; avoids the discouraged `.data`
        # attribute and does not overwrite IPython's `_` last-output variable.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f"Accuracy on test set: {100 * correct / total}%")

Accuracy on test set: 73.2394366197183%


In [8]:
# Spot-check the model on a single image; prints the predicted class and per-class probabilities.
model.predict_image("./images/keys/llaves_17.jpg")

Predicted Class for ./images/keys/llaves_17.jpg: keys
keys: 0.9576
pencil: 0.0345
sneakers: 0.0080


In [9]:
def generate_bounding_boxes(image, threshold=147, min_area=50, kernel_size=5):
    """Find bounding boxes of bright regions in an image.

    The image is converted to grayscale, binarised with a fixed threshold,
    cleaned with a morphological opening, and the external contours of the
    result are boxed.

    Args:
        image: Image array as returned by ``cv2.imread`` (BGR), or an
            already single-channel 2-D array.
        threshold: Gray level above which a pixel is set to 255 in the
            binary image.
        min_area: Contours whose area is not strictly greater than this
            (in pixels) are discarded.
        kernel_size: Side length of the square structuring element used
            for the opening.

    Returns:
        list[tuple[int, int, int, int]]: Boxes as
        ``(x_min, y_min, x_max, y_max)``.

    Raises:
        ValueError: If ``image`` is ``None`` (e.g. ``cv2.imread`` failed).
    """
    if image is None:
        raise ValueError("image is None; check that cv2.imread() could read the file")

    # Convert to grayscale unless the input is already single-channel
    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Threshold to obtain a binary image
    _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    # Opening (erosion followed by dilation) removes small specks of noise
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

    # Find contours on the cleaned image (the opening result was previously
    # computed but never used). Indexing with [-2] works with both the
    # 2-value (OpenCV 4) and 3-value (OpenCV 3) return signatures.
    contours = cv2.findContours(opening, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    bounding_boxes = []
    for contour in contours:
        # Skip contours that are too small to be an object
        if cv2.contourArea(contour) > min_area:
            x, y, w, h = cv2.boundingRect(contour)
            bounding_boxes.append((x, y, x + w, y + h))  # (x_min, y_min, x_max, y_max)

    return bounding_boxes

In [12]:
def process_images(images, images_names, category_id=91):
    """Build a COCO-style annotation dictionary where every box gets one category.

    Args:
        images: Sequence of image arrays accepted by
            ``generate_bounding_boxes``.
        images_names: File names for the ``images`` section. Annotations
            reference images by their index in ``images`` and the
            ``images`` section is numbered by index in ``images_names``,
            so the two sequences are expected to be index-aligned.
        category_id: Category id assigned to every detected box. Defaults
            to 91 ("keys").

    Returns:
        dict: ``{"annotations": [...], "categories": [...], "images": [...]}``
        with each ``bbox`` stored as ``[x, y, width, height]``.
    """
    # Custom category ids start at 91 (the original note: COCO ids end at 90)
    categories = [
        {"id": 91, "name": "keys"},
        {"id": 92, "name": "pencil"},
        {"id": 93, "name": "sneakers"}
    ]

    annotations = []
    for idx, image in enumerate(images):
        for x_min, y_min, x_max, y_max in generate_bounding_boxes(image):
            annotations.append({
                "image_id": idx,
                "category_id": category_id,
                # Corner coordinates -> [x, y, width, height]
                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
            })

    # Assemble the final JSON dictionary
    image_entries = [{"id": i, "file_name": file_name} for i, file_name in enumerate(images_names)]
    json_data = {"annotations": annotations, "categories": categories, "images": image_entries}

    return json_data

In [29]:
def process_all_images(images, images_names, category=0):
    """Build a COCO-style annotation dictionary, labelling boxes from the file name.

    Every box detected in an image receives the category whose keyword
    appears in that image's file name.

    Args:
        images: Sequence of image arrays accepted by
            ``generate_bounding_boxes``.
        images_names: File names, index-aligned with ``images``.
        category: Fallback category id used when the file name contains
            none of the known keywords. The default 0 is not listed in
            ``categories``.

    Returns:
        dict: ``{"annotations": [...], "categories": [...], "images": [...]}``
        with each ``bbox`` stored as ``[x, y, width, height]``.
    """
    categories = [
        {"id": 91, "name": "keys"},
        {"id": 92, "name": "pencil"},
        {"id": 93, "name": "sneakers"}
    ]
    # File-name keyword -> category id, checked in this order.
    # NOTE(review): "zapatilas" (single "l") does not match names spelled
    # "zapatillas" -- confirm against the actual file names.
    keyword_categories = (("llaves", 91), ("pencil", 92), ("zapatilas", 93))

    annotations = []
    for idx, image in enumerate(images):
        # The category depends only on the file name, so resolve it once per image.
        file_name = images_names[idx]
        category_id = next(
            (cat_id for keyword, cat_id in keyword_categories if keyword in file_name),
            category,
        )

        for x_min, y_min, x_max, y_max in generate_bounding_boxes(image):
            annotations.append({
                "image_id": idx,
                "category_id": category_id,
                # COCO boxes are [x, y, width, height]; this previously wrote
                # [x_min, x_max, y_min, y_max], disagreeing with process_images.
                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
            })

    # Assemble the final JSON dictionary
    image_entries = [{"id": i, "file_name": name} for i, name in enumerate(images_names)]
    json_data = {"annotations": annotations, "categories": categories, "images": image_entries}

    return json_data

In [91]:
# Generate bounding boxes for a single image
image_path = './images/keys/llaves_142.jpg'
image = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read
if image is None:
    raise FileNotFoundError(f"Could not load image: {image_path}")

# Generate the bounding boxes
bounding_boxes = generate_bounding_boxes(image)

# Draw the boxes on the image (green, 2 px thick)
for box in bounding_boxes:
    cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)

# Show the annotated image in a native window until a key is pressed;
# always tear the window down, even if displaying fails.
try:
    cv2.imshow('Bounding Boxes', image)
    cv2.waitKey(0)
finally:
    cv2.destroyAllWindows()

In [13]:
# Generate bounding boxes for every image in the keys folder
images = []
images_names = []

# Collect the image files, ordered by the number after the last underscore
image_files = glob.glob("./images/keys/*.jpg")
image_files = sorted(image_files, key=lambda x: int(x.split("_")[-1].split(".")[0]))

# Load each file. A name is recorded only when its image loaded, so that
# images[i] and images_names[i] always refer to the same file (the
# annotation image_id is an index into both lists).
for image_file in image_files:
    image = cv2.imread(image_file)
    if image is None:
        print(f"Error: No se pudo cargar la imagen {image_file}")
        continue
    images.append(image)
    # os.path.basename strips the directory with the platform's separator
    # rather than assuming a backslash.
    images_names.append(os.path.basename(image_file))

# Detect boxes and build the annotation dictionary
annotations_data = process_images(images, images_names)

# Save the annotations as JSON
with open("annotations.json", "w") as json_file:
    json.dump(annotations_data, json_file, indent=4)

print("Annotations saved to annotations.json")

Annotations saved to annotations.json


In [14]:
def custom_sort_key(file_path):
    """Return a ``(name, number)`` sort key for paths like ``dir/<name>_<number>.<ext>``.

    Sorting with this key groups files by name and orders them numerically
    (``x_2`` before ``x_10``) instead of lexicographically.

    Args:
        file_path: Path using ``/`` and/or ``\\`` as separators.

    Returns:
        tuple[str, int]: The text before the numeric suffix and the number.

    Raises:
        ValueError: If the file name does not contain ``_<digits>``.
    """
    # Split on both separators: glob on Windows yields backslashes, and the
    # directory part must not leak into the name.
    base_name = re.split(r"[\\/]", file_path)[-1]
    # Drop the extension
    stem = os.path.splitext(base_name)[0]

    # Lazy prefix so names that themselves contain underscores still match
    match = re.match(r'(.+?)_(\d+)', stem)
    if match is None:
        raise ValueError(
            f"Expected a file name like '<name>_<number>.<ext>', got: {file_path!r}"
        )

    name, number = match.groups()
    return name, int(number)

In [30]:
# Generate bounding boxes for every image in the training folder
images = []
images_names = []

# Collect the image files, ordered by (name, number)
image_files = glob.glob("./images/images/train/*.jpg")
image_files = sorted(image_files, key=custom_sort_key)

# Load each file. A name is recorded only when its image loaded, so that
# images[i] and images_names[i] always refer to the same file (the
# annotation image_id is an index into both lists).
for image_file in image_files:
    image = cv2.imread(image_file)
    if image is None:
        print(f"Error: No se pudo cargar la imagen {image_file}")
        continue
    images.append(image)
    # os.path.basename strips the directory with the platform's separator
    # rather than assuming a backslash.
    images_names.append(os.path.basename(image_file))

# Detect boxes and build the annotation dictionary
annotations_data = process_all_images(images, images_names)

# Save the annotations as JSON
with open("all_annotations.json", "w") as json_file:
    json.dump(annotations_data, json_file, indent=4)

# The message previously named annotations.json, which is not the file written here.
print("Annotations saved to all_annotations.json")

Annotations saved to annotations.json
