# Universidad del Valle de Guatemala
## Facultad de Ingeniería
### Departamento de Computación

---

# Laboratorio 4: CNN

**Integrantes:**
- Diego Alexander Hernández Silvestre, 21270
- Linda Inés Jiménez Vides, 21169

**Curso:** Data Science  
**Sección:** 10  

---

Guatemala, 22 de agosto de 2024


In [4]:
import opendatasets as od
import pandas as pd
import os
import zipfile
import matplotlib.pyplot as plt
from matplotlib.image import imread
import random
from os import listdir
import shutil
import numpy as np
import keras.preprocessing.image as kerasImg
import keras.layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras import ops
from PIL import Image
from collections import Counter

### Inciso 1. 🖼️ Carga y visualización de datos

Se extraen las labels de las imágenes, que en este dataset corresponden a los números del 0 al 9, y además se cargan las imágenes.

In [None]:
# Root folder of the PolyMNIST data. The code expects a 'train' and a 'test'
# folder underneath it, each with five sub-folders named m0 .. m4.
baseDir = './data/PolyMNIST/MMNIST/'
trainDirs, testDirs = (
    [os.path.join(baseDir, split, f'm{i}') for i in range(5)]
    for split in ('train', 'test')
)

def loadImages(folder):
    """Load every image file in *folder* together with its label.

    The label is taken from the file name: the name is split on '.' and the
    second piece is converted to ``int`` (e.g. ``'12.7.png'`` gives 7).
    NOTE(review): this presumes names of the form <index>.<label>.<ext> —
    confirm against the dataset.

    Args:
        folder: Path of a directory that directly contains the image files.

    Returns:
        tuple: ``(images, labels)`` where ``images`` is a list with one NumPy
        array per file and ``labels`` is the list of integer labels, both in
        sorted file-name order.

    Raises:
        OSError: If a file cannot be opened or identified as an image.
        IndexError: If a file name contains no '.'.
        ValueError: If the second name component is not an integer.
    """
    images = []
    labels = []
    # os.listdir gives no ordering guarantee; sorting keeps the result
    # reproducible from run to run.
    for filename in sorted(os.listdir(folder)):
        # Image.open raises on failure (it never returns None), so no None
        # check is needed; the context manager releases the file handle.
        with Image.open(os.path.join(folder, filename)) as img:
            images.append(np.array(img))
        labels.append(int(filename.split('.')[1]))
    return images, labels

# Collect the images and labels of every training folder into two flat lists.
trainImages, trainLabels = [], []
for trainFolder in trainDirs:  # deliberately not `dir`, which would shadow the builtin
    images, labels = loadImages(trainFolder)
    trainImages.extend(images)
    trainLabels.extend(labels)

# Same for the test folders.
testImages, testLabels = [], []
for testFolder in testDirs:
    images, labels = loadImages(testFolder)
    testImages.extend(images)
    testLabels.extend(labels)

Una vez cargadas, se visualizan algunos ejemplos para ver sus características. 

In [None]:
# Show the first 25 training images in a 5x5 grid, each titled with its label
# and its size.
plt.figure(figsize=(10, 10))
for i in range(25):
    # subplot positions are 1-based, hence i + 1
    plt.subplot(5, 5, i + 1)
    plt.imshow(trainImages[i])  
    # The three-way unpack requires a 3-D array; the first two axes are taken
    # as height and width, the third is ignored.
    height, width, _ = trainImages[i].shape
    plt.title(f'Label: {trainLabels[i]}\nSize: {width}x{height}')  
    plt.axis('off')
plt.show()

Guardamos las imágenes y sus etiquetas en formato binario (`.npy`).

In [None]:
# Write the loaded images and labels to .npy files. The paths are relative,
# so the files are created in the current working directory.
np.save('MMNIST_train_images.npy', trainImages)
np.save('MMNIST_train_labels.npy', trainLabels)
np.save('MMNIST_test_images.npy', testImages)
np.save('MMNIST_test_labels.npy', testLabels)

Se cargan desde los archivos binarios para que su uso sea más rápido.

In [None]:
# Read the four .npy files back. Each name is rebound to the value returned
# by np.load, replacing whatever it referred to before.
trainImages = np.load('MMNIST_train_images.npy')
trainLabels = np.load('MMNIST_train_labels.npy')
testImages = np.load('MMNIST_test_images.npy')
testLabels = np.load('MMNIST_test_labels.npy')

## Inciso 2. 🔍 EDA (Exploratory Data Analysis)

Se identifica el tamaño del dataset. 

In [None]:
def countImagesInDirectory(directory):
    """Count the entries inside each immediate sub-folder of *directory*.

    Prints a header line, then one line per sub-folder with its entry count.
    Every entry of a sub-folder is counted, whatever its type; items of
    *directory* that are not folders are skipped.

    Args:
        directory: Path whose immediate sub-folders are inspected.

    Returns:
        int: Sum of the entry counts over all sub-folders.
    """
    print(f'üîç Contando im√°genes en: {directory}')
    runningTotal = 0
    candidatePaths = (os.path.join(directory, name) for name in os.listdir(directory))
    for subdirPath in candidatePaths:
        if not os.path.isdir(subdirPath):
            continue
        numImages = len(os.listdir(subdirPath))
        print(f'üìÇ N√∫mero de im√°genes en {subdirPath}: {numImages}')
        runningTotal += numImages
    return runningTotal

# Count the entries under the train and the test folder (train first) and
# report both totals.
trainDir, testDir = (os.path.join(baseDir, split) for split in ('train', 'test'))
totalTrainImages, totalTestImages = map(countImagesInDirectory, (trainDir, testDir))

print(f'üìà Total de im√°genes en el conjunto de entrenamiento: {totalTrainImages}')
print(f'üìà Total de im√°genes en el conjunto de prueba: {totalTestImages}')

Se observan los tamaños que tienen las imágenes.

In [None]:
def getImageSizes(directory):
    """Collect the PIL ``size`` of every image stored below *directory*.

    Each immediate sub-folder of *directory* is walked recursively, so image
    files may sit at any depth inside it (for example ``train/m0/<file>``).
    Files located directly in *directory* are ignored.

    Args:
        directory: Root path whose sub-folders are scanned.

    Returns:
        list: One ``img.size`` value per image file found.

    Raises:
        OSError: If a file cannot be opened or identified as an image.
    """
    sizes = []
    for entry in os.listdir(directory):
        subdir_path = os.path.join(directory, entry)
        if not os.path.isdir(subdir_path):
            continue
        # Walk the whole sub-tree: a single extra os.listdir level would hand
        # nested folders to Image.open, which fails on directories.
        for current_dir, _dir_names, file_names in os.walk(subdir_path):
            for file_name in file_names:
                with Image.open(os.path.join(current_dir, file_name)) as img:
                    sizes.append(img.size)
    return sizes

# NOTE(review): this root contains the 'train' and 'test' folders, whose image
# files sit one level further down in m0..m4 (see trainDirs/testDirs), so
# getImageSizes has to reach files nested two levels below the path given here.
imageSizes = getImageSizes('./data/PolyMNIST/MMNIST/')
# Prints the complete list, i.e. one size entry per image found.
print(f'üñºÔ∏è Tama√±os de las im√°genes en el dataset: {imageSizes}')

Se analiza la distribución de los tamaños.

In [None]:
def analyzeSizes(sizes):
    """Print the mean and standard deviation of image widths and heights.

    Args:
        sizes: Iterable of two-element items; the first element of each item
            is treated as the width and the second as the height.

    Returns:
        None. The statistics are printed with two decimals, preceded by a
        header line.
    """
    # Transpose the pairs into one tuple of widths and one of heights.
    widths, heights = zip(*sizes)
    avg_width, avg_height = np.mean(widths), np.mean(heights)
    std_width, std_height = np.std(widths), np.std(heights)

    print(f'üîç An√°lisis de tama√±os:')
    print(f'Promedio de ancho: {avg_width:.2f} px')
    print(f'Promedio de alto: {avg_height:.2f} px')
    print(f'Desviaci√≥n est√°ndar de ancho: {std_width:.2f} px')
    print(f'Desviaci√≥n est√°ndar de alto: {std_height:.2f} px')

# Print the width/height statistics for the sizes collected in imageSizes.
analyzeSizes(imageSizes)

También se analiza la distribución del dataset para ver si está balanceado. 

In [None]:
# Put the labels of each split into a one-column DataFrame and count how many
# times every label value occurs.
etiquetasTrainDf = pd.DataFrame(trainLabels, columns=['Etiqueta'])
etiquetasTestDf = pd.DataFrame(testLabels, columns=['Etiqueta'])
distribucionTrain = etiquetasTrainDf['Etiqueta'].value_counts()
distribucionTest = etiquetasTestDf['Etiqueta'].value_counts()

plt.figure(figsize=(10, 6))
# Bars are 0.4 wide and shifted by -0.2 / +0.2 around each label value, so the
# train and test bars of the same label sit side by side.
plt.bar(distribucionTrain.index - 0.2, distribucionTrain.values, width=0.4, color='blue', alpha=0.7, label='Entrenamiento')
plt.bar(distribucionTest.index + 0.2, distribucionTest.values, width=0.4, color='red', alpha=0.7, label='Prueba')

plt.xlabel('D√≠gito')
plt.ylabel('Frecuencia')
plt.title('Distribuci√≥n de los d√≠gitos en los conjuntos de datos')
# One tick per integer from 0 to 9.
plt.xticks(range(10))  
plt.legend()
plt.show()

### 🧹 Preprocesamiento de las imágenes

Redimensionamiento

In [None]:
def resizeImages(imageArray, new_size=(28, 28)):
    """Resize every image in *imageArray* with PIL and stack the results.

    Args:
        imageArray: Iterable of image arrays accepted by ``Image.fromarray``.
        new_size: Size passed unchanged to ``Image.resize``; defaults to
            ``(28, 28)``.

    Returns:
        np.ndarray: Array built from the resized images, in input order.
    """
    return np.array([
        np.array(Image.fromarray(original).resize(new_size))
        for original in imageArray
    ])

# Resize both splits (train first, then test) to the same target size and
# report the shapes of the resulting arrays.
newSize = (28, 28)
resizedTrainImages, resizedTestImages = (
    resizeImages(split, newSize) for split in (trainImages, testImages)
)

print(f'üñºÔ∏è Tama√±o de im√°genes redimensionadas de entrenamiento: {resizedTrainImages.shape}')
print(f'üñºÔ∏è Tama√±o de im√°genes redimensionadas de prueba: {resizedTestImages.shape}')

Filtro de escala de grises (los colores no son relevantes)

In [None]:
def convertToGrayscale(imageArray):
    """Convert every image in *imageArray* to PIL mode ``'L'`` and stack them.

    Args:
        imageArray: Iterable of image arrays accepted by ``Image.fromarray``.

    Returns:
        np.ndarray: Array built from the converted images, in input order.
    """
    return np.array([
        np.array(Image.fromarray(colorImage).convert('L'))
        for colorImage in imageArray
    ])

# Convert the resized train and test images to grayscale.
grayscaleTrainImages = convertToGrayscale(resizedTrainImages)
grayscaleTestImages = convertToGrayscale(resizedTestImages)

# Report the shapes of the converted arrays.
print(f'üñºÔ∏è Tama√±o de im√°genes en escala de grises de entrenamiento: {grayscaleTrainImages.shape}')
print(f'üñºÔ∏è Tama√±o de im√°genes en escala de grises de prueba: {grayscaleTestImages.shape}')

# Save both grayscale arrays as .npy files in the current working directory.
np.save('MMNIST_grayscale_scaled_train_images.npy', grayscaleTrainImages)
# NOTE(review): unlike the train file, this name does not contain 'test'.
# Confirm whether that is intended before renaming; other code may load it.
np.save('MMNIST_grayscale_scaled_images.npy', grayscaleTestImages)