In [None]:
!pip install pycuda
!pip install matplotlib
!pip install scikit-learn


In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import time

# Network and training hyperparameters.
# Hidden-layer width; baked into the CUDA source as a #define.
HIDDEN_NODES = 128
# Output-layer width; baked into the CUDA source and used to size the device
# output/target buffers. NOTE(review): presumably the number of distinct
# people in the dataset -- confirm it matches the one-hot label width.
OUTPUT_NODES = 20
# Step size used by the backpropagation kernel's weight updates.
LEARNING_RATE = 0.01
# Number of full passes over the training split.
NUM_EPOCHS = 100
# Slice size used when walking the training split; the training loop still
# feeds the samples of each slice to the GPU one at a time.
BATCH_SIZE = 64

In [None]:
base_path = '/content/drive/MyDrive/faces'
def load_images(base_path):
    """Load the grayscale .pgm face images, grouped by resolution.

    ``base_path`` is expected to contain one sub-directory per person; the
    sub-directory name is used as the label of every image inside it. Files
    are matched case-insensitively on the ``.pgm`` extension. Images that
    cannot be read, or whose shape is not one of 30x32, 60x64 or 120x128,
    are skipped.

    Directory entries are visited in sorted order. ``os.listdir`` returns
    names in arbitrary order, so without sorting the sample order (and any
    seeded split derived from it) could change between runs or machines.

    Args:
        base_path: Root directory of the dataset.

    Returns:
        Three ``(images, labels)`` tuples, for 30x32, 60x64 and 120x128 in
        that order. ``images`` is a list of flattened pixel arrays and
        ``labels`` the parallel list of person-folder names.
    """
    # One (images, labels) pair per supported image shape.
    buckets = {
        (30, 32): ([], []),
        (60, 64): ([], []),
        (120, 128): ([], []),
    }

    for person_folder in sorted(os.listdir(base_path)):
        person_path = os.path.join(base_path, person_folder)
        if not os.path.isdir(person_path):
            continue
        for image_file in sorted(os.listdir(person_path)):
            if not image_file.lower().endswith('.pgm'):
                continue
            image_path = os.path.join(person_path, image_file)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if image is None:
                # imread signals an unreadable file by returning None.
                continue
            bucket = buckets.get(image.shape)
            if bucket is None:
                # Not one of the three supported resolutions.
                continue
            images, labels = bucket
            images.append(image.flatten())
            labels.append(person_folder)

    return buckets[(30, 32)], buckets[(60, 64)], buckets[(120, 128)]

# Read the dataset once and unpack the (images, labels) pair of each resolution.
(images_30x32, labels_30x32), (images_60x64, labels_60x64), (images_120x128, labels_120x128) = load_images(base_path)

# Report how many images were found per resolution.
print(
    f"Imagens 30x32: {len(images_30x32)}",
    f"Imagens 60x64: {len(images_60x64)}",
    f"Imagens 120x128: {len(images_120x128)}",
    sep="\n",
)


def _to_labelled_frame(pixel_rows, person_labels):
    """Return a DataFrame with one row per image plus a trailing 'label' column."""
    frame = pd.DataFrame(pixel_rows)
    frame['label'] = person_labels
    return frame


df_30x32 = _to_labelled_frame(images_30x32, labels_30x32)
df_60x64 = _to_labelled_frame(images_60x64, labels_60x64)
df_120x128 = _to_labelled_frame(images_120x128, labels_120x128)

In [None]:
def split_data(df):
    """Split a labelled image frame into training and validation arrays.

    Args:
        df: DataFrame whose 'label' column holds the class and whose other
            columns hold the features.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``. A frame with no rows
        yields four empty arrays. Otherwise 20% of the rows go to the
        validation side, using a fixed random_state of 42.
    """
    if not len(df):
        return tuple(np.array([]) for _ in range(4))

    features = df.drop('label', axis=1).values
    targets = df['label'].values
    return tuple(train_test_split(features, targets, test_size=0.2, random_state=42))

# Train/validation split of every resolution, keyed by its resolution tag.
splits = {
    tag: split_data(frame)
    for tag, frame in (
        ('30x32', df_30x32),
        ('60x64', df_60x64),
        ('120x128', df_120x128),
    )
}

In [None]:
def get_cuda_source(input_nodes):
    """Return CUDA C source for a one-hidden-layer sigmoid network.

    The layer sizes and learning rate are baked into the source as ``#define``
    constants: ``input_nodes`` comes from the argument, while ``HIDDEN_NODES``,
    ``OUTPUT_NODES`` and ``LEARNING_RATE`` are read from the module-level
    constants of the same names.

    Two kernels are generated:

    * ``forward_pass`` fills ``hidden_output`` and ``final_output`` for one
      input vector.
    * ``backpropagation`` computes the output and hidden deltas for one target
      vector and updates both weight matrices in place.

    Weight layout, as indexed by the kernels: ``hidden_weights`` is row-major
    ``[HIDDEN_NODES][INPUT_NODES]`` and ``output_weights`` is row-major
    ``[OUTPUT_NODES][HIDDEN_NODES]``.

    The phases of each kernel are separated by ``__syncthreads()``, which only
    synchronises the threads of a single block. The kernels therefore do all
    their work in block 0 (any additional blocks return immediately) and walk
    the nodes with a block-stride loop, so they give the same result for any
    block width, including one narrower than a layer.

    Args:
        input_nodes: Number of input values per sample.

    Returns:
        str: The CUDA source text, ready to be compiled.
    """
    # Emit the learning rate as a single-precision literal so the weight
    # updates are not promoted to double-precision arithmetic on the device.
    learning_rate_literal = f"{float(LEARNING_RATE)!r}f"

    return f"""
    #define INPUT_NODES {input_nodes}
    #define HIDDEN_NODES {HIDDEN_NODES}
    #define OUTPUT_NODES {OUTPUT_NODES}
    #define LEARNING_RATE {learning_rate_literal}

    __device__ float sigmoid(float x) {{
        return 1.0f / (1.0f + expf(-x));
    }}

    __global__ void forward_pass(float *input, float *hidden_weights, float *hidden_output, float *output_weights, float *final_output) {{
        // __syncthreads() does not order different blocks, so only block 0 works.
        if (blockIdx.x != 0) return;

        // Hidden layer: one weighted sum + sigmoid per hidden node.
        for (int idx = threadIdx.x; idx < HIDDEN_NODES; idx += blockDim.x) {{
            float sum = 0.0f;
            for (int i = 0; i < INPUT_NODES; ++i) {{
                sum += input[i] * hidden_weights[idx * INPUT_NODES + i];
            }}
            hidden_output[idx] = sigmoid(sum);
        }}

        // Every hidden activation must be written before the output layer reads it.
        __syncthreads();

        // Output layer: one weighted sum + sigmoid per output node.
        for (int idx = threadIdx.x; idx < OUTPUT_NODES; idx += blockDim.x) {{
            float sum = 0.0f;
            for (int i = 0; i < HIDDEN_NODES; ++i) {{
                sum += hidden_output[i] * output_weights[idx * HIDDEN_NODES + i];
            }}
            final_output[idx] = sigmoid(sum);
        }}
    }}

    __global__ void backpropagation(float *input, float *hidden_weights, float *hidden_output, float *output_weights, float *final_output, float *target, float *hidden_delta, float *output_delta) {{
        // __syncthreads() does not order different blocks, so only block 0 works.
        if (blockIdx.x != 0) return;

        // Output deltas: (target - output) times the sigmoid derivative.
        for (int idx = threadIdx.x; idx < OUTPUT_NODES; idx += blockDim.x) {{
            float error = target[idx] - final_output[idx];
            output_delta[idx] = error * final_output[idx] * (1.0f - final_output[idx]);
        }}

        __syncthreads();

        // Hidden deltas: output deltas propagated back through the
        // not-yet-updated output weights.
        for (int idx = threadIdx.x; idx < HIDDEN_NODES; idx += blockDim.x) {{
            float error = 0.0f;
            for (int i = 0; i < OUTPUT_NODES; ++i) {{
                error += output_delta[i] * output_weights[i * HIDDEN_NODES + idx];
            }}
            hidden_delta[idx] = error * hidden_output[idx] * (1.0f - hidden_output[idx]);
        }}

        // All hidden deltas must be computed before the output weights change.
        __syncthreads();

        // Update the output-layer weights, one row per output node.
        for (int idx = threadIdx.x; idx < OUTPUT_NODES; idx += blockDim.x) {{
            for (int i = 0; i < HIDDEN_NODES; ++i) {{
                output_weights[idx * HIDDEN_NODES + i] += LEARNING_RATE * output_delta[idx] * hidden_output[i];
            }}
        }}

        // Update the hidden-layer weights, one row per hidden node.
        for (int idx = threadIdx.x; idx < HIDDEN_NODES; idx += blockDim.x) {{
            for (int i = 0; i < INPUT_NODES; ++i) {{
                hidden_weights[idx * INPUT_NODES + i] += LEARNING_RATE * hidden_delta[idx] * input[i];
            }}
        }}
    }}
    """


In [None]:
# Resolution to train on. Change this to '60x64' or '120x128' to use another
# image size; it selects the matching entry of `splits`.
resolution = '30x32'

# One input node per pixel of the selected resolution.
img_rows, img_cols = (int(side) for side in resolution.split('x'))
INPUT_NODES = img_rows * img_cols

# Compile the kernels for this input size and fetch their entry points.
mod = SourceModule(get_cuda_source(INPUT_NODES))
forward_pass = mod.get_function("forward_pass")
backpropagation = mod.get_function("backpropagation")

# Random initial weights in [-0.5, 0.5). The array shapes follow the kernels'
# row-major indexing (weights[node * fan_in + i]): one row per receiving node.
np.random.seed(42)
hidden_weights = (np.random.rand(HIDDEN_NODES, INPUT_NODES) - 0.5).astype(np.float32)
output_weights = (np.random.rand(OUTPUT_NODES, HIDDEN_NODES) - 0.5).astype(np.float32)

# Device buffers: one float32 per node for activations, deltas and targets,
# plus the two weight matrices.
float32_size = np.dtype(np.float32).itemsize
d_input = cuda.mem_alloc(INPUT_NODES * float32_size)
d_hidden_weights = cuda.mem_alloc(hidden_weights.nbytes)
d_output_weights = cuda.mem_alloc(output_weights.nbytes)
d_hidden_output = cuda.mem_alloc(HIDDEN_NODES * float32_size)
d_final_output = cuda.mem_alloc(OUTPUT_NODES * float32_size)
d_target = cuda.mem_alloc(OUTPUT_NODES * float32_size)
d_hidden_delta = cuda.mem_alloc(HIDDEN_NODES * float32_size)
d_output_delta = cuda.mem_alloc(OUTPUT_NODES * float32_size)

# Upload the initial weights; the kernels update them in place on the device.
cuda.memcpy_htod(d_hidden_weights, hidden_weights)
cuda.memcpy_htod(d_output_weights, output_weights)


In [None]:
# One-hot encode the labels of the selected resolution. The encoder is fitted
# on the training labels only and then reused for the validation labels.
encoder = OneHotEncoder(sparse_output=False)
encoded_splits = {}
y_train_encoded = encoder.fit_transform(splits[resolution][2].reshape(-1, 1))
y_val_encoded = encoder.transform(splits[resolution][3].reshape(-1, 1))

# Each encoded row is later copied into a device buffer allocated for exactly
# OUTPUT_NODES floats, so a different width must be rejected here instead of
# being uploaded with the wrong size.
if y_train_encoded.shape[1] != OUTPUT_NODES:
    raise ValueError(
        f"Found {y_train_encoded.shape[1]} classes in the training labels, "
        f"but the network was built with OUTPUT_NODES={OUTPUT_NODES}"
    )

encoded_splits[resolution] = {
    'X_train': splits[resolution][0],
    'X_val': splits[resolution][1],
    'y_train': y_train_encoded,
    'y_val': y_val_encoded
}
def evaluate(X_val, y_val):
    """Return the classification accuracy of the current device weights.

    Each sample is cast to float32, uploaded to the device input buffer and
    run through the ``forward_pass`` kernel. A prediction counts as correct
    when the index of the largest network output equals the index of the
    largest entry of the matching ``y_val`` row.

    Args:
        X_val: Sequence of input vectors, one per sample.
        y_val: Sequence of one-hot target vectors, parallel to ``X_val``.

    Returns:
        float: Fraction of correctly classified samples, or 0.0 when
        ``X_val`` is empty (instead of dividing by zero).
    """
    n_samples = len(X_val)
    if n_samples == 0:
        return 0.0

    # Host buffer reused for every sample; memcpy_dtoh overwrites it fully.
    final_output = np.empty(OUTPUT_NODES, dtype=np.float32)
    launch_block = (max(HIDDEN_NODES, OUTPUT_NODES), 1, 1)

    correct_predictions = 0
    for i in range(n_samples):
        cuda.memcpy_htod(d_input, X_val[i].astype(np.float32))
        forward_pass(d_input, d_hidden_weights, d_hidden_output, d_output_weights, d_final_output, block=launch_block)
        cuda.memcpy_dtoh(final_output, d_final_output)
        if np.argmax(final_output) == np.argmax(y_val[i]):
            correct_predictions += 1

    return correct_predictions / n_samples


In [None]:
# Train for NUM_EPOCHS epochs, recording the validation accuracy after each.
accuracies = []
start_time = time.time()
X_train = encoded_splits[resolution]['X_train']
X_val = encoded_splits[resolution]['X_val']
y_train_encoded = encoded_splits[resolution]['y_train']
y_val_encoded = encoded_splits[resolution]['y_val']

# The kernels separate their layers with __syncthreads(), which only spans one
# block, so each launch uses a single block wide enough to give every hidden
# and output node its own thread (the same configuration evaluate() uses).
threads_per_block = max(HIDDEN_NODES, OUTPUT_NODES)
blocks_hidden = 1
blocks_output = 1

# Convert to float32 once instead of once per sample per epoch. Rows of a
# C-ordered array are contiguous, so they can be uploaded directly.
X_train_f32 = X_train.astype(np.float32, order='C')
y_train_f32 = y_train_encoded.astype(np.float32, order='C')

for epoch in range(NUM_EPOCHS):
    for i in range(0, len(X_train_f32), BATCH_SIZE):
        batch_X = X_train_f32[i:i+BATCH_SIZE]
        batch_y = y_train_f32[i:i+BATCH_SIZE]

        # Weights are updated after every single sample of the slice.
        for j in range(len(batch_X)):
            cuda.memcpy_htod(d_input, batch_X[j])
            cuda.memcpy_htod(d_target, batch_y[j])

            forward_pass(d_input, d_hidden_weights, d_hidden_output, d_output_weights, d_final_output, block=(threads_per_block, 1, 1), grid=(blocks_hidden, 1, 1))
            backpropagation(d_input, d_hidden_weights, d_hidden_output, d_output_weights, d_final_output, d_target, d_hidden_delta, d_output_delta, block=(threads_per_block, 1, 1), grid=(blocks_output, 1, 1))

    accuracy = evaluate(X_val, y_val_encoded)
    accuracies.append(accuracy)

end_time = time.time()
execution_time = end_time - start_time
