In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
from pathlib import Path
import kagglehub

path = Path(kagglehub.dataset_download("zalando-research/fashionmnist"))

train_csv = path / "fashion-mnist_train.csv"
test_csv  = path / "fashion-mnist_test.csv"

df_train = pd.read_csv(train_csv)
df_test  = pd.read_csv(test_csv)

In [3]:
df_train.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X_train = np.array(df_train.drop(columns=["label"])).T
Y_train    = np.array(df_train["label"]).reshape(1, -1)
X_test  = np.array(df_test.drop(columns=["label"])).T
Y_test     = np.array(df_test["label"]).reshape(1, -1)

In [5]:
print(X_train[0].shape)
print(X_train[:, 0].shape)
print(Y_train.shape)

(60000,)
(784,)
(1, 60000)


In [6]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
Y_train

array([[2, 9, 6, ..., 8, 8, 7]], dtype=int64)

In [8]:
def init_params():
    # gerar valores aleatorios para os parametros entre -0.5 e 0.5
    # 10 neuronios em cada camada
    # segunda coluna representa os 784 pixels de cada imagem
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    
    return W1, b1, W2, b2

In [9]:
def ReLu(Z):
    return np.maximum(0, Z)

def softmax(Z):
    Z_max = np.max(Z, axis=0, keepdims=True)
    exp_Z = np.exp(Z - Z_max)
    return exp_Z / np.sum(exp_Z, axis=0, keepdims=True)

In [10]:
def forward_prop(X, W1, b1, W2, b2):
    # camada de entrada para camada oculta
    Z1 = W1.dot(X) + b1
    # função de ativação ReLu
    A1 = ReLu(Z1)
    # camada oculta para camada de saída
    Z2 = W2.dot(A1) + b2
    # função de ativação softmax = probabilidade de cada classe
    A2 = softmax(Z2)
    
    return Z1, A1, Z2, A2

In [11]:
def one_hot(Y):
    # criar matriz de zeros com dimensoes (numero de classes, numero de exemplos)
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    # definir 1 na posicao correta para cada exemplo
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

In [12]:
def ReLU_deriv(Z):
    # sera 1 se Z > 0, senao 0, basta pensar no desenho da funcao
    return Z > 0

In [13]:
def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    m = Y.size
    one_hot_Y = one_hot(Y)
    # derivada da loss em relacao a Z2
    dZ2 = A2 - one_hot_Y
    # derivadas dos parametros da camada 2
    # derivada do peso da camada 2
    dW2 = 1 / m * dZ2.dot(A1.T)
    # derivada do bias da camada 2
    db2 = 1 / m * np.sum(dZ2)
    # derivadas da camada 1
    # derivada da loss em relacao a Z1
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    # derivada do peso da camada 1
    dW1 = 1 / m * dZ1.dot(X.T)
    # derivada do bias da camada 1
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

In [14]:
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    # atualiza os parametros com a taxa de aprendizado alpha multiplicada pelas derivadas calculadas    
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

In [15]:
def get_predictions(A2):
    # retorna o indice da classe com maior probabilidade
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    # print(predictions, Y)
    acc = np.sum(predictions == Y) / Y.size
    return round(float(acc * 100), 2)

def gradient_descent(X, Y, alpha, iterations):
    # inicializar parametros
    W1, b1, W2, b2 = init_params()
    # realizar o treinamento
    for i in range(iterations):
        # propagacao para frente
        Z1, A1, Z2, A2 = forward_prop(X, W1, b1, W2, b2)
        # propagacao para tras - calculo de derivadas
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        # atualizar parametros
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(float(get_accuracy(predictions, Y)))
    # retornar os parametros treinados
    return W1, b1, W2, b2

In [16]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.10, 500)

Iteration:  0
10.42
Iteration:  10
10.0
Iteration:  20
10.0
Iteration:  30
10.0
Iteration:  40
10.0
Iteration:  50
10.0
Iteration:  60
10.0
Iteration:  70
10.0
Iteration:  80
10.0
Iteration:  90
10.0


KeyboardInterrupt: 