In [None]:
# === 1. Utilitários mínimos (acurácia, dataset de exemplo, plot e padronização) ===
import numpy as np
import matplotlib.pyplot as plt

def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label collections of identical shape. Plain Python sequences are
        accepted; both inputs are converted with ``np.asarray``.

    Returns
    -------
    float
        Mean of the element-wise equality mask.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Refuse mismatched shapes up front: comparing (n,) with (n, 1) would
    # broadcast to an (n, n) mask and silently report a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    return float((y_true == y_pred).mean())

def make_moons(n=1000, noise=0.30, seed=0):
    """Generate a noisy two-class "two moons" dataset in 2-D.

    Class 0 is an upper half circle; class 1 is the mirrored half circle
    shifted by ``(1.0, 0.4)``. Both arcs are built from the same random
    angles, then Gaussian noise is added to every coordinate.

    Parameters
    ----------
    n : int
        Total number of samples. Class 0 receives ``n // 2`` of them and
        class 1 the remainder, so an odd ``n`` yields exactly ``n`` rows.
    noise : float
        Standard deviation of the Gaussian noise added to the points.
    seed : int
        Seed for the private ``np.random.RandomState``.

    Returns
    -------
    X : ndarray of shape (n, 2)
    y : ndarray of shape (n,), integer labels in {0, 1}
    """
    rng = np.random.RandomState(seed)
    n_upper = n // 2
    n_lower = n - n_upper  # absorbs the extra sample when n is odd

    # For even n this draws exactly n // 2 angles, so the random stream (and
    # therefore the generated data) matches the even-n behaviour unchanged.
    angles = rng.rand(n_lower) * np.pi
    upper_angles = angles[:n_upper]

    upper = np.c_[np.cos(upper_angles), np.sin(upper_angles)]
    lower = np.c_[np.cos(angles), -np.sin(angles)] + [1.0, 0.4]

    X = np.vstack([upper, lower])
    y = np.r_[np.zeros(n_upper, dtype=int), np.ones(n_lower, dtype=int)]
    X += rng.normal(scale=noise, size=X.shape)
    return X, y

def train_test_split(X, y, test_size=0.3, seed=0, stratify=True):
    """Split ``X`` and ``y`` into train and test parts.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis.
    y : array-like
        One label per sample (same length as ``X``).
    test_size : float
        Fraction of the data placed in the test part, in ``[0, 1]``.
    seed : int
        Seed for the private ``np.random.RandomState``.
    stratify : bool
        When true, the split is performed separately inside each class found
        by ``np.unique(y)`` so both parts keep roughly the class proportions.

    Returns
    -------
    X_train, X_test, y_train, y_test : ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different lengths or ``test_size`` lies
        outside ``[0, 1]``.
    """
    X = np.asarray(X)
    # A list `y` would make `y == c` a plain bool and select nothing.
    y = np.asarray(y)
    n = X.shape[0]
    if y.shape[0] != n:
        raise ValueError(f"X has {n} rows but y has {y.shape[0]} labels")
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be within [0, 1], got {test_size!r}")

    rng = np.random.RandomState(seed)
    train_frac = 1 - test_size

    if stratify:
        tr, te = [], []
        for c in np.unique(y):
            ii = np.where(y == c)[0]
            rng.shuffle(ii)
            t = int(round(train_frac * len(ii)))
            tr.append(ii[:t])
            te.append(ii[t:])
        # np.concatenate rejects an empty list (no classes at all).
        tr = np.concatenate(tr) if tr else np.arange(0)
        te = np.concatenate(te) if te else np.arange(0)
        # The per-class chunks are laid out class after class; mix them so the
        # returned rows are not sorted by label. Membership of each part is
        # unaffected, only the row order changes.
        rng.shuffle(tr)
        rng.shuffle(te)
    else:
        idx = np.arange(n)
        rng.shuffle(idx)
        t = int(round(train_frac * n))
        tr, te = idx[:t], idx[t:]
    return X[tr], X[te], y[tr], y[te]

def fit_standardizer(X):
    """Compute per-column standardization statistics for ``X``.

    Returns a ``(mu, sd)`` pair, each reduced over axis 0 with the reduced
    axis kept. ``sd`` carries a ``1e-12`` offset so that dividing by it never
    hits an exact zero.
    """
    reduce_opts = dict(axis=0, keepdims=True)
    center = X.mean(**reduce_opts)
    scale = X.std(**reduce_opts) + 1e-12
    return center, scale

def transform_standardizer(X, mu, sd):
    """Standardize ``X`` with previously fitted statistics: ``(X - mu) / sd``."""
    centered = X - mu
    return centered / sd

def plot_decision_boundary(model, X, y, h=0.03, proba=False, title="Fronteira de decisão"):
    """Draw the model output as filled contours over the 2-D region spanned by ``X``.

    Parameters
    ----------
    model : object
        Must expose ``predict``; ``predict_proba`` is used instead when
        ``proba`` is true and the attribute exists.
    X : ndarray
        Points whose first two columns define the plotted region (padded by
        0.5 on every side).
    y : array-like
        Currently unused: the scatter overlay of the samples is disabled.
    h : float
        Grid step along both axes.
    proba : bool
        Plot class-1 probability rather than hard predictions when available.
    title : str
        Figure title.
    """
    pad = 0.5
    x_lo, x_hi = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_lo, y_hi = X[:, 1].min() - pad, X[:, 1].max() + pad

    x_ticks = np.arange(x_lo, x_hi, h)
    y_ticks = np.arange(y_lo, y_hi, h)
    xx, yy = np.meshgrid(x_ticks, y_ticks)
    grid = np.c_[xx.ravel(), yy.ravel()]

    use_proba = proba and hasattr(model, "predict_proba")
    if use_proba:
        Z = model.predict_proba(grid)
        # Keep only the second column when a per-class matrix comes back.
        if Z.ndim == 2 and Z.shape[1] > 1:
            Z = Z[:, 1]
    else:
        Z = model.predict(grid)
    surface = Z.reshape(xx.shape)

    plt.figure()
    plt.contourf(xx, yy, surface, alpha=0.4)
    # Sample overlay intentionally left disabled:
    # plt.scatter(X[:,0], X[:,1], c=y, edgecolor='k')
    plt.title(title)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.show()

def plot_lines(xs, ys_dict, xlabel="", ylabel="", title=""):
    """Plot several labelled series against a shared x-axis in a new figure.

    ``ys_dict`` maps a legend label to the y-values of one series; every
    series is drawn with circle markers. The figure is shown immediately.
    """
    plt.figure()
    for series_label in ys_dict:
        plt.plot(xs, ys_dict[series_label], marker="o", label=series_label)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


In [None]:

# === 2. Regressão logística => OneHiddenMLP (ReLU + Softmax) ===
import numpy as np

# ReLU (Rectified Linear Unit) activation and its gradient
def _relu(x): return np.maximum(0, x)
def _relu_grad(x): return (x > 0).astype(float)

def _softmax(z):
    z = z - np.max(z, axis=1, keepdims=True)
    exp = np.exp(z)
    return exp / (np.sum(exp, axis=1, keepdims=True) + 1e-12)

class OneHiddenMLP:
    def __init__(self, n_hidden=64, lr=0.05, epochs=40, batch_size=64,
                 l2=1e-4, random_state=0, verbose=False):
        self.n_hidden = n_hidden
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.l2 = l2
        self.random_state = random_state
        self.verbose = verbose
        self.history_ = {}

    def _init_params(self, n_in, n_out):
        rng = np.random.RandomState(self.random_state)
        W1 = rng.randn(n_in, self.n_hidden) * np.sqrt(2.0 / n_in)
        b1 = np.zeros((1, self.n_hidden))
        W2 = rng.randn(self.n_hidden, n_out) * np.sqrt(2.0 / self.n_hidden)
        b2 = np.zeros((1, n_out))
        self.params_ = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}

    def _forward(self, X):
        W1, b1, W2, b2 = self.params_["W1"], self.params_["b1"], self.params_["W2"], self.params_["b2"]
        z1 = X @ W1 + b1
        h1 = _relu(z1)
        scores = h1 @ W2 + b2
        probs = _softmax(scores)
        return probs, {"X": X, "z1": z1, "h1": h1, "scores": scores, "probs": probs}

    def _loss_and_grads(self, cache, y, n_classes):
        X, z1, h1  = cache["X"], cache["z1"], cache["h1"]
        scores, probs= cache["scores"], cache["probs"]
        m = X.shape[0]
        Y = np.zeros((m, n_classes)); Y[np.arange(m), y.astype(int)] = 1.0
        W1, W2 = self.params_["W1"], self.params_["W2"]

        loss = -np.sum(Y * np.log(probs + 1e-12)) / m + 0.5*self.l2*(np.sum(W1*W1)+np.sum(W2*W2))
        dscores = (probs - Y) / m
        dW2 = h1.T @ dscores + self.l2 * W2
        db2 = np.sum(dscores, axis=0, keepdims=True)
        dh1 = dscores @ W2.T
        dz1 = dh1 * _relu_grad(z1)
        dW1 = X.T @ dz1 + self.l2 * W1
        db1 = np.sum(dz1, axis=0, keepdims=True)
        return loss, {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    def fit(self, X, y, X_val=None, y_val=None, early_stopping=False, patience=10):
        n_in = X.shape[1]; n_out = int(np.max(y)) + 1
        self._init_params(n_in, n_out)
        m = X.shape[0]

        hist = {"loss_tr": [], "acc_tr": []}
        if X_val is not None:
            hist.update({"loss_va": [], "acc_va": []})

        best_state = None; best_val = np.inf; wait = 0

        for ep in range(self.epochs):
            idx = np.arange(m); np.random.shuffle(idx)
            for start in range(0, m, self.batch_size):
                batch = idx[start:start+self.batch_size]
                probs, cache = self._forward(X[batch])
                loss, grads = self._loss_and_grads(cache, y[batch], n_out)
                self.params_["W1"] -= self.lr * grads["dW1"]
                self.params_["b1"] -= self.lr * grads["db1"]
                self.params_["W2"] -= self.lr * grads["dW2"]
                self.params_["b2"] -= self.lr * grads["db2"]

            p_tr = self.predict_proba(X); yhat_tr = np.argmax(p_tr, axis=1)
            loss_tr = -np.mean(np.log(p_tr[np.arange(m), y.astype(int)] + 1e-12))
            hist["loss_tr"].append(float(loss_tr))
            hist["acc_tr"].append(float((yhat_tr == y).mean()))

            if X_val is not None:
                pv = self.predict_proba(X_val); yhat_va = np.argmax(pv, axis=1)
                loss_va = -np.mean(np.log(pv[np.arange(X_val.shape[0]), y_val.astype(int)] + 1e-12))
                acc_va = float((yhat_va == y_val).mean())
                hist["loss_va"].append(float(loss_va))
                hist["acc_va"].append(acc_va)

                if early_stopping:
                    if loss_va < best_val - 1e-6:
                        best_val = loss_va
                        best_state = {k: v.copy() for k, v in self.params_.items()}
                        wait = 0
                    else:
                        wait += 1
                        if wait >= patience:
                            if best_state is not None:
                                self.params_ = best_state
                            break

        self.history_ = hist
        return self

    def predict_proba(self, X):
        return self._forward(X)[0]

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def score(self, X, y):
        return accuracy(y, self.predict(X))


In [None]:

# 3. Exemplo de treinamento do MLP em moons
# Noisy two-moons data, with 35% held out (stratified by class) for validation.
X, y = make_moons(n=1200, noise=0.30, seed=42)
Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=0.35, seed=3, stratify=True)

# Standardization statistics are fitted on the training split only and then
# applied unchanged to both splits.
mu, sd = fit_standardizer(Xtr)
Xtr_s, Xva_s = (transform_standardizer(part, mu, sd) for part in (Xtr, Xva))

mlp = OneHiddenMLP(
    n_hidden=64, lr=0.05, epochs=40, batch_size=64,
    l2=1e-4, random_state=0, verbose=False,
)
mlp.fit(Xtr_s, ytr, X_val=Xva_s, y_val=yva, early_stopping=True, patience=8)

print("Train acc:", mlp.score(Xtr_s, ytr), "| Val acc:", mlp.score(Xva_s, yva))

# One x position per recorded epoch (early stopping may record fewer than 40).
history = mlp.history_
epochs = np.arange(1, len(history["loss_tr"]) + 1)

ys = {name: history[name] for name in ("loss_tr", "loss_va") if name in history}
plot_lines(epochs, ys, xlabel="epoch", ylabel="loss", title="Loss vs epoch")

ys2 = {name: history[name] for name in ("acc_tr", "acc_va") if name in history}
plot_lines(epochs, ys2, xlabel="epoch", ylabel="accuracy", title="Accuracy vs epoch")

plot_decision_boundary(mlp, Xtr_s, ytr, title="MLP (train space, standardized)")


In [None]:

# === 4. Varredura de hiperparâmetros (largura & taxas de aprendizado) ===
import matplotlib.pyplot as plt
# Grid over hidden-layer widths and learning rates. Every configuration is
# trained with early stopping and scored by accuracy on the validation split.
widths = [8, 16, 32, 64, 128]
lrs = [0.01, 0.03, 0.05, 0.1]
results = {}  # legend label -> validation accuracies, ordered like `lrs`

for w in widths:
    vals = []
    for lr in lrs:
        m = OneHiddenMLP(n_hidden=w, lr=lr, epochs=30, batch_size=64, l2=1e-4, random_state=0)
        m.fit(Xtr_s, ytr, X_val=Xva_s, y_val=yva, early_stopping=True, patience=5)
        vals.append(m.score(Xva_s, yva))
    results[f"width={w}"] = vals

# Reuse the shared helper instead of repeating its figure/legend/grid code:
# one line per width, learning rate on the x-axis.
plot_lines(
    lrs,
    results,
    xlabel="Taxa de aprendizado",
    ylabel="Val accuracy",
    title="Val acc vs LR para diferentes larguras",
)


In [None]:


# Gradient check for W2: compare the analytical gradient from backprop with
# central finite differences on a 5-sample batch. L2 is disabled so the
# analytical loss matches the penalty-free loss used by `loss_only`.
np.random.seed(1)
Xt = Xtr_s[:5]; yt = ytr[:5]
n_classes = int(np.max(y) + 1)
m = OneHiddenMLP(n_hidden=5, lr=0.01, epochs=1, batch_size=5, l2=0.0, random_state=0)
m._init_params(Xt.shape[1], n_classes)

probs, cache = m._forward(Xt)
loss, grads = m._loss_and_grads(cache, yt, n_classes)
print("loss (analytical):", loss)

# One-hot targets are the same for every probe, so build them once.
Y_onehot = np.zeros((Xt.shape[0], n_classes))
Y_onehot[np.arange(Xt.shape[0]), yt] = 1.0

def loss_only(W2_new):
    """Install ``W2_new`` as the model's W2 and return the cross-entropy on ``(Xt, yt)``.

    The replaced weights stay installed on ``m``; the caller restores them.
    """
    m.params_["W2"] = W2_new
    P, _ = m._forward(Xt)
    return -np.sum(Y_onehot * np.log(P + 1e-12)) / Xt.shape[0]

W2 = m.params_["W2"].copy()
num = np.zeros_like(W2)
eps = 1e-5
try:
    for i in range(W2.shape[0]):
        for j in range(W2.shape[1]):
            # Perturb a single weight in both directions (central difference).
            W2p = W2.copy(); W2p[i, j] += eps
            W2m = W2.copy(); W2m[i, j] -= eps
            lp = loss_only(W2p)
            lm = loss_only(W2m)
            num[i, j] = (lp - lm) / (2 * eps)
finally:
    # Put the unperturbed weights back even if a probe raised.
    m.params_["W2"] = W2

rel_err = np.linalg.norm(num - grads["dW2"]) / (np.linalg.norm(num) + np.linalg.norm(grads["dW2"]) + 1e-12)
print("Relative error (W2):", rel_err)
# Make the check fail loudly under Restart & Run All instead of relying on
# someone reading the printed number.
assert rel_err < 1e-6, f"W2 gradient check failed: relative error {rel_err:.3e}"
