In [None]:
import numpy as np
import pandas as pd

np.random.seed(42)

In [None]:
# Load the Iris table from the CSV file that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer="iris.csv")

In [None]:
# Show how many rows belong to each value of the "class" column.
data.loc[:, "class"].value_counts()

In [None]:
# Integer id assigned to each species name found in the label column.
CLASS_IDS = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}

np_data = data.to_numpy()
# Prepend a column of ones so the first weight row plays the role of the bias.
x = np.c_[np.ones(len(np_data)), np_data]
# Recode the labels in the last column only, so a feature value can never be
# rewritten by accident (the original masked the whole array).
for species, class_id in CLASS_IDS.items():
    x[x[:, -1] == species, -1] = class_id
# Shuffle the rows in place (features and label move together) before splitting.
np.random.shuffle(x)

# Rows 0-99 -> training, 100-124 -> validation, the remainder -> test.
# Features are cast to float and labels to int so that no split keeps the
# object dtype of the combined array.
x_train, y_train = x[:100, :-1].astype(float), x[:100, -1].astype(int)
x_valid, y_valid = x[100:125, :-1].astype(float), x[100:125, -1].astype(int)
x_test, y_test = x[125:, :-1].astype(float), x[125:, -1].astype(int)

In [None]:
def batch_GD(n_inputs, n_outputs):
    """Return a freshly initialised weight matrix.

    Despite its name, this function performs no gradient descent: it only
    draws an ``(n_inputs, n_outputs)`` array from the standard normal
    distribution using NumPy's global random state.
    """
    return np.random.randn(n_inputs, n_outputs)

In [None]:
def encoder(y, n_classes=None):
    """One-hot encode a 1-D array of class ids.

    An identity matrix with ``n_classes`` rows is built and then indexed with
    the labels: label 0 selects the row with the 1 in the first position,
    label 1 the row with the 1 in the middle, and so on.

    Parameters
    ----------
    y : array-like
        Class ids. Any dtype that can be cast to int is accepted (int, float
        or object); the cast happens before the class count is computed so
        float labels no longer break the matrix construction.
    n_classes : int, optional
        Number of columns in the result. Defaults to ``max(y) + 1``. Pass it
        explicitly when a split may not contain the highest class id.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), n_classes)``.
    """
    labels = np.asarray(y).astype(int)
    if n_classes is None:
        # An empty label array has no maximum; encode it as zero classes.
        n_classes = int(labels.max()) + 1 if labels.size else 0
    return np.eye(n_classes)[labels]

In [None]:
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the ratio from becoming NaN) when scores are large.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_classes)

    Returns
    -------
    numpy.ndarray
        Same shape as ``x``; every row is non-negative and sums to 1.
    """
    scores = np.asarray(x, dtype=float)
    shifted = scores - scores.max(axis=1, keepdims=True)
    exps = np.exp(shifted)  # computed once and reused for the normaliser
    return exps / exps.sum(axis=1, keepdims=True)

In [None]:
def cross_entropy_cost(y_real, y_hat):
    """Element-wise cross-entropy terms ``-y_real * log(y_hat)``.

    No reduction is applied: the result has the broadcast shape of the two
    inputs, and it is up to the caller to sum or average it.
    """
    log_probabilities = np.log(y_hat)
    negated_targets = -y_real
    return negated_targets * log_probabilities

In [None]:
# Per-feature statistics come from the training split only (column 0 is the
# constant bias column and is left out), so no information from the
# validation or test rows leaks into the scaling.
mean = x_train[:, 1:].mean(axis = 0)
std = x_train[:, 1:].std(axis = 0)

y_train_encoded, y_valid_encoded, y_test_encoded = encoder(y_train), encoder(y_valid), encoder(y_test)

# Write the standardised features back into each split. Previously the three
# expressions were evaluated and thrown away, and the third one repeated
# x_train instead of x_test, so nothing was ever scaled.
x_train[:, 1:] = (x_train[:, 1:] - mean) / std
x_valid[:, 1:] = (x_valid[:, 1:] - mean) / std
x_test[:, 1:] = (x_test[:, 1:] - mean) / std

y_train_encoded.shape

In [None]:
#training
# Batch gradient descent for softmax regression with L2 regularisation and
# early stopping on the validation loss.

learning_rate = 0.01
epoch = 3001
eps = 1e-5  # added to the probabilities so log() never receives an exact 0
length = len(x_train)
alpha = 0.1  # L2 regularisation strength
# Weight shape follows the data: one row per input column (bias included),
# one column per class.
parameters = batch_GD(x_train.shape[1], y_train_encoded.shape[1])
best_loss = np.inf
best_parameters = parameters.copy()

for i in range(epoch):
    valid_prob = softmax(x_valid @ parameters)
    # Cross-entropy per sample is the sum over classes; then average over samples.
    xentropy_loss = cross_entropy_cost(y_valid_encoded, valid_prob + eps).sum(axis = 1).mean()
    # 0.5 * alpha * ||W||^2 is the penalty whose gradient is alpha * W, i.e.
    # exactly the term added to the gradients below. The bias row is excluded.
    l2_loss = 0.5 * (parameters[1:] ** 2).sum()
    loss = xentropy_loss + alpha * l2_loss
    if i % 500 == 0:
        print(i, loss)
    if loss < best_loss:
        best_loss = loss
        # Snapshot the weights that produced the best validation loss so far.
        best_parameters = parameters.copy()
    else:
        print("Stopping... ", i, loss)
        # Roll back to the best weights instead of keeping the ones that
        # made the validation loss go up.
        parameters = best_parameters
        break
    prob = softmax(x_train @ parameters)
    gradients = (1 / length) * x_train.T @ (prob - y_train_encoded)
    #zeros for bias
    gradients += np.r_[np.zeros([1, parameters.shape[1]]), alpha * parameters[1:]]
    parameters -= gradients * learning_rate

In [None]:
# Score the test split. `prob` holds the raw class scores here; softmax turns
# them into probabilities and the arg-max picks the predicted class id.
prob = np.matmul(x_test, parameters)
predict = np.argmax(softmax(prob), axis = 1)
# Fraction of test rows whose predicted class equals the true label.
np.mean(predict == y_test)