In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [142]:
def One_hot_encoder(num_classes):
    OHE = {i : np.array([1 if k == i else 0 for k in range(num_classes)]) for i in range(num_classes)}
    return OHE

def softmax(t):
    exp_t = np.exp(t - np.max(t))
    return exp_t / (np.sum(exp_t))

class SoftmaxMSE:
    def __init__(self,num_classes,lr=0.01,num_epochs=2):
        self.lr = lr
        self.num_epochs = num_epochs
        self.num_classes = num_classes
        self.weight = None
        self.bias = None
        self.encoder = One_hot_encoder(self.num_classes)

    def forward(self,X):
        tmp = self.weight @ X + self.bias
        return softmax(tmp)

    def parametrize(self,X,y):
        self.n_sample, self.n_feature = X.shape[0], X.shape[1]
        self.weight = np.random.randn(self.num_classes,self.n_feature)
        self.bias = np.random.randn(self.num_classes)

    def MSE(self,logits,y):
        return 0.5 * np.sum(np.square(logits - self.encoder[y]))

    def MSE_prime(self,logits,k,y):
        d_k =  np.array([-logits[k] *(logits[i] - np.where(y==i,1,0)) if i != k else -logits[k] *(logits[i] - 1)*(logits[i]- np.where(y==i,1,0))  for i in range(self.num_classes)]) 
        #d_k -=   self.encoder[y]
        #print(d_k)
        tmp = np.sum(d_k)
        #print(tmp,y)
        return tmp
        
    def train(self,X,y):
        
        for _ in range(self.num_epochs):
            for x, y_ in zip(X,y):
                logits = self.forward(x)
                #print(logits)
                #mse_prime = np.sum(logits - self.encoder[y_])
                
                for k in range(self.num_classes) : 
                    t = self.MSE_prime(logits,k,y_)
                    #print(t)
                    self.weight[k] -= self.lr * t * x
                    self.bias[k]-= self.lr * t
        

    def predict(self,x):
        res = self.forward(x)
        return np.argmax(res)

In [76]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [77]:
iris = load_iris()
X, y = iris.data, iris.target

In [143]:
X_train , X_test , y_train , y_test = train_test_split(X ,y , random_state=42)

In [150]:
clf = SoftmaxMSE(num_classes=3, num_epochs=100, lr=0.01)

In [151]:
clf.parametrize(X_train,y_train)

In [152]:
clf.train(X_train,y_train)

In [153]:
clf.weight

array([[15.84024049, 11.74757421,  4.64236633,  0.5224238 ],
       [15.96501366,  6.64462995, 13.25068168,  3.05133067],
       [14.65791019,  4.87154416, 15.52859934,  6.04648558]])

In [154]:
clf.bias

array([ 2.2900666 ,  2.40694898, -0.01467799])

In [155]:
y_pred =  []
for x in X_test:
    y_pred.append(clf.predict(x))

np.sum(y_pred == y_test)

np.int64(37)