In [7]:
import numpy as np
def softmax(Z):
    """
    Row-wise softmax (naive formulation).

    Z: a numpy array of shape (N, C); each row of Z is one set of scores.
    return a numpy array of shape (N, C) whose rows are normalized to sum
    to 1 (up to rounding).

    Note: the scores are exponentiated directly, so very large entries
    overflow in np.exp (inf) and the division then yields nan.
    """
    exp_scores = np.exp(Z)
    row_totals = np.sum(exp_scores, axis=1, keepdims=True)
    return exp_scores / row_totals
def softmax_stable(Z):
    """
    Numerically stable row-wise softmax.

    Z: a numpy array of shape (N, C); each row of Z is one set of scores.
    return a numpy array of shape (N, C) whose rows are normalized to sum
    to 1 (up to rounding).
    """
    # Shifting every row by its own maximum does not change the softmax but
    # makes all exponents <= 0, so np.exp cannot overflow.
    row_max = np.max(Z, axis=1, keepdims=True)
    exp_shifted = np.exp(Z - row_max)
    row_totals = exp_shifted.sum(axis=1, keepdims=True)
    return exp_shifted / row_totals

In [17]:
def softmax_loss(X, y, W):
    """
    Mean cross-entropy loss of a softmax regression model.

    W: 2d numpy array of shape (d, C),
    each column corresponding to one output node
    X: 2d numpy array of shape (N, d), each row is one data point
    y: 1d numpy array of integer class indices -- label of each row of X
    return the average negative log-probability assigned to the true labels

    The log-probabilities are computed directly with the log-sum-exp trick
    instead of taking np.log of the softmax output: a probability that
    underflows to exactly 0 would otherwise give log(0) = -inf and an
    infinite loss.
    """
    scores = X.dot(W)  # shape of (N, C)
    # Shift each row by its maximum so the largest exponent is exp(0) = 1;
    # the row sum is then >= 1 and its log is finite.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_prob = shifted - log_norm  # log-softmax, shape of (N, C)
    rows = np.arange(X.shape[0])  # indexes in axis 0, indexes in axis 1 are in y
    return -np.mean(log_prob[rows, y])
def softmax_grad(X, y, W):
    """
    Gradient of the mean softmax cross-entropy loss with respect to W.

    W: 2d numpy array of shape (d, C),
    each column corresponding to one output node
    X: 2d numpy array of shape (N, d), each row is one data point
    y: 1d numpy array -- label of each row of X
    return a 2d numpy array with the same shape as W, (d, C)
    """
    n_samples = X.shape[0]
    delta = softmax_stable(X.dot(W))  # predicted probabilities, shape of (N, C)
    # Subtracting 1 at each row's true-label column turns the probabilities
    # into (A - Y), where Y is the one-hot encoding of y.
    delta[range(n_samples), y] -= 1
    return np.dot(X.T, delta) / n_samples

In [25]:
def softmax_fit(X, y, W, lr = 0.01, nepoches = 100, tol = 1e-5, batch_size = 10):
    """
    Train softmax regression with mini-batch gradient descent.

    X: 2d numpy array of shape (N, d), each row is one data point
    y: 1d numpy array -- label of each row of X
    W: 2d numpy array of shape (d, C), initial weights (not modified)
    lr: learning rate
    nepoches: maximum number of passes over the data
    tol: stop early once norm(W - W_previous_epoch)/W.size falls below tol
    batch_size: number of data points per mini-batch, must be positive

    return (W, loss_hist): the trained weights and a list with the loss on
    the full data set before training followed by one value per epoch.

    raise ValueError if batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    # Train on a private copy: the in-place update below would otherwise
    # overwrite the array passed in by the caller.
    W = W.copy()
    W_old = W.copy()
    ep = 0
    loss_hist = [softmax_loss(X, y, W)] # store history of loss
    N = X.shape[0]
    nbatches = int(np.ceil(float(N)/batch_size))
    while ep < nepoches:
        ep += 1
        mix_ids = np.random.permutation(N) # reshuffle the data every epoch
        for i in range(nbatches):
            # get the i-th batch; slicing clamps at N, so the last batch
            # may simply be shorter
            batch_ids = mix_ids[batch_size*i:batch_size*(i+1)]
            X_batch, y_batch = X[batch_ids], y[batch_ids]
            W -= lr*softmax_grad(X_batch, y_batch, W) # gradient descent step
        loss_hist.append(softmax_loss(X, y, W))
        # convergence test on the weight change accumulated over one epoch
        if np.linalg.norm(W - W_old)/W.size < tol:
            break
        W_old = W.copy()
    return W, loss_hist

In [27]:
def pred(W, X):
    """
    Predict the class of each row of X. The class of each x_i is the position
    of the largest score in the corresponding row of X.dot(W), which is also
    the position of the largest softmax probability.
    Note that classes are indexed from 0.
    """
    scores = X.dot(W)
    return np.argmax(scores, axis=1)

In [29]:
SEED = 0 # fixed so the sampled data, W_init and the batch shuffling are reproducible
np.random.seed(SEED)

C, N = 5, 500 # number of classes and number of points per class
means = [[2, 2], [8, 3], [3, 6], [14, 2], [12, 8]]
cov = [[1, 0], [0, 1]]
# one Gaussian cloud of N points per class, drawn in class order
X0, X1, X2, X3, X4 = [np.random.multivariate_normal(mean, cov, N) for mean in means]
X = np.concatenate((X0, X1, X2, X3, X4), axis = 0) # each row is a datapoint
Xbar = np.concatenate((X, np.ones((X.shape[0], 1))), axis = 1) # bias trick
y = np.repeat(np.arange(C), N) # labels 0..C-1, N of each, in the row order of X
W_init = np.random.randn(Xbar.shape[1], C)
# Pass a copy so that W_init keeps the initial weights even if the training
# routine updates the array it receives in place.
W, loss_hist = softmax_fit(Xbar, y, W_init.copy(), batch_size = 10, nepoches = 100, lr = 0.05)

In [53]:
# Training loss on the full data set: the value before training followed by
# one value per completed epoch.
print(loss_hist)

[8.323107456523601, 0.6043749154838903, 0.47132442013315956, 0.3961944080504211, 0.3459797095361791, 0.3088092606529686, 0.2727758584556505, 0.2602868978496584, 0.24140040723762904, 0.23975800353022741, 0.2143791720625992, 0.21985389467364475, 0.19440898638008774, 0.19588365794502496, 0.18082177899409113, 0.3275755358559046, 0.16792216786151845, 0.1737137365883429, 0.16544352727966044, 0.15760388659129454, 0.16917696644702937, 0.14707309158234724, 0.15537645489755564, 0.1499557980580345, 0.13727747316177757, 0.1501671770033077, 0.13999973783067957, 0.14605914795705585, 0.1438449233056143, 0.12717627703609086, 0.13082150503617723, 0.12704034479770912, 0.11769295200031779, 0.12089125954361547, 0.12218928472306147, 0.11889602634390417, 0.15444410524513102, 0.12856316512431662, 0.11005046105990869, 0.10758275605023937, 0.10863547666750033, 0.11151596107391606, 0.10324519086235075, 0.10277410007146343, 0.10673169638828467, 0.14111883237958292, 0.09795521786411059, 0.09819758489399746, 0.098

In [62]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the MNIST data set from OpenML as NumPy arrays
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target

# Convert the labels to integers
y = y.astype(np.int8)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training set only
# and then applied unchanged to the test set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a softmax regression model (multi-class logistic regression).
# The multi_class argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial
# (softmax) model is what gets fitted for a multi-class target by default.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Predict the test labels
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')


Accuracy: 0.9154
