# Data Preprocess
Use one-hot encoding to transform the categorical features into numeric (boolean) columns.

In [43]:
import CommonUtils

# Load the mushroom dataset through the project helper.
# NOTE(review): get_X_y presumably returns the one-hot encoded feature frame
# and the label series -- confirm against CommonUtils.
X, y = CommonUtils.get_X_y(
    '../mushroom_dataset/agaricus-lepiota.data'
)
print(X, y)

        1_b    1_c    1_f    1_k    1_s    1_x    2_f    2_g    2_s    2_y  \
0     False  False  False  False  False   True  False  False   True  False   
1     False  False  False  False  False   True  False  False   True  False   
2      True  False  False  False  False  False  False  False   True  False   
3     False  False  False  False  False   True  False  False  False   True   
4     False  False  False  False  False   True  False  False   True  False   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
8119  False  False  False   True  False  False  False  False   True  False   
8120  False  False  False  False  False   True  False  False   True  False   
8121  False  False   True  False  False  False  False  False   True  False   
8122  False  False  False   True  False  False  False  False  False   True   
8123  False  False  False  False  False   True  False  False   True  False   

      ...   21_s   21_v   21_y   22_d   22_g   22_l   22_m   22

# Logistic Regression

## 1. Initialize the parameters
Initialize the weights and the bias. The dataset has 22 features, but after one-hot encoding the feature dimension becomes `X.shape[1]`, so we generate `X.shape[1]` random weights and a single bias.

In [44]:
import numpy as np

# One weight per one-hot encoded column of X.
NUM_OF_FEATURES = X.shape[1]

# Draw the initial weights from a seeded, local generator so that a
# "Restart Kernel -> Run All" reproduces the same starting point (and hence
# the same training run) without touching NumPy's global random state.
SEED = 0
rng = np.random.default_rng(SEED)
weights = rng.random(NUM_OF_FEATURES)  # uniform samples in [0, 1)

bias = 0  # the bias simply starts at zero
learning_rate = 0.01
epochs = 10 ** 5

## 2. Define the Prediction Function
$$
\Huge P(x) = \frac{1}{1+e^{-(x \cdot weights + bias)}}
$$
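Here $x$ is the input feature vector, $weights$ is the weight vector (one weight per feature), $bias$ is a scalar, and $P(x)$ is the prediction function: the predicted probability that $x$ belongs to the positive class.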

In [45]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    The value is computed as exp(-log(1 + exp(-x))) using ``np.logaddexp``,
    which evaluates log(exp(0) + exp(-x)) without overflowing. The naive
    form ``1 / (1 + np.exp(-x))`` overflows in ``exp`` for large negative
    ``x`` and emits a RuntimeWarning; this form returns the same values
    without the overflow.

    Parameters
    ----------
    x : scalar or array-like supporting unary minus
        Input value(s).

    Returns
    -------
    Same shape as ``x``, with values in the closed interval [0, 1].
    """
    return np.exp(-np.logaddexp(0, -x))


def pred(x, p_weights, p_bias):
    """Prediction function: sigmoid of the affine score ``x . p_weights + p_bias``."""
    linear_score = np.dot(x, p_weights)
    linear_score = linear_score + p_bias
    return sigmoid(linear_score)


def classify(x, cl_weights, cl_bias, threshold=0.5):
    """Turn predicted probabilities into hard 0/1 class labels.

    Parameters
    ----------
    x : array-like
        Feature matrix (one row per sample) or a single 1-D sample.
    cl_weights, cl_bias
        Model parameters forwarded to ``pred``.
    threshold : float, default 0.5
        Probabilities greater than or equal to this value map to 1,
        everything else to 0.

    Returns
    -------
    list of int
        One label per sample.
    """
    # atleast_1d lets a single sample (scalar probability) be handled the
    # same way as a batch instead of failing on iteration.
    y_pred = np.atleast_1d(pred(x, cl_weights, cl_bias))
    # Vectorised comparison replaces the per-element Python loop; tolist()
    # keeps the return type a plain list of Python ints.
    return (y_pred >= threshold).astype(int).tolist()

## 3. Define the Loss Function (Cross-Entropy)
$$
\Huge L = -\sum_{k=1}^{K} \left( y_k \ln(p_k) + (1 - y_k) \ln(1 - p_k) \right)
$$
Here $y_k$ is the true label of sample $k$, $p_k$ is the predicted probability for that sample, and $K$ is the number of samples.

In [46]:
def compute_loss(y, y_pred, eps=1e-15):
    """Summed binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted probabilities, same shape as ``y``.
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` before taking the
        logarithm. Without this, a saturated prediction of exactly 0 or 1
        produces ``log(0) = -inf`` and the loss becomes ``inf`` or ``nan``.

    Returns
    -------
    float
        ``-sum(y * ln(p) + (1 - y) * ln(1 - p))`` over all samples.
    """
    y = np.asarray(y, dtype=float)
    p = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

## 4. Gradient Descent


In [47]:
import logging

# Send INFO-and-above records to train.log, appending to any existing file,
# with a timestamped "time - level - message" layout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    filename="train.log",
    filemode="a",
)


def gradient_descent(x, y, gd_weights, gd_bias, gd_learning_rate, gd_epochs):
    """Fit logistic-regression parameters with batch gradient descent.

    Parameters
    ----------
    x : ndarray
        Feature matrix, one row per sample.
    y : ndarray
        Labels, one per row of ``x``.
    gd_weights : array-like
        Initial weights. The caller's array is NOT modified; training runs
        on a private float copy.
    gd_bias : float
        Initial bias.
    gd_learning_rate : float
        Step size applied to both gradients.
    gd_epochs : int
        Number of full-batch update steps.

    Returns
    -------
    (ndarray, float)
        The trained weights and bias.

    Notes
    -----
    The gradients are averaged over the ``n`` samples, whereas the loss
    written to the log every 100 epochs is the value returned by
    ``compute_loss`` for the predictions made *before* that epoch's update.
    """
    n = len(y)
    # Copy so the in-place updates below never leak into the caller's array;
    # the float dtype also makes the update valid for integer initial weights.
    gd_weights = np.array(gd_weights, dtype=float)
    logging.info('Start training ---')
    for eee in range(gd_epochs):
        y_pred = pred(x, gd_weights, gd_bias)

        # Prediction error, shared by both gradients.
        residual = y_pred - y
        dw = (1 / n) * np.dot(x.T, residual)
        db = (1 / n) * np.sum(residual)

        gd_weights -= gd_learning_rate * dw
        gd_bias -= gd_learning_rate * db

        if eee % 100 == 0:
            loss = compute_loss(y, y_pred)
            logging.info(f"Epoch {eee}/{gd_epochs}, Loss: {loss}\n")

    return gd_weights, gd_bias

## 5. Model training


In [48]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# K-Fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=10)
accuracies = []

# Snapshot the untrained parameters so that every fold starts from the same
# initial point. Passing the previous fold's trained weights into the next
# fold would evaluate a model that has already been fitted on samples of the
# current test split, which inflates the reported accuracy.
# NOTE: `weights`/`bias` are overwritten below, so re-run the parameter
# initialisation cell before re-running this one.
initial_weights = np.array(weights, dtype=float)
initial_bias = bias

# K-Fold Loop
for train_index, test_index in kf.split(X):
    # Split the data
    X_train, X_test = X.iloc[train_index].values, X.iloc[test_index].values
    y_train, y_test = y.iloc[train_index].values, y.iloc[test_index].values

    # Train a fresh model on this fold; the copy keeps the snapshot intact
    # even if the training routine updates its weight argument in place.
    weights, bias = gradient_descent(
        X_train, y_train, initial_weights.copy(), initial_bias, learning_rate, epochs
    )
    print(weights, bias)

    # Make predictions on the held-out split
    y_pred = classify(X_test, weights, bias)

    # Calculate accuracy and append to the list
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

print(accuracies)

[ 4.46607186e-01  1.12492055e+00 -2.45303637e-03 -1.52994669e-02
  3.41348442e-02 -1.72232255e-01 -9.75810821e-01  8.03299094e-01
  1.80999107e-01 -9.61967356e-02  1.15137610e+00  9.77605675e-03
  5.88708936e-01  2.81002103e-01 -9.45459715e-02  1.31984766e+00
  4.51492994e-01  3.69078230e-01  7.44529521e-01 -1.74966148e-01
  1.73881048e-01  1.79960281e-02 -1.77698654e+00  2.39087719e+00
  2.56244608e+00 -1.87004545e+00  7.82620939e-01 -3.71721682e+00
  1.98390731e+00  1.34853098e+00  9.43222836e-01 -1.80672314e-01
 -4.81388577e-01  1.45808824e+00 -1.26189180e+00 -2.03034808e+00
  2.53575720e+00  2.29014800e+00  3.81237157e-01  4.28127613e-01
  4.97849421e-01  7.59246242e-03 -3.32960807e-01  5.57856728e-01
 -1.34904734e-01  6.57886728e-01 -1.94028180e-02  3.94390711e-02
  3.99435683e-01  4.74709811e-01 -6.70074857e-01 -9.68913663e-01
  2.30662481e+00 -5.70703944e-01 -5.32922211e-02  4.16847787e-02
 -3.80286841e-01  1.65776532e+00 -1.15717855e+00  5.69581305e-01
 -1.01172687e+00  1.21868