In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys

sys.path.append('../')
%matplotlib inline

# **Classification**

## Data Pre Processing

In [46]:
def get_X_y(file_path):
    # Load the dataset
    data = pd.read_csv(file_path, header=None)
    # Map the first column ('e', 'p') to binary values
    data[0] = data[0].apply(lambda x: 1 if x == 'p' else 0)  # 1 for poisonous, 0 for edible
    X = pd.get_dummies(data.iloc[:, 1:])  # one hot encoding
    y = data[0]
    return X, y


In [47]:
#file_path = 'https://archive.ics.uci.edu/static/public/73/data.csv'

file_path = "./agaricus-lepiota.data"

X, y = get_X_y(file_path)
print(X, y)

        1_b    1_c    1_f    1_k    1_s    1_x    2_f    2_g    2_s    2_y  \
0     False  False  False  False  False   True  False  False   True  False   
1     False  False  False  False  False   True  False  False   True  False   
2      True  False  False  False  False  False  False  False   True  False   
3     False  False  False  False  False   True  False  False  False   True   
4     False  False  False  False  False   True  False  False   True  False   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
8119  False  False  False   True  False  False  False  False   True  False   
8120  False  False  False  False  False   True  False  False   True  False   
8121  False  False   True  False  False  False  False  False   True  False   
8122  False  False  False   True  False  False  False  False  False   True   
8123  False  False  False  False  False   True  False  False   True  False   

      ...   21_s   21_v   21_y   22_d   22_g   22_l   22_m   22

## Logistic Regression

### 1. Initialize the parameters
init weight and biases, since we have 22 features, but after one hot encoding, the dimension of feature becames x.shape[1], so we are going to generate s.shape[1] random features and a single bias.

In [48]:
import numpy as np

NUM_OF_FEATURES = X.shape[1]
weights = np.random.random(NUM_OF_FEATURES)
bias = 0  # I randomly init as zero
learning_rate = 0.01
epochs = 10 ** 3

### 2. Define the Prediction Function
$$
\Huge P(x) = \frac{1}{1+e^{-(x \cdot weights + bias)}}
$$
x is the input of features, weight is the matrix of weight for all features, P(x) is the Prediction Function

In [49]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


# prediction function
def pred(x, p_weights, p_bias):
    return sigmoid(np.dot(x, p_weights) + p_bias)


def classify(x, cl_weights, cl_bias, threshold=0.5):
    y_pred = pred(x, cl_weights, cl_bias)
    return [1 if p >= threshold else 0 for p in y_pred]

### 3. Define the Loss Function (Cross-Entropy)
$$
\Huge L = -\sum_{k=1}^{K} \left( y_k \ln(p_k) + (1 - y_k) \ln(1 - p_k) \right)
$$
y means the true label, and p means the prediction

In [50]:
def compute_loss(y, y_pred):
    return - np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

### 4. Gradient Descent


In [51]:
import logging

logging.basicConfig(
    filename="train.log",
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO
)


def gradient_descent(x, y, gd_weights, gd_bias, gd_learning_rate, gd_epochs):
    n = len(y)
    logging.info('Start training ---')
    for eee in range(gd_epochs):
        y_pred = pred(x, gd_weights, gd_bias)

        dw = (1 / n) * np.dot(x.T, (y_pred - y))
        db = (1 / n) * np.sum(y_pred - y)

        gd_weights -= gd_learning_rate * dw
        gd_bias -= gd_learning_rate * db

        if eee % 100 == 0:
            loss = compute_loss(y, y_pred)
            logging.info(f"Epoch {eee}/{gd_epochs}, Loss: {loss}\n")

    return gd_weights, gd_bias

### 5. Model training




---



In [52]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# K-Fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=10)
accuracies = []

# K-Fold Loop
for train_index, test_index in kf.split(X):
    # Split the data
    X_train, X_test = X.iloc[train_index].values, X.iloc[test_index].values
    y_train, y_test = y.iloc[train_index].values, y.iloc[test_index].values

    # Train the model using gradient descent
    weights = np.random.random(NUM_OF_FEATURES)
    bias = 0
    weights, bias = gradient_descent(X_train, y_train, weights, bias, learning_rate, epochs)
    print(weights, bias)

    # Make predictions on the test set
    y_pred = classify(X_test, weights, bias)

    # Calculate accuracy and append to the list
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

print(accuracies)

[ 0.57730831  0.62279728  0.41875838  0.75064014  0.24421362  0.32328039
  0.10696619  0.30234054  0.45183328  0.51024452  0.40375171  0.80303516
  0.41524028 -0.14972021  0.17402072  0.96440045  0.5291486   0.06993429
  0.02366402 -0.02832354  0.32170319 -0.28198105  0.39612915  0.1711821
  1.26456701  0.16752359  0.37213415 -0.61378934  0.57229757  0.10175445
  0.75177463  0.34705026 -0.42393877  0.2012905  -0.42778063 -0.65301414
  0.96715437  0.5393104  -0.03334017  0.32712526  0.3096324   0.06101794
  0.25723102  0.53201918  0.35033848  0.19266288  0.75194455  0.17800564
  0.78699953 -0.39311408 -0.1988687   0.33986023  0.08047478  0.0239133
  0.36827281  0.24132175  0.60436751  0.82317996 -0.18447523  0.55783315
  0.42969914  0.51059429 -0.34944128  0.78705689  0.48979691  0.98452589
  0.46874211 -0.06231946  0.34344994  0.01606865  0.82396666  0.08604841
  0.10647913  1.00939145  0.57400271 -0.03152603  0.31824275  0.87006193
  0.15156959  0.58964266  0.41558016  0.54176306 -0.3

## SVM

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

from sklearn.svm import SVC

svm_clf = SVC(kernel='linear')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
svm_clf.fit(X_train, y_train)

scores = cross_val_score(svm_clf, X_test, y_test, cv=kf)

print(scores)

[1. 1. 1. 1. 1.]


## Decision Tree

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

tree_clf = DecisionTreeClassifier(max_depth=7)
tree_clf.fit(X_train, y_train)

scores = cross_val_score(tree_clf, X_test, y_test, cv=kf)

print(scores)

tree.plot_tree(tree_clf)

[0.99795082 1.         1.         1.         1.        ]


[Text(0.5625, 0.9375, 'x[27] <= 0.5\ngini = 0.499\nsamples = 5686\nvalue = [2972.0, 2714.0]'),
 Text(0.3125, 0.8125, 'x[53] <= 0.5\ngini = 0.286\nsamples = 3190\nvalue = [551, 2639]'),
 Text(0.4375, 0.875, 'True  '),
 Text(0.1875, 0.6875, 'x[63] <= 0.5\ngini = 0.134\nsamples = 2817\nvalue = [204, 2613]'),
 Text(0.125, 0.5625, 'x[34] <= 0.5\ngini = 0.055\nsamples = 2689\nvalue = [76, 2613]'),
 Text(0.0625, 0.4375, 'gini = 0.0\nsamples = 2551\nvalue = [0, 2551]'),
 Text(0.1875, 0.4375, 'x[49] <= 0.5\ngini = 0.495\nsamples = 138\nvalue = [76, 62]'),
 Text(0.125, 0.3125, 'gini = 0.0\nsamples = 76\nvalue = [76, 0]'),
 Text(0.25, 0.3125, 'gini = 0.0\nsamples = 62\nvalue = [0, 62]'),
 Text(0.25, 0.5625, 'gini = 0.0\nsamples = 128\nvalue = [128, 0]'),
 Text(0.4375, 0.6875, 'x[65] <= 0.5\ngini = 0.13\nsamples = 373\nvalue = [347.0, 26.0]'),
 Text(0.375, 0.5625, 'gini = 0.0\nsamples = 347\nvalue = [347, 0]'),
 Text(0.5, 0.5625, 'gini = 0.0\nsamples = 26\nvalue = [0, 26]'),
 Text(0.8125, 0.8125, 

## Neural Network


### 1. Define the MLP Classifier

define 2 hidden layers with 50 and 50 nodes, use relu as activation function, max iteration time is 1000

In [55]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier

# Define the MLP model
mlp = MLPClassifier(hidden_layer_sizes=(50, 50),
                    activation='relu',
                    solver='adam',
                    max_iter=100,
                    random_state=10)

### 2. k-fold Cross-Validation

In [56]:
cv_scores = cross_val_score(mlp, X, y, cv=kf, scoring='accuracy')

print(f"Cross-validation accuracies for each fold: {cv_scores}")
print(f"Average cross-validation accuracy: {cv_scores.mean():.2f}")

Cross-validation accuracies for each fold: [1. 1. 1. 1. 1.]
Average cross-validation accuracy: 1.00


# Cross Validation

In [58]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# K-Fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=10)
accuracies_lg = []
accuracies_svm = []
accuracies_dt = []
accuracies_nn = []

# K-Fold Loop
for train_index, test_index in kf.split(X):
    # Split the data
    X_train, X_test = X.iloc[train_index].values, X.iloc[test_index].values
    y_train, y_test = y.iloc[train_index].values, y.iloc[test_index].values

    # Logistic Regression
    # Train the model using gradient descent
    weights = np.random.random(NUM_OF_FEATURES)
    bias = 0
    weights, bias = gradient_descent(X_train, y_train, weights, bias, learning_rate, epochs)

    # Make predictions on the test set
    y_pred = classify(X_test, weights, bias)

    # Calculate accuracy and append to the list
    accuracy = accuracy_score(y_test, y_pred)
    accuracies_lg.append(accuracy)

    # SVM
    svm_clf.fit(X_train, y_train)
    accuracies_svm = cross_val_score(svm_clf, X_test, y_test, cv=kf)

    # Decision Tree
    tree_clf.fit(X_train, y_train)
    accuracies_dt = cross_val_score(tree_clf, X_test, y_test, cv=kf)

    # Neural Network
    mlp.fit(X_train, y_train)
    accuracies_nn = cross_val_score(mlp, X_test, y_test, cv=kf)

print("Logistic regression's accuracies: " + str(accuracies_lg))
print("SVM's accuracies: " + str(accuracies_svm))
print("Decision tree's accuracies: " + str(accuracies_dt))
print("Neural network's accuracies: " + str(accuracies_nn))

Logistic regression's accuracies: [0.9316923076923077, 0.8947692307692308, 0.912, 0.9403076923076923, 0.9144088669950738]
SVM's accuracies: [1.         1.         1.         1.         0.99691358]
Decision tree's accuracies: [0.99692308 0.99692308 1.         1.         0.99691358]
Neural network's accuracies: [1.         1.         1.         1.         0.99691358]
