In [1]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
class MulticlassLogisticRegression():
    def __init__(self, learning_rate, num_iterations):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None

    def softmax(self, t):
        exp_t = np.exp(t - np.max(t, axis=0, keepdims=True))
        return exp_t / np.sum(exp_t, axis=0, keepdims=True)

    def fit(self, x_train, y_train):
        """
        Fit the multiclass logistic regression model.

        Parameters:
            x_train: Input features
            y_train: Target labels
        """
        # Split the data into training and validation sets
        x_train, x_valuation, y_train, y_valuation = train_test_split(x_train, y_train, test_size=0.1)

        num_of_classes = len(np.unique(y_train))
        num_of_features = x_train.shape[1]
        num_of_examples = x_train.shape[0]

        # One-hot encode the target labels
        encoder = OneHotEncoder(sparse_output=False)
        y_train_one_hot = encoder.fit_transform(y_train.reshape(-1, 1))

        # Concatenate a column of ones to the left side of the entire array (bias term)
        x_train = np.c_[np.ones(num_of_examples), x_train]

        # Step 1: start with random weights
        self.weights = np.random.rand(num_of_features + 1, num_of_classes)

        iteration = 1
        unchanged_epochs = 0
        best_s = 0

        while iteration <= self.num_iterations:
            # Shuffle the data
            permutation_index = np.random.permutation(num_of_examples)
            shuffled_y_train = y_train_one_hot[permutation_index]
            shuffled_x_train = x_train[permutation_index]

            # Iterate through each example
            for i in range(num_of_examples):
                # Step 2: calculate probabilities using softmax
                t = np.dot(shuffled_x_train[i], self.weights)
                probabilities = self.softmax(t)

                # Step 3: calculate the gradient
                gradient = -np.outer(shuffled_x_train[i], (shuffled_y_train[i] - probabilities))

                # Step 4: update weights using the outer product
                self.weights = self.weights - self.learning_rate * gradient

            # Step 5: checking accuracy on the validation set
            y_pred = np.argmax(self.predict_proba(x_valuation), axis=1)
            s = accuracy_score(y_valuation, y_pred)             
            # s = accuracy_score(y_valuation, self.predict(x_valuation))

            if s > best_s:
                best_s = s
                best_weights = self.weights
                unchanged_epochs = 0
            else:
                iteration += 1
                unchanged_epochs += 1

            if unchanged_epochs == 31:
                self.weights = best_weights
                break

        return x_train, y_train


    def predict(self, x_test):
        return np.argmax(self.predict_proba(x_test), axis=1)
    
    def predict_proba(self, x_test):
        num_of_examples = x_test.shape[0]
        x_test = np.c_[np.ones(num_of_examples), x_test]
        y_pred = list()
        
        for example in x_test:
            t = np.dot(example, self.weights)
            y_pred.append(self.softmax(t))
            
        return np.array(y_pred)


In [3]:
# Fetch dataset
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# Data (as pandas dataframes)
X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features
y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets

# Drop categorical columns
string_columns = ['MTRANS', 'Gender', 'family_history_with_overweight', 'FAVC', 'CALC', 'SCC', 'SMOKE', 'CAEC']
X = X.drop(string_columns, axis=1)

# Convert the target variable to numerical format
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y).ravel()

# Print the values of the target variable y
print("Target variable y:")
print(y)

# Standardization
mu = X.mean()
stdu = X.std()
X = (X - mu) / stdu

# Print the first few rows of the dataset to inspect features and values
print("First few rows of the dataset:")
print(X.head())
# Convert pandas dataframe to numpy array
X = X.to_numpy()
# y = y.to_numpy()

np.random.seed(42)
# Instantiate and train the logistic regression model
lr = MulticlassLogisticRegression(learning_rate=0.00215, num_iterations=170)
x_train, y_train = lr.fit(X, y)
# Test the custom model on a separate dataset

y_test_pred_custom = lr.predict(X)

# Evaluate the accuracy on the test set for the custom model
accuracy_custom = accuracy_score(y, y_test_pred_custom)
print(f"Accuracy on the custom model test set: {accuracy_custom}")

# Print classification report for the custom model
print("Classification report for Custom Logistic Regression at Testing")
print(classification_report(y, y_test_pred_custom))

# Remove the bias term from X before using scikit-learn's model
X_no_bias = X[:, 1:]

# Instantiate and train scikit-learn's logistic regression model
log = LogisticRegression()
log.fit(X_no_bias, y)

# Evaluate the accuracy of scikit-learn's model on the test set
y_test_pred_sklearn = log.predict(X_no_bias)
accuracy_sklearn = accuracy_score(y, y_test_pred_sklearn)
print(f"Accuracy on scikit-learn's model test set: {accuracy_sklearn}")

# Print classification report for scikit-learn's Logistic Regression
print("Classification report for Logistic Regression from scikit-learn at Testing")
print(classification_report(y, y_test_pred_sklearn))

  y = column_or_1d(y, warn=True)


Target variable y:
[1 1 1 ... 4 4 4]
First few rows of the dataset:
        Age    Height    Weight      FCVC       NCP      CH2O       FAF  \
0 -0.522001 -0.875382 -0.862354 -0.784833  0.404057 -0.013070 -1.187758   
1 -0.522001 -1.947138 -1.167800  1.088084  0.404057  1.618375  2.339196   
2 -0.206840  1.053779 -0.366003 -0.784833  0.404057 -0.013070  1.163545   
3  0.423481  1.053779  0.015805  1.088084  0.404057 -0.013070  1.163545   
4 -0.364420  0.839428  0.122711 -0.784833 -2.166509 -0.013070 -1.187758   

        TUE  
0  0.561864  
1 -1.080369  
2  0.561864  
3 -1.080369  
4 -1.080369  
Accuracy on the custom model test set: 0.8739933680720038
Classification report for Custom Logistic Regression at Testing
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       272
           1       0.93      0.73      0.82       287
           2       0.95      0.93      0.94       351
           3       0.95      0.97      0.96       297
    