In [1]:
import numpy as np
import pandas as pd

In [2]:
class Dataset:
    """Reads a CSV file and splits it into input columns and encoded targets."""

    def __init__(self, dataset_path=None):
        # Path handed to pandas.read_csv when get_data() is called.
        self.path = dataset_path

    def get_data(self):
        """Load the CSV at ``self.path`` and return ``(X, Y)``.

        Returns:
            X: every column of the table except the last one.
            Y: float array with one class code per row, derived from the
               last column: "banana" -> 0, "carrot" -> 1, "cucumber" -> 2,
               "mandarin" -> 3, and any other value -> 4.
        """
        table = pd.read_csv(self.path).to_numpy()

        # The final column carries the label; everything before it is input.
        X = table[:, :-1]
        labels = table[:, -1]

        code_for = {"banana": 0, "carrot": 1, "cucumber": 2, "mandarin": 3}
        # Labels missing from the table above fall through to code 4.
        Y = np.array([code_for.get(label, 4) for label in labels], dtype=float)
        return X, Y


In [3]:
class LogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    The model is ``sigmoid(X @ weights + bias)`` and is trained by minimising
    binary cross entropy, so targets are expected to be 0/1.
    """

    def __init__(self, learning_rate=0.01, num_iters=1000):
        self.learning_rate = learning_rate  # gradient-descent step size
        self.num_iters = num_iters          # number of full-batch updates in train()
        self.weights = None                 # (n_features,) array, set by train()
        self.bias = None                    # scalar intercept, set by train()
        self.loss_hist = []                 # loss before each update, filled by train()

    @staticmethod
    def sigmoid(z):
        """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

        Accepts a scalar or an array. Written as ``exp(-log(1 + exp(-z)))`` so
        that very negative inputs do not overflow ``exp``.
        """
        z = np.asarray(z, dtype=float)
        return np.exp(-np.logaddexp(0.0, -z))

    @staticmethod
    def predict(y):
        """Turn probabilities into class labels using a 0.5 threshold.

        Args:
            y: a probability or an array of probabilities.

        Returns:
            ``1`` where ``y > 0.5`` and ``0`` elsewhere; a plain ``int`` for a
            scalar input, an integer array for an array input.
        """
        labels = (np.asarray(y) > 0.5).astype(int)
        return int(labels) if labels.ndim == 0 else labels

    def logisticLoss(self, y_true, y_pred):
        """
            binary cross entropy

            Predictions are clipped away from exactly 0 and 1 so the logarithms
            stay finite.
        """
        eps = 1e-15
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
        y0 = y_true * np.log(y_pred)
        y1 = (1 - y_true) * np.log(1 - y_pred)
        return -np.mean(y0 + y1)

    def train(self, X, Y, verbose=True):
        """Fit ``weights`` and ``bias`` to ``X`` / ``Y`` by gradient descent.

        Args:
            X: 2-D array-like of shape (n_examples, n_features). It is cast to
               float, so object-dtype arrays holding numbers are accepted.
            Y: array-like with one 0/1 target per example.
            verbose: when True (the default) print the weights after every step.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count does
                not match the number of targets.

        Side effects:
            Resets and then updates ``self.weights``, ``self.bias`` and
            ``self.loss_hist`` (one loss value per iteration, measured before
            that iteration's update).
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float).reshape(-1)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_examples, n_features)")
        n_examples, n_features = X.shape
        if n_examples == 0:
            raise ValueError("X must contain at least one example")
        if Y.shape[0] != n_examples:
            raise ValueError("X and Y must contain the same number of examples")

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_hist = []

        for i in range(self.num_iters):
            # One prediction per example: X is (n_examples, n_features).
            y_pred = self.sigmoid(X @ self.weights + self.bias)
            self.loss_hist.append(float(self.logisticLoss(Y, y_pred)))

            # gradient of binary cross entropy
            y_diff = y_pred - Y
            # Average over examples, keeping one gradient entry per feature.
            self.weights -= self.learning_rate * (X.T @ y_diff) / n_examples
            self.bias -= self.learning_rate * np.mean(y_diff)

            if verbose:
                print(f"---------- WEIGHTS (in step {i + 1}) ----------")
                print(self.weights)
                print("------------------------------------------------")

In [5]:
# Load the feature table. get_data() returns the input columns X and the class
# codes Y (0-3 for banana/carrot/cucumber/mandarin, 4 for any other label).
dataset_path = "./data/tabular/feature_extraction.csv"
dataset = Dataset(dataset_path)
X, Y = dataset.get_data()
# Show the loaded arrays to check that parsing worked.
print(X)
print(Y)
# TODO: training is not wired up yet. LogisticRegression.train takes both X and
# Y, and its loss is binary cross entropy while Y holds up to five class codes,
# so the target has to be reduced to 0/1 before this can be enabled.
# logistic_regression = LogisticRegression()
# logistic_regression.train(X, Y)

[[75.19295501708984 95.6454696377902 92.3571548461914 ... 235.0
  136.69693421886893 20.32672843418623]
 [169.61224746704102 75.30395897817334 187.84392166137692 ... 210.0
  130.65284700636062 16.250803118889454]
 [157.57231903076172 73.42373320346408 193.6863441467285 ... 193.0
  111.54250186258751 19.401907547798128]
 ...
 [117.6336441040039 47.24371812478304 118.35143661499023 ... 61.0
  95.74628636537764 8.548919022365538]
 [112.62142944335938 42.76008329358205 115.11558151245116 ... 87.0
  106.43180303436095 7.07523644176209]
 [66.81015396118164 63.37149468329491 77.40125274658203 ... 35.0
  85.86439752813237 6.535334727042556]]
[0. 0. 0. ... 4. 4. 4.]
