In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Training Dataset

In [None]:
# Read the training CSV (file name indicates TF-IDF features) from the current directory.
df_train = pd.read_csv("./train_tfidf_features.csv")
# Show the loaded frame using the notebook's rich display.
display(df_train)

In [None]:
# Per-column summary statistics of the training frame (pandas default: numeric columns).
df_train.describe()

# Task 1: Implement Logistic Regression
Recall that you learned about Logistic Regression in an earlier class. Your task is to implement a Logistic Regression model from scratch. \
Note that you are NOT TO USE the sklearn logistic regression package or any other pre-defined logistic regression package for this task! \
Usage of any logistic regression packages will result in 0 marks for this task.

## Key Task Deliverables
1a. Code implementation of the Logistic Regression model. \
1b. Prediction made by your Logistic Regression on the Test set. Note that you are welcome to submit your predicted labels to Kaggle but you will need to submit the final prediction output in the final project submission. Please label the file as "LogRed_Prediction.csv".

-- `sigmoid(z)`: A function that takes in a Real Number input and returns an output value between 0 and 1.

In [None]:
def sigmoid(z):
    """Logistic sigmoid, evaluated without floating-point overflow.

    Parameters
    ----------
    z : float or numpy.ndarray
        Real-valued input(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``1 / (1 + exp(-z))`` element-wise; every value lies in [0, 1].
    """
    # 1 / (1 + e^-z) == exp(-log(e^0 + e^-z)).  np.logaddexp evaluates the
    # inner logarithm without ever forming e^-z, so a large negative z no
    # longer overflows np.exp (which used to emit a RuntimeWarning).
    return np.exp(-np.logaddexp(0.0, -z))

-- `loss(y, y_hat)`: A loss function that allows us to minimize and determine the optimal parameters. The function takes in the actual labels y and the predicted labels yhat, and returns the overall training loss. Note that you should be using the Log Loss function taught in class.

In [None]:
def loss(y, y_hat):
    """Binary cross-entropy (log loss) averaged over all samples.

    Parameters
    ----------
    y : array-like
        True labels.  Entries equal to 1 are treated as the positive class,
        every other value as the negative class.
    y_hat : array-like
        Predicted probabilities of the positive class, same shape as ``y``.

    Returns
    -------
    numpy.float64
        ``-mean(y * log(y_hat) + (1 - y) * log(1 - y_hat))``; lower is better.
    """
    y = np.asarray(y)
    # Keep probabilities strictly inside (0, 1).  np.where evaluates BOTH
    # branches, so an exact 0 or 1 would otherwise produce log(0) = -inf.
    eps = np.finfo(float).eps
    y_hat = np.clip(np.asarray(y_hat, dtype=float), eps, 1 - eps)
    log_likelihood = np.where(y == 1, np.log(y_hat), np.log(1 - y_hat))
    # Negate: the log-likelihood is maximised, the loss is minimised.
    return -np.mean(log_likelihood)

-- `gradients(X, y, y_hat)`: Computes the gradients used by the Gradient Descent algorithm to find the optimal values of our parameters. The function takes in the training features X, the actual labels y and the predicted labels yhat, and returns the partial derivatives of the Loss function with respect to the weights (dw) and the bias (db).

In [None]:
def gradients(X, y, y_hat):
    """Partial derivatives of the loss with respect to the weights and bias.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; its first axis indexes the training examples.
    y : numpy.ndarray
        Actual labels.
    y_hat : numpy.ndarray
        Predicted labels, same shape as ``y``.

    Returns
    -------
    dw, db
        Derivative with respect to the weights and with respect to the bias.
    """
    # Averaging factor: one over the number of training examples.
    scale = 1 / np.shape(X)[0]
    residual = y_hat - y
    dw = scale * np.dot(X.T, residual)
    db = scale * np.sum(residual)
    return dw, db

-- `train(X, y, bs, epochs, lr)`: The training function for your model.

In [None]:
def train(X, y, bs, epochs, lr):
    """Fit logistic-regression parameters with greedy mini-batch gradient descent.

    Every mini-batch proposes one gradient step.  The step is kept only when
    it lowers the log loss measured on the full training set; otherwise the
    parameters are left unchanged.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Training features.
    y : array-like with n entries
        Binary training labels.
    bs : int
        Mini-batch size (at least 1).
    epochs : int
        Number of passes over the training data.
    lr : float
        Learning rate.

    Returns
    -------
    w : numpy.ndarray, shape (d, 1)
        Weights with the lowest full-data loss found.
    b : float
        Matching bias (``0`` if no step was ever accepted).
    losses : list
        Full-data loss of the initial parameters, followed by the loss after
        every accepted step.

    Raises
    ------
    ValueError
        If ``bs`` is smaller than 1.
    """
    if bs < 1:
        raise ValueError("bs must be a positive integer")

    X = np.asarray(X)
    # n - number of training examples, d - number of features
    n, d = np.shape(X)
    # Column vector so that (y_hat - y) never broadcasts to an (n, n) matrix.
    y = np.asarray(y).reshape(n, 1)

    # Fixed seed keeps the batch order reproducible between runs.
    rng = np.random.default_rng(100)
    order = np.arange(n)

    w = np.zeros((d, 1))
    b = 0

    # Loss of the parameters currently held; refreshed only when a step is
    # accepted, so it never has to be recomputed inside the batch loop.
    current_loss = loss(y, sigmoid(np.dot(X, w) + b))
    old_losses = [current_loss]

    for _ in range(epochs):
        # One permutation per epoch: the batches below then partition the
        # data, so every example is visited exactly once per epoch.
        rng.shuffle(order)
        # Stepping by bs also yields the final, shorter batch and still
        # produces one batch when bs > n.
        for start in range(0, n, bs):
            choice = order[start:start + bs]
            X_batch = X[choice]
            y_batch = y[choice]

            y_hat = sigmoid(np.dot(X_batch, w) + b)
            dw, db = gradients(X_batch, y_batch, y_hat)

            w_new = w - lr * dw
            b_new = b - lr * db
            loss_new = loss(y, sigmoid(np.dot(X, w_new) + b_new))

            # Greedy acceptance: keep the step only if the full-data loss drops.
            if loss_new < current_loss:
                w, b, current_loss = w_new, b_new, loss_new
                old_losses.append(loss_new)

    # Accepted losses are strictly decreasing, so the parameters held at the
    # end are the ones with the minimum recorded loss.
    return w, b, old_losses
            

-- `predict(X)`: The prediction function where you can apply your validation and test sets.

In [None]:
def predict(X, w, b):
    """Predict binary class labels from fitted logistic-regression parameters.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix to classify.
    w : numpy.ndarray
        Weights, as returned by ``train``.
    b : float
        Bias, as returned by ``train``.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding 1 where ``sigmoid(X @ w + b) >= 0.5`` and 0
        elsewhere.
    """
    y_pred = sigmoid(np.dot(X, w) + b)
    # Vectorised thresholding; flatten so an (n, 1) column of probabilities
    # becomes n labels that compare element-wise with a 1-D label vector.
    return (np.asarray(y_pred) >= 0.5).astype(int).reshape(-1)

## Performance Evaluation

As per the grading rubric - "Perfect Implementation of the Logistics Regression algorithm. Successfully trained the implemented model with the train set and achieved comparative performance compared to SKLearn Logistic Regression package", we shall compare the performance of our model with the SKLearn Logistic Regression package.

We shall first implement a function to evaluate the accuracy of our model.

In [None]:
def accuracy(y, y_hat):
    """Fraction of entries of ``y_hat`` that equal the matching entry of ``y``.

    Parameters
    ----------
    y : numpy.ndarray
        Actual labels.
    y_hat : numpy.ndarray
        Predicted labels.

    Returns
    -------
    Number of matching entries divided by the length of ``y``'s first axis.
    """
    n_correct = np.sum(y == y_hat)
    n_total = np.shape(y)[0]
    return n_correct / n_total

Due to the large number of features, we shall evaluate the model performance and compare it with the SKLearn Logistic Regression using only the first 100 feature columns.

In [None]:
# Features: columns 2..101 of df_train (100 columns) as a NumPy array.
X = df_train.iloc[:, 2:102].to_numpy()
# Labels: column 1 of df_train.  Column 0 is skipped — presumably a row id; TODO confirm.
y = df_train.iloc[:,1].to_numpy()
# Sanity-check the array shapes before training.
print("X:", X.shape, "y:", y.shape)

In [None]:
# Fit the from-scratch model; `l` receives the loss history that train() returns as its third value.
w, b, l = train(X, y, bs = 100, epochs = 20, lr = 0.01)
# Accuracy on the same data the model was fitted on (no held-out split is used here).
accuracy(y, predict(X, w, b))

In [None]:
from sklearn.linear_model import LogisticRegression

# Reference model: scikit-learn's LogisticRegression fitted on the same X and y.
clf = LogisticRegression(random_state = 100).fit(X, y)
# Mean accuracy on the training data.  score() predicts internally, so the
# separate clf.predict(X) call (whose result was discarded) is not needed.
clf.score(X, y)

In [None]:
from sklearn.datasets import make_moons

# Synthetic two-class "moons" data for a second comparison.  The seed makes
# the sample identical on every Restart & Run All.
# NOTE: this rebinds X and y, replacing the TF-IDF arrays defined earlier.
X, y = make_moons(n_samples=10000, noise=0.24, random_state=100)

In [None]:
# Fit the from-scratch model on the current X, y; `l` receives the loss history returned by train().
w, b, l = train(X, y, bs = 100, epochs = 20, lr = 0.01)
# Accuracy on the same data the model was fitted on (no held-out split is used here).
accuracy(y, predict(X, w, b))

In [None]:
# Reference model: scikit-learn's LogisticRegression fitted on the current X and y.
clf = LogisticRegression(random_state = 100).fit(X, y)
# Mean accuracy on the training data.  score() predicts internally, so the
# separate clf.predict(X) call (whose result was discarded) is not needed.
clf.score(X, y)

Based on the scores above, we conclude that the performance of our Logistic Regression model is comparable to that of the SKLearn Logistic Regression package.

## Exporting Prediction
Prediction made by your Logistic Regression on the Test set. Note that you are welcome to submit your predicted labels to Kaggle but you will need to submit the final prediction output in the final project submission. Please label the file as "LogRed_Prediction.csv".