In [1]:
"""
Notebook for logistic regression
"""

'\nNotebook for logistic regression\n'

In [4]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../../')
from src.modelling.logistic_regression.logistic_cost_function import logistic_cost_function
from src.modelling.logistic_regression.logistic_hypothesis_function import sigmoid
from src.modelling.logistic_regression.logistic_gradient_descent import logistic_gradient_descent

In [6]:
# Read the preprocessed train / cross-validation / test splits from their
# Excel workbooks (relative paths, resolved against the kernel's working directory).
train_data = pd.read_excel('../../data/preprocessed/train_data.xlsx')
cv_data = pd.read_excel('../../data/preprocessed/cv_data.xlsx')
test_data = pd.read_excel('../../data/preprocessed/test_data.xlsx')

# Preview the first five rows of each split, one block after another.
print(
    train_data.head(),
    cv_data.head(),
    test_data.head(),
    sep='\n',
)

        AGE       BMI   density
0  2.349969 -0.001759 -0.453182
1  0.119025 -0.168809 -0.957611
2  2.864802 -0.455181  0.051246
3 -1.425474 -0.693824 -0.116897
4 -0.395808 -0.550638  1.060103
        AGE       BMI   density
0 -0.297527 -0.212745  0.046396
1  2.581536  0.013392 -0.118026
2 -0.117585 -0.715271  1.032928
3 -0.117585  1.772234 -1.597824
4  2.041712  1.118950 -0.940136
        AGE       BMI   density
0  2.270235  0.102744 -1.537497
1  1.099942 -1.324573  0.816613
2 -1.742196 -0.767027 -1.201196
3 -1.240642  2.778963 -0.696743
4 -0.906273  1.217835 -1.537497


In [None]:
# Separate target variable (y) and features (X)
#
# Every split is indexed through one shared constant so the three lookups
# cannot drift apart (the training lookup used an empty column name, which
# raises KeyError).
# NOTE(review): 'target_column' looks like a placeholder -- the head() preview
# of the preprocessed files shows only AGE, BMI and density. Set TARGET_COLUMN
# to the real label column before running this cell.
TARGET_COLUMN = 'target_column'

y_train = train_data[TARGET_COLUMN]
X_train = train_data.drop(columns=TARGET_COLUMN)

y_cv = cv_data[TARGET_COLUMN]
X_cv = cv_data.drop(columns=TARGET_COLUMN)

y_test = test_data[TARGET_COLUMN]
X_test = test_data.drop(columns=TARGET_COLUMN)

In [None]:
# Prepend an all-ones column to each feature matrix so that theta[0] acts as
# the bias (intercept) term. The results are plain NumPy arrays.
# Note: the X_* names are rebound here, so re-running this cell without first
# re-running the split cell prepends yet another ones column.
X_train = np.hstack((np.ones((len(X_train), 1)), X_train))
X_cv = np.hstack((np.ones((len(X_cv), 1)), X_cv))
X_test = np.hstack((np.ones((len(X_test), 1)), X_test))

In [None]:
# Start from an all-zeros parameter vector with one entry per column of the
# training matrix (the bias column counts as one of them).
num_parameters = X_train.shape[1]
theta = np.zeros(num_parameters)

In [None]:
# Hyperparameters handed to logistic_gradient_descent in the training cell.
# Presumably `alpha` is the learning rate and `num_iterations` the number of
# update steps -- confirm against that function's signature.
alpha, num_iterations = 1e-2, 1_000

In [None]:
# Fit the parameters with the project's gradient-descent routine.
# `theta` is both the starting point passed in and the name the result is bound
# to, so re-running this cell alone feeds the previous result back in.
# NOTE(review): assumes the routine returns only the parameter vector -- confirm.
theta = logistic_gradient_descent(
    X_train,
    y_train,
    theta,
    alpha,
    num_iterations,
)

In [None]:
# Score every split: pass the linear term X . theta through the project's
# `sigmoid` helper to obtain the model output for each row.
y_train_pred = sigmoid(X_train @ theta)
y_cv_pred = sigmoid(X_cv @ theta)
y_test_pred = sigmoid(X_test @ theta)

In [None]:
# Evaluate the project's logistic_cost_function on each split; it is called
# with (labels, predictions) -- see its module for the exact formula.
cost_train = logistic_cost_function(y_train, y_train_pred)
cost_cv = logistic_cost_function(y_cv, y_cv_pred)
cost_test = logistic_cost_function(y_test, y_test_pred)

# Report the fitted parameters followed by the cost per split, one per line.
print(
    f"Final theta: {theta}",
    f"Cost on training set: {cost_train}",
    f"Cost on cross-validation set: {cost_cv}",
    f"Cost on test set: {cost_test}",
    sep="\n",
)