# Advanced Optimization: Newton’s Method and Adam

## Abstract

In [None]:
%load_ext autoreload
%autoreload 2

from newton_logistic import LogisticRegression, NewtonOptimizer, GradientDescentOptimizer
import torch

## Newton Experiments

### Getting Data

For my external dataset, I used a [heart prediction dataset](https://www.kaggle.com/datasets/shantanugarg274/heart-prediction-dataset-quantum) from Kaggle. 

In [None]:
from pathlib import Path

import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

# Download the dataset from Kaggle; the returned value is used below as the
# local directory that holds the dataset files.
path = kagglehub.dataset_download("shantanugarg274/heart-prediction-dataset-quantum")
print("Path to dataset files:", path)

# Join the directory and file name with pathlib instead of string
# concatenation so the separator is correct on every platform.
data_path = Path(path) / "Heart Prediction Quantum Dataset.csv"
df = pd.read_csv(data_path)

Here I found a dataset on heart disease prediction from Kaggle. The data came in a single CSV file with 7 columns representing age, gender, blood pressure, cholesterol, heart rate, a quantum pattern feature, and heart disease.

In [None]:
# Feature matrix: every column except the "HeartDisease" label.
X_data = df.drop(columns=["HeartDisease"]).values
# Target vector: the "HeartDisease" column on its own.
y_data = df.loc[:, "HeartDisease"].values

Since I intend to predict heart disease, I removed that column from the feature matrix and used it as the target vector.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the feature columns, then move features and labels into
# float32 torch tensors.
# NOTE(review): the scaler is fit on the full dataset here, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling.
scaler = StandardScaler()

X_data = torch.tensor(scaler.fit_transform(X_data), dtype=torch.float32)
y_data = torch.tensor(y_data, dtype=torch.float32)

The features varied widely in range, so I used scikit-learn's StandardScaler to standardize them and then converted both the features and the targets into tensors. The model was trained on the training set, and the loss was computed for both the training and test sets.

In [None]:
# Hold out 40% of the samples as the test set; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.4,
    random_state=42,
)


I also used train_test_split to split the data into a 60% training set and a 40% test set.

### Experiment 1: Alpha Convergence

In [None]:
import matplotlib.pyplot as plt

# Experiment settings: number of optimizer steps and the step size passed to
# every NewtonOptimizer.step call.
iterations = 100
alpha = 0.1

LR = LogisticRegression()
opt = NewtonOptimizer(LR)

# Loss histories; entry k is recorded *before* the k-th update is applied.
loss_train, loss_test = [], []

for _ in range(iterations):
    loss_train.append(LR.loss(X_train, y_train).item())
    loss_test.append(LR.loss(X_test, y_test).item())
    opt.step(X_train, y_train, alpha)

# Plot both loss curves on one set of axes (1-based iteration index on x).
steps = torch.arange(1, iterations + 1)
fig, ax = plt.subplots(figsize=(6, 6))
for history, colour in ((loss_train, "black"), (loss_test, "orange")):
    ax.plot(steps, history, color=colour)
ax.set(xlabel="Iterations", ylabel="Loss", title="Loss vs Iterations")
ax.legend(["Train Loss", "Test Loss"])


With an alpha of 0.1, both training and testing loss converge between 20 and 40 iterations. Testing loss converges slightly earlier, and at a slightly higher value, than training loss.

### Experiment 2: Newton vs Gradient descent

In [None]:
import matplotlib.pyplot as plt

def _track_losses(model, optimizer, n_iter, step_size, **step_kwargs):
    """Run ``n_iter`` optimizer steps and record the loss before each one.

    Uses the notebook-level ``X_train``/``y_train`` for the updates and
    ``X_test``/``y_test`` for the held-out loss.

    Args:
        model: Object whose ``loss(X, y)`` result supports ``.item()``.
        optimizer: Object with ``step(X, y, step_size, **step_kwargs)``.
        n_iter: Number of update steps to perform.
        step_size: Value forwarded as the third positional ``step`` argument.
        **step_kwargs: Extra keyword arguments forwarded to ``step``.

    Returns:
        A ``(train_losses, test_losses)`` pair of lists of Python floats,
        each with ``n_iter`` entries.
    """
    train_losses, test_losses = [], []
    for _ in range(n_iter):
        train_losses.append(model.loss(X_train, y_train).item())
        test_losses.append(model.loss(X_test, y_test).item())
        optimizer.step(X_train, y_train, step_size, **step_kwargs)
    return train_losses, test_losses


def _plot_loss_curves(axis, train_losses, test_losses, title):
    """Draw train (black) and test (orange) loss curves on ``axis``."""
    xs = torch.arange(1, len(train_losses) + 1)  # 1-based iteration index
    axis.plot(xs, train_losses, color="black")
    axis.plot(xs, test_losses, color="orange")
    axis.set_xlabel("Iterations")
    axis.set_ylabel("Loss")
    axis.set_title(title)
    axis.legend(["Train Loss", "Test Loss"])


# Both optimizers share the same iteration budget and step size.
iterations = 100
alpha = 0.1

# Newton's method.
LR = LogisticRegression()
optn = NewtonOptimizer(LR)
n_loss_train, n_loss_test = _track_losses(LR, optn, iterations, alpha)

# Gradient descent on a fresh model; beta=0.9 is passed through to step().
LR = LogisticRegression()
optg = GradientDescentOptimizer(LR)
g_loss_train, g_loss_test = _track_losses(LR, optg, iterations, alpha, beta=0.9)

# Plotting the loss: one row with two side-by-side panels.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
_plot_loss_curves(ax[0], n_loss_train, n_loss_test, "Newton Optimizer Loss")
_plot_loss_curves(ax[1], g_loss_train, g_loss_test, "Gradient Descent Loss")



In this plot, we see that with the Newton optimizer the loss converges faster than with the gradient descent optimizer. For the Newton optimizer, the loss converges at ~30-35 iterations, whereas for the gradient descent optimizer it converges at ~50-60. Both use an alpha of 0.1, while gradient descent also has a beta of 0.9.

### Experiment 3: Large Alpha

In [None]:
# Same setup as the small-alpha Newton run, but with a much larger step size.
iterations = 100
alpha = 0.9

LR = LogisticRegression()
opts = NewtonOptimizer(LR)

# Loss histories; each entry is recorded before the corresponding update.
loss_train = []
loss_test = []

for _ in range(iterations):
    loss_train.append(LR.loss(X_train, y_train).item())
    loss_test.append(LR.loss(X_test, y_test).item())
    opts.step(X_train, y_train, alpha)

# Plot train and test loss against the 1-based iteration number.
steps = torch.arange(1, iterations + 1)
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(steps, loss_train, color="black")
ax.plot(steps, loss_test, color="orange")
ax.set(xlabel="Iterations", ylabel="Loss", title="Loss vs Iterations")
ax.legend(["Train Loss", "Test Loss"])


## Adam Optimizer

In [None]:
from adam import LogisticRegression, AdamOptimizer

# Adam run: number of steps and the step size handed to AdamOptimizer.step.
iterations = 100
alpha = 0.9

LR = LogisticRegression()
adam = AdamOptimizer(LR)

# Size of the first dimension of the training features (not used again in
# this cell).
n = X_train.shape[0]

# Loss histories; each entry is recorded before the corresponding update.
loss_train, loss_test = [], []

for _ in range(iterations):
    loss_train.append(LR.loss(X_train, y_train).item())
    loss_test.append(LR.loss(X_test, y_test).item())
    adam.step(X_train, y_train, alpha)

# Plot train and test loss against the 1-based iteration number.
steps = torch.arange(1, iterations + 1)
fig, ax = plt.subplots(figsize=(6, 6))
for history, colour in ((loss_train, "black"), (loss_test, "orange")):
    ax.plot(steps, history, color=colour)
ax.set(xlabel="Iterations", ylabel="Loss", title="Loss vs Iterations")
ax.legend(["Train Loss", "Test Loss"])
