## Question 3

In [27]:
import numpy as np

In [28]:
class LogRegSGD:
    """Binary logistic regression (no intercept term) trained with mini-batch SGD.

    The model is ``p(y = 1 | x) = sigmoid(x @ weights)``. Training minimises the
    mean binary cross-entropy with a fixed learning rate, reshuffling the data
    at the start of every epoch.

    Parameters
    ----------
    fixed_learning_rate : float
        Step size applied to every gradient update.
    batch_size : int
        Number of samples per mini-batch (the last batch of an epoch may be smaller).
    max_iters : int
        Number of epochs, i.e. full passes over the training data.
    random_state : int or None
        Seed for weight initialisation and shuffling. ``None`` (default) draws
        from NumPy's global ``np.random`` stream, so ``np.random.seed`` still
        controls the run.
    log_every : int or None
        Print the full-dataset loss every ``log_every`` epochs. A falsy value
        disables logging.
    """

    def __init__(self, fixed_learning_rate = 0.01, batch_size = 10, max_iters = 10000,
                 random_state = None, log_every = 100):
        # A zero step makes range() raise an obscure error and a negative step
        # silently skips every batch, so reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        self.learning_rate = fixed_learning_rate
        self.batch_size = batch_size
        self.max_iterations = max_iters
        self.random_state = random_state
        self.log_every = log_every
        self.weights = None  # set by train()

    def compute_loss(self, preds, t):
        """Return the mean binary cross-entropy between probabilities and targets.

        A small epsilon is added inside each logarithm so that probabilities of
        exactly 0 or 1 do not produce ``-inf``.
        """
        e = 1e-9
        return -np.mean(t * np.log(preds + e) + (1 - t) * np.log(1 - preds + e))

    def sigmoid(self, s):
        """Return the logistic function ``1 / (1 + exp(-s))`` element-wise.

        ``exp`` is only ever evaluated at non-positive arguments, so large
        negative inputs no longer overflow.
        """
        s = np.asarray(s, dtype=float)
        exp_neg_abs = np.exp(-np.abs(s))
        result = np.where(s >= 0, 1.0 / (1.0 + exp_neg_abs), exp_neg_abs / (1.0 + exp_neg_abs))
        # Indexing with () turns a 0-d result back into a NumPy scalar and
        # leaves n-d arrays as they are.
        return result[()]

    def _require_trained(self):
        """Raise a clear error if the model is used before train() was called."""
        if self.weights is None:
            raise RuntimeError("LogRegSGD has not been trained yet; call train() first.")

    def train(self, features, labels):
        """Fit ``self.weights`` with mini-batch SGD.

        Parameters
        ----------
        features : array-like of shape (num_samples, num_features)
        labels : array-like with num_samples entries
            Binary targets. Column vectors are flattened so the residual is
            always one value per sample.

        Raises
        ------
        ValueError
            If ``features`` is not 2-D, is empty, or its row count does not
            match the number of labels.
        """
        features = np.asarray(features, dtype=float)
        # Flatten so that (n, 1) labels do not broadcast the residual to (b, b).
        labels = np.asarray(labels, dtype=float).reshape(-1)
        if features.ndim != 2:
            raise ValueError(f"features must be 2-D, got shape {features.shape}")
        num_samples, num_features = features.shape
        if num_samples == 0:
            raise ValueError("cannot train on an empty dataset")
        if labels.shape[0] != num_samples:
            raise ValueError(
                f"features has {num_samples} rows but labels has {labels.shape[0]} entries"
            )

        # The np.random module and a RandomState instance both expose
        # randn / permutation, so the training loop is identical either way.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        self.weights = rng.randn(num_features) * 0.001

        for iteration in range(self.max_iterations):
            shuffled_indices = rng.permutation(num_samples)
            shuffled_features = features[shuffled_indices]
            shuffled_labels = labels[shuffled_indices]

            for batch_start in range(0, num_samples, self.batch_size):
                batch_end = batch_start + self.batch_size
                X_batch = shuffled_features[batch_start:batch_end]
                y_batch = shuffled_labels[batch_start:batch_end]

                predictions = self.sigmoid(np.dot(X_batch, self.weights))
                gradient = np.dot(X_batch.T, predictions - y_batch) / len(y_batch)
                self.weights -= self.learning_rate * gradient

            # The full-dataset loss is only needed for the progress line, so it
            # is computed only on epochs that are actually logged.
            if self.log_every and iteration % self.log_every == 0:
                full_predictions = self.sigmoid(np.dot(features, self.weights))
                loss = self.compute_loss(full_predictions, labels)
                print(f"Iteration {iteration}, Loss: {loss:.4f}")

    def predict_probabilities(self, features):
        """Return ``sigmoid(features @ weights)`` for each row of ``features``."""
        self._require_trained()
        return self.sigmoid(np.dot(np.asarray(features, dtype=float), self.weights))

    def predict(self, features):
        """Return hard 0/1 labels by thresholding the predicted probability at 0.5."""
        return (self.predict_probabilities(features) >= 0.5).astype(int)

## Question 4

In [29]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [30]:
# Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 30 % of the samples, then halve that hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit the scaler on the training split only, then apply it to every split.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(split) for split in (X_train, X_val, X_test))

# Class counts over the combined training and validation labels.
unique, counts = np.unique(np.hstack([y_train, y_val]), return_counts=True)
class_size = {label: count for label, count in zip(unique, counts)}
class_size

{0: 186, 1: 297}

In [31]:
# train() draws its initial weights and per-epoch shuffles from NumPy's global
# RNG; seed it so the reported metrics are repeatable under Restart & Run All.
np.random.seed(42)

model = LogRegSGD(fixed_learning_rate = 0.01, batch_size = 24, max_iters = 1000)
model.train(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

accuracy, precision, recall, f1

Iteration 0, Loss: 0.4840
Iteration 100, Loss: 0.0990
Iteration 200, Loss: 0.0825
Iteration 300, Loss: 0.0748
Iteration 400, Loss: 0.0701
Iteration 500, Loss: 0.0667
Iteration 600, Loss: 0.0641
Iteration 700, Loss: 0.0621
Iteration 800, Loss: 0.0605
Iteration 900, Loss: 0.0591


(0.9883720930232558, 1.0, 0.9833333333333333, 0.9915966386554621)

The model correctly classified 98.84% of the test samples. Precision was 100%, which means the model produced no false positives: every sample it predicted as positive was actually positive. Recall was 98.33%, which means the model identified 98.33% of all actual positive cases. The training loss decreased from 0.4840 after the first epoch to 0.0591 at epoch 900 (the last value logged during the 1000-epoch run), showing that the model learned effectively.