In [539]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd

In [540]:
df = pd.read_csv('wdbc.csv')
df.columns = [f'column_{i+1}' for i in range(df.shape[1])]
df.column_2 = df.column_2.map({'M': 0, 'B': 1})

The column with the target values is column_2 for M = Malignanant and B = Benign. Since this is a two class case will will use One-Hot encoding.

In [541]:
train_df, val_df = train_test_split(df, test_size=0.25, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.4, random_state=42)

In [542]:
y_train = train_df['column_2'].values
y_val = val_df['column_2'].values

X_train = train_df.drop(columns=['column_2']).values
X_val = val_df.drop(columns=['column_2']).values

y_test = test_df['column_2'].values
X_test = test_df.drop(columns=['column_2']).values

def class_0(target):
    return np.array([i for i in target if i == 0])

def class_1(target):
    return np.array([i for i in target if i == 1])

print(class_0(y_train).shape)
print(class_0(y_val).shape)

print(class_1(y_train).shape)
print(class_1(y_val).shape)

(156,)
(34,)
(270,)
(51,)


156 Malignanants and 270 Benigns in the training set

34 Malignanants and 51 Benigns in the validation set

In this homework we will implement a binary logistic regression model using mini-batch Stochastic Gradient Descent

**Error Function**

$E(\textbf{w}) = -\ln{p(\textbf{t}|\textbf{w})} = - \sum_{n=1}^{N} (t_n \ln{y_n} + (1-t_n) \ln{(1-y_n)}$

**Gradient of Error Function**

$\nabla E(\textbf{w}) = \sum_{n=1}^{N} (y_n - t_n)\phi_n $

**Note:**

$y_n = \sigma (\textbf{w}^T \phi_n)$

the sigmoid activation function



**The Mini-Batch SGD algortithm:**

Input: $w^{(0)}$, learning rate $\eta_k \geq 0$, max number of iterations $k$


for $k = 0, \ldots,k-1$ do

Randomly select a batch $B_k \subset \{1,\ldots,N\}$ where $|B_k| << N$

$w^{(k+1)} = w^{(k)} - \eta_k \sum_{n \in B_k} \nabla f_n(w^{k})$

return $w^{(k)}$

In this case $\nabla f_n(w^{k}) = \nabla E(\textbf{w}) = \sum_{n=1}^{N} (y_n - t_n)\phi_n $

So we have $w^{(k+1)} = w^{(k)} - \eta_k \sum_{n \in B_k} (y_n - t_n)\phi_n$

**Fully expanded**: $w^{(k+1)} = w^{(k)} - \eta_k \sum_{n \in B_k} (\sigma (\textbf{w}^T \phi_n) - t_n)\phi_n$

In [543]:
print(X_train.shape)
print(y_train.shape)
w_init = np.zeros((X_train.shape[1]+1))
w_init1 = np.random.randn(X_train.shape[1]+1)
print(w_init.shape)

(426, 31)
(426,)
(32,)


In [544]:
def sigmoid(z):
    return np.clip(1 / (1 + np.exp(-z)), 1e-15, 1 - 1e-15)

def batch(X,y,batch_size):
    n_samples = X.shape[0]
    N = np.arange(n_samples)
    N_r = np.random.permutation(N)
    batch_indices = N_r[:batch_size]
    X_batch = X[batch_indices]
    y_batch = y[batch_indices]
    return X_batch, y_batch


print(batch(X_train, y_train, 10)[1])

def SGD(X,y,batch_size,eta,max_iter):
    X = np.c_[np.ones(X.shape[0]), X]
    n_samples, n_features = X.shape 
    w = np.random.normal(loc=0.0,scale=1.0,size=n_features) #this samples from a normal distribution with mean 0 and std 1`
    for i in range(0,max_iter,batch_size):
        X_batch, y_batch = batch(X,y,batch_size)
        w = w - eta * (X_batch.T @ (sigmoid(X_batch @ w) - y_batch)) / batch_size
    return w

# Hyperparameters to play with: #

batch_size = 32
eta = 0.00001
max_iter = 10000

################################

w_star = SGD(X_train, y_train, batch_size, eta, max_iter)




[0 1 0 0 1 0 0 0 1 1]


  return np.clip(1 / (1 + np.exp(-z)), 1e-15, 1 - 1e-15)


In [545]:
def logistic_regression(X,y,w):
    X = np.c_[np.ones(X.shape[0]), X]
    return -1*np.sum(y*np.log(sigmoid(X@w)+1e-15)+(1-y)*np.log(1-sigmoid(X@w)+1e-15)) / X.shape[0]

def rmse(X,y,w,N):
    return np.sqrt((2*logistic_regression(X,y,w)/N))

print(rmse(X_test,y_test,w_star,len(y_test))) #error on training set

0.6614600969770853


In [546]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

y_pred_prob = sigmoid(np.c_[np.ones(X_test.shape[0]), X_test] @ w_star)
y_pred = (y_pred_prob >= 0.5).astype(int)

print('Accuracy: ', accuracy_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('RMSE: ', rmse(X_test, y_test, w_star, len(y_val)))

Accuracy:  0.631578947368421
F1 Score:  0.7741935483870968
Precision:  0.631578947368421
Recall:  1.0
RMSE:  0.5416660487780195
