# Logistic regression revisited

### Logistic regression for binary classification

$h_w(x) \approx P(y=1 \vert x; w)$, the probability that $y = 1$ given $x$, parametrized by $w$.

$$
h_w(x) = \sigma(b + w_1\cdot x_1 + w_2\cdot x_2 + \ldots + w_n\cdot x_n) = \sigma(w^T\cdot x + b),
$$
where
$$
\sigma(x) = \frac{1}{1 + e^{-x}}.
$$

If $h_w(x) = p$, then the odds are
$$
\frac{p}{1-p} = \text{e}^{w^T\cdot x + b},
$$
and the log-odds (logit) is
$$
\log\frac{p}{1-p} = w^T\cdot x + b
$$

The two classes are separated by a hyperplane: 
$$
y = 1 \iff w^T\cdot x + b \geq 0.
$$

The cost function is derived from the likelihood function used in maximum likelihood (ML) estimation:

$$
L(w, b) = \prod_{x\colon y=1}h_w(x)\cdot\prod_{x\colon y=0}(1 - h_w(x)) \rightarrow \text{max!},
$$
that is, the (mean) negative log-likelihood function should be minimized:
$$
J(w, b) = \frac{1}{m}\sum_{i=1}^m\left(-y^{(i)}\cdot\log h_w(x^{(i)}) - (1 - y^{(i)})\cdot\log (1 - h_w(x^{(i)}))\right)
$$

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as mcs

import tensorflow as tf
import tensorflow.keras as keras

In [None]:
# Load the first data set: every non-empty line holds "x0,x1,label".
# np.loadtxt replaces the hand-rolled line splitting: it skips blank lines and
# accepts labels written either as "1" or as "1.0". ndmin=2 keeps the result
# two-dimensional even when the file contains a single row.
data = np.loadtxt("./data/log_reg_1.txt", delimiter=',', ndmin=2)

X = data[:, :2]                # feature matrix, shape (m, 2)
y = data[:, [2]].astype(int)   # labels as a column vector, shape (m, 1)

In [None]:
# Scatter plot of the samples: first feature against second, coloured by label.
feature_0, feature_1 = X[:, 0], X[:, 1]
plt.figure(figsize=(12, 8))
plt.scatter(feature_0, feature_1, c=y)
plt.show()

In [None]:
def initialize_params(X):
    """Return zero-initialised parameters for a linear model on ``X``.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Design matrix; only its number of columns is used.

    Returns
    -------
    w0 : ndarray of shape (n, 1), dtype float64
        Weight column vector filled with zeros.
    b : float
        Bias, 0.0.
    """
    _, nr_features = X.shape
    # np.float64 instead of the alias np.float_, which NumPy 2.0 removed.
    w0 = np.zeros((nr_features, 1), dtype=np.float64)
    b = 0.0
    return w0, b


def activation(Z):
    """Element-wise logistic sigmoid ``1 / (1 + exp(-Z))``.

    Evaluated in a numerically stable form: only ``exp(-|Z|)`` is computed,
    which lies in (0, 1], so large negative inputs no longer overflow.

    Parameters
    ----------
    Z : array_like
        Logits of any shape (a scalar is accepted too).

    Returns
    -------
    Sigmoid of ``Z`` with the same shape as ``Z``.
    """
    Z = np.asarray(Z)
    e = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + e^-Z);  Z < 0: e^Z / (1 + e^Z) -- the same function.
    return np.where(Z >= 0, 1.0, e) / (1.0 + e)


def predict(X, w, b):
    """Classify every row of ``X`` as 0.0 or 1.0.

    A sample is assigned to class 1 exactly when its logit ``X @ w + b`` is
    non-negative, which is the same as its sigmoid being at least 0.5.
    Thresholding the logit avoids ``np.round``'s round-half-to-even rule,
    which mapped a probability of exactly 0.5 to class 0.

    Parameters
    ----------
    X : ndarray of shape (m, n)
    w : ndarray of shape (n, 1)
    b : float

    Returns
    -------
    ndarray of shape (m, 1), dtype float64, with entries 0.0 or 1.0.
    """
    Z = np.matmul(X, w) + b
    return (Z >= 0).astype(np.float64)


def calc_gradient(X, y, w, b):
    """Return the mean cross-entropy cost and its gradients w.r.t. ``w`` and ``b``.

    Parameters
    ----------
    X : ndarray of shape (m, n)
    y : ndarray of shape (m, 1) with 0/1 labels
    w : ndarray of shape (n, 1)
    b : float

    Returns
    -------
    cost : float
        Mean negative log-likelihood over the m samples.
    dw : ndarray of shape (n, 1)
    db : float
    """
    m = len(X)
    Z = np.matmul(X, w) + b
    A = activation(Z)
    # log(1 + e^Z) - y*Z equals -y*log(A) - (1 - y)*log(1 - A), but it never
    # evaluates log(0), so the cost stays finite when the sigmoid saturates.
    cost = (1 / m) * np.sum(np.logaddexp(0, Z) - np.multiply(y, Z))

    dZ = A - y
    dw = (1 / m) * np.matmul(X.T, dZ)
    db = (1 / m) * np.sum(dZ)
    return cost, dw, db

In [None]:
def optimize(X, y, alpha, nr_iterations=10000):
    """Fit the logistic-regression parameters by batch gradient descent.

    Starts from the parameters given by ``initialize_params`` and takes
    ``nr_iterations`` steps of size ``alpha`` along the negative gradient
    returned by ``calc_gradient``.

    Returns
    -------
    costs : list
        Cost measured before each update, one entry per iteration.
    w, b :
        The parameters after the last update.
    """
    weights, bias = initialize_params(X)
    cost_history = []
    for _step in range(nr_iterations):
        step_cost, grad_w, grad_b = calc_gradient(X, y, weights, bias)
        cost_history.append(step_cost)
        weights, bias = weights - alpha * grad_w, bias - alpha * grad_b
    return cost_history, weights, bias

In [None]:
# Standardise every feature column: subtract the column mean and divide by the
# column's sample standard deviation (ddof=1).
m = X.mean(axis=0)
s = X.std(axis=0, ddof=1)


X = np.divide(X - m, s)

In [None]:
# Run gradient descent with learning rate alpha; keep the per-iteration costs
# and the fitted parameters.
alpha = 0.01
costs, w, b = optimize(X, y, alpha, nr_iterations=10000)

In [None]:
# Learning curve: cost per iteration, with the y-axis clipped to [0, 0.7].
figure = plt.figure(figsize=(16, 8))
axes = figure.gca()
axes.plot(costs)
axes.set_ylim(0, 0.7)
plt.show()

In [None]:
# Hard 0/1 predictions for accuracy and the confusion matrix.
predictions = predict(X, w, b)
# ROC AUC ranks samples by score, so it is given the predicted probabilities;
# thresholded labels would collapse the curve to a single operating point.
probabilities = activation(np.matmul(X, w) + b)


# scikit-learn metrics take (y_true, y_pred / y_score) in that order.
print(mcs.accuracy_score(y, predictions))
print(mcs.roc_auc_score(y, probabilities))


mcs.confusion_matrix(y, predictions)


### Logistic regression in TensorFlow (2.x)

In [None]:
def optimize(X, y, alpha, nr_epochs):
    """Full-batch gradient descent for logistic regression with hand-derived gradients.

    The gradients of the mean binary cross-entropy are computed explicitly
    (``dZ = y_hat - y``) and handed to an SGD optimizer, which applies the
    update to ``w`` and ``b``.

    Returns
    -------
    losses : list
        Loss tensor of every epoch, evaluated before that epoch's update.
    w, b : tf.Variable
        Fitted weights of shape (n, 1) and scalar bias.
    """
    nr_samples, nr_features = X.shape
    w = tf.Variable(tf.zeros((nr_features, 1), dtype=np.float64))  # weights start at zero
    b = tf.Variable(0.0, dtype=np.float64)                         # bias starts at zero

    optimizer = tf.optimizers.SGD(learning_rate=alpha)
    scale = 1 / nr_samples
    losses = []
    for _epoch in range(nr_epochs):
        y_hat = tf.sigmoid(tf.matmul(X, w) + b)
        per_sample_loss = tf.losses.binary_crossentropy(y, y_hat)
        losses.append(tf.reduce_mean(per_sample_loss))

        # Analytic gradients of the mean cross-entropy.
        dZ = y_hat - y
        dw = scale * tf.matmul(tf.transpose(X), dZ)
        db = scale * tf.reduce_sum(dZ)
        # The optimizer updates w and b in place.
        optimizer.apply_gradients(zip([dw, db], [w, b]))
    return losses, w, b

In [None]:
# Wrap the NumPy data in float64 TensorFlow variables.
X_train = tf.Variable(X, dtype=tf.float64)
Y_train = tf.Variable(y, dtype=tf.float64)

costs, W, b = optimize(X_train, Y_train, 0.01, 10000)


# Class predictions: the fitted sigmoid output rounded to 0 or 1.
logits = tf.matmul(X_train, W) + b
predictions = tf.round(tf.sigmoid(logits))

In [None]:
# Turn the per-epoch loss tensors into plain floats for plotting. float() also
# accepts values that are already numeric, so re-running this cell is harmless;
# calling .numpy() a second time raised AttributeError.
costs = [float(cost) for cost in costs]

figure = plt.figure(figsize=(16, 8))
plt.plot(costs)
plt.ylim(0, 0.7)
plt.show()

In [None]:
# ROC AUC ranks samples by score, so it is given the predicted probabilities
# rather than the rounded class labels.
probabilities = tf.sigmoid(tf.matmul(X_train, W) + b).numpy()

# scikit-learn metrics take (y_true, y_pred / y_score) in that order.
print(mcs.accuracy_score(y, predictions))
print(mcs.roc_auc_score(y, probabilities))


mcs.confusion_matrix(y, predictions)

In [None]:
def optimize(X, y, alpha, nr_epochs):
    """Full-batch gradient descent for logistic regression using ``tf.GradientTape``.

    The tape records the forward pass so that TensorFlow derives the gradients
    of the mean binary cross-entropy automatically; an SGD optimizer applies
    them to ``w`` and ``b``.

    Returns
    -------
    losses : list
        Loss tensor of every epoch, evaluated before that epoch's update.
    w, b : tf.Variable
        Fitted weights of shape (n, 1) and scalar bias.
    """
    _, n = X.shape
    w = tf.Variable(tf.zeros((n, 1), dtype=np.float64))  # weights start at zero
    b = tf.Variable(0.0, dtype=np.float64)               # bias starts at zero

    optimizer = tf.optimizers.SGD(learning_rate=alpha)
    losses = []
    for _ in range(nr_epochs):
        # Only the forward pass has to be recorded on the tape.
        with tf.GradientTape() as g:
            y_hat = tf.sigmoid(tf.add(tf.matmul(X, w), b))
            loss = tf.reduce_mean(tf.losses.binary_crossentropy(y, y_hat))
        # Differentiate and update after leaving the tape context, so neither
        # the gradient computation nor the update runs while recording.
        gradients = g.gradient(loss, [w, b])  # here is where dw and db are calculated
        optimizer.apply_gradients(zip(gradients, [w, b]))  # w and b are updated under the hood
        losses.append(loss)
    return losses, w, b

In [None]:
# Features as float64, labels kept as integers.
X_train = tf.Variable(X, dtype=np.float64)
Y_train = tf.Variable(y, dtype=np.int_)

costs, W, b = optimize(X_train, Y_train, 0.01, 10000)


# Class predictions: the fitted sigmoid output rounded to 0 or 1.
logits = tf.matmul(X_train, W) + b
predictions = tf.round(tf.nn.sigmoid(logits))

In [None]:
# Turn the per-epoch loss tensors into plain floats for plotting. float() also
# accepts values that are already numeric, so re-running this cell is harmless;
# calling .numpy() a second time raised AttributeError.
costs = [float(cost) for cost in costs]

figure = plt.figure(figsize=(16, 8))
plt.plot(costs)
plt.ylim(0, 0.7)
plt.show()

In [None]:
# ROC AUC ranks samples by score, so it is given the predicted probabilities
# rather than the rounded class labels.
probabilities = tf.sigmoid(tf.matmul(X_train, W) + b).numpy()

# scikit-learn metrics take (y_true, y_pred / y_score) in that order.
print(mcs.accuracy_score(y, predictions))
print(mcs.roc_auc_score(y, probabilities))


mcs.confusion_matrix(y, predictions)

### Logistic regression in Keras

![](images/log_reg_nn.png)

In [None]:
# Clear the Keras backend session before the models below are built.
keras.backend.clear_session()

In [None]:
# Logistic regression expressed as a one-neuron network: a single Dense unit
# with sigmoid activation on the two input features. The input shape is
# declared with an explicit Input layer rather than `input_shape=` on Dense.
model = keras.Sequential([
    keras.layers.Input(shape=(2,)),
    keras.layers.Dense(units=1, activation='sigmoid')
])


optimizer = keras.optimizers.SGD(learning_rate=0.01)


# `metrics` is passed as a list; newer Keras releases reject a bare string.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 1000 epochs without progress output; the returned history object
# is kept so the loss curve can be plotted afterwards.
history = model.fit(X, y, verbose=0, epochs=1000)

In [None]:
# Print the model's layer overview.
model.summary()

In [None]:
# Training loss per epoch, with the y-axis clipped to [0, 0.7].
figure = plt.figure(figsize=(16, 8))
axes = figure.gca()
axes.plot(history.history['loss'])
axes.set_ylim(0, 0.7)
plt.show()

In [None]:
# Keep the raw sigmoid outputs: ROC AUC ranks samples by score, so it is given
# probabilities, while accuracy and the confusion matrix use the rounded labels.
probabilities = model.predict(X)
predictions = np.round(probabilities)


# scikit-learn metrics take (y_true, y_pred / y_score) in that order.
print(mcs.accuracy_score(y, predictions))
print(mcs.roc_auc_score(y, probabilities))


mcs.confusion_matrix(y, predictions)

### Adding hidden layers for triangle example

In [None]:
# Load the triangle data set: every non-empty line holds "x0,x1,label".
# np.loadtxt replaces the hand-rolled line splitting: it skips blank lines and
# parses float-formatted labels such as "1.0". ndmin=2 keeps the result
# two-dimensional even when the file contains a single row.
data = np.loadtxt("./data/log_reg_3.txt", delimiter=',', ndmin=2)


X = data[:, :2]                # feature matrix, shape (m, 2)
y = data[:, [2]].astype(int)   # labels as a column vector, shape (m, 1)

In [None]:
# Scatter plot of the samples: first feature against second, coloured by label.
feature_0, feature_1 = X[:, 0], X[:, 1]
plt.figure(figsize=(12, 8))
plt.scatter(feature_0, feature_1, c=y)
plt.show()

![](images/simple_two_layer_nn.png)

In [None]:
# Two-layer network: three sigmoid hidden units followed by a single sigmoid
# output unit for the binary label.
model = keras.Sequential([
    keras.layers.Input(shape=(2,)),
    keras.layers.Dense(units=3, activation='sigmoid'),  # try different activation, different units
    keras.layers.Dense(units=1, activation='sigmoid')
])


optimizer = keras.optimizers.Adam(learning_rate=0.01)

# `metrics` is passed as a list; newer Keras releases reject a bare string.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the model's layer overview.
model.summary()

In [None]:
# Train without progress output and keep the returned history object.
history = model.fit(X, y, verbose=0, epochs=200)  # try fewer/more epochs

In [None]:
# Keep the raw sigmoid outputs: ROC AUC ranks samples by score, so it is given
# probabilities, while accuracy and the confusion matrix use the rounded labels.
probabilities = model.predict(X)
predictions = np.round(probabilities)


# scikit-learn metrics take (y_true, y_pred / y_score) in that order.
print(mcs.accuracy_score(y, predictions))
print(mcs.roc_auc_score(y, probabilities))


mcs.confusion_matrix(y, predictions)

In [None]:
# Regular grid with spacing h covering the data range, extended by one unit on
# every side; meshpoints lists the grid nodes as (x, y) rows.
h = 0.01
first_feature, second_feature = X[:, 0], X[:, 1]
x_min = first_feature.min() - 1
x_max = first_feature.max() + 1
y_min = second_feature.min() - 1
y_max = second_feature.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
meshpoints = np.c_[xx.ravel(), yy.ravel()]

In [None]:
# Model output at every grid node, reshaped back onto the grid.
Z = model.predict(meshpoints).reshape(xx.shape)

# Filled contours of the model output with the samples drawn on top.
plt.figure(figsize=(16, 10))
plt.contourf(xx, yy, Z, alpha=0.1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.show()

### Activation function

* no activation in hidden layers (does it do anything useful?)
* sigmoid activation (dominant approach before 2014), still OK for the last layer for binary classification
* $\tanh$ activation (dominant approach between 2015 and 2016)
* $\text{relu}(x) = \max\{x, 0\}$
* leaky relu
* softmax for multiclass classification