In [None]:
#| hide
# Use this cell only in google colab 
#! pip install git+https://github.com/BorjaRequena/Neural-Network-Course.git
#! pip install nbdev

<a href="https://githubtocolab.com/BorjaRequena/Neural-Network-Course/blob/master/lectures_nb/02_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

In [None]:
#| hide
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd
from ipywidgets import interact

from lectures_ml.losses import BCE
from lectures_ml.optimizers import sgd

# Binary classification

In this section, we consider the classification task. In particular, we focus on binary classification, which means that a datapoint can belong either to class $0$ or to class $1$.

Let us consider a concrete dataset, the [toxicity dataset](https://online.stat.psu.edu/stat462/node/208/). In this dataset, scientists studied the toxicity of a substance to a population of insects. To this end, they administered a series of toxic doses and measured the number of deaths in a population of $250$ individuals. The results of the study are shown in @fig-bar.

In [None]:
# Aggregated study results: number of deaths observed at each dose level,
# out of a population of n_ind individuals.
n_ind = 250
data = {'doses': range(1, 7), 'deaths': [28, 53, 93, 126, 172, 197]}
df = pd.DataFrame(data)
# Empirical fraction of the population that died at each dose.
df['p_dying'] = df['deaths'].div(n_ind)

In [None]:
#| label: fig-bar
#| fig-cap: Number of deaths with respect to the number of doses.
#| code-fold: true
# Bar chart of the raw death counts per dose level.
px.bar(df, x='doses', y='deaths')

Here, the goal of the classification is to predict, given a number of doses, whether an individual will be dead (class $1$) or alive (class $0$).

# Logistic regression

We can define a probability of dying for each value of the dose by dividing the number of deaths by the total number of individuals (here $250$). We can therefore see the classification task as fitting this probability distribution. A simple trial function is the sigmoid function

$$ \sigma(x) =\frac{1}{1+e^{-(ax+b)}}$$
and the goal is to find the values of $a$ and $b$ that best fit the data.

::: {.callout-note}

Notice how we transformed our classification problem into a regression problem. We will come back to this probabilistic view of machine learning in the [next lecture](probabilistic_view.ipynb).
:::

In [None]:
def sigmoid(x,a,b):
    """Evaluate the sigmoid 1 / (1 + exp(-(a*x + b))).

    Parameters
    ----------
    x : scalar or array
        Point(s) at which the function is evaluated.
    a, b : scalar
        Slope and offset of the linear term inside the exponential.

    Returns
    -------
    Value(s) of the sigmoid, with the same shape as the broadcast of the inputs.
    """
    z = a*x + b
    return 1/(1 + np.exp(-z))

@fig-fitsigmoid shows a trial fit for the values $a=0.5$ and $b=-2$. You can play around with the values of $a$ and $b$ in the next figure to find a better fit (only in the notebook).

In [None]:
#| label: fig-fitsigmoid
#| fig-cap: Probability of deaths with respect to the number of doses.
#| code-fold: true
# Hand-picked trial parameters for a first fit.
a, b = 0.5, -2

# Dense grid of dose values on which the sigmoid curve is evaluated.
vecd = np.arange(0,7,0.1)

# Empirical death fractions (bars) overlaid with the trial sigmoid (line).
Fig = go.Figure(
    data=[
        go.Bar(x=df['doses'], y=df['p_dying'], name='toxity dataset'),
        go.Scatter(x=vecd, y=sigmoid(vecd,a,b), name=f'sigmoid: a={a}, b={b}'),
    ]
)
Fig

In [None]:
#| echo: false
def hlines(a,b):
    """Show the empirical death fractions together with the sigmoid for the given a and b.

    Serves as the callback of the `interact` sliders below: every call builds
    and shows a fresh figure.
    """
    dose_grid = np.arange(0,7,0.1)
    curve = go.Scatter(
        x=dose_grid,
        y=sigmoid(dose_grid,a,b),
        name="Sigmoid",
        line=dict(color="#00CED1", width=4),
    )

    widget = go.FigureWidget()
    widget.add_bar(x=df['doses'], y=df['p_dying'],name="dataset")
    widget.add_trace(curve)
    # Pin the y-axis to the interval [0, 1].
    widget.update_yaxes(range=[0, 1])
    widget.show()

# One slider per parameter, each running from -5 to 5 in steps of 0.1.
interact(hlines, a=(-5,5,0.1), b=(-5,5,0.1));

# The dataset 

Until now, we have considered the viewpoint of the whole population. In general, however, the dataset is built from the data of each individual, to which a tuple $(x,y)$ is associated, where $x$ is the number of doses and $y$ is the class ($0$: alive, $1$: dead).

In [None]:
# Expand the aggregated counts into one (dose, label) row per individual.
# For every dose level we create n_ind rows; the first `d` of them (the number
# of deaths at that dose) get label 1, the remaining ones keep label 0.
blocks = []
for i, d in zip(df['doses'], df['deaths']):
    vec1 = np.zeros((n_ind,2))
    vec1[:,0], vec1[:d,1] = i, 1
    blocks.append(vec1)

# Stack all dose levels at once. Collecting the blocks in a list avoids relying
# on the first dose being equal to 1 to initialise the accumulator, and avoids
# re-copying the growing array on every iteration.
vec = np.concatenate(blocks)

# Shuffle the rows in place so that samples are no longer ordered by dose/label.
np.random.shuffle(vec)
x, y = vec[:,0], vec[:,1]

In [None]:
# Peek at the first 20 shuffled samples: inputs first, then their labels.
for column in (x, y):
    print(column[:20])

#  Loss function: Binary Cross Entropy

We now want to automatically find the values of $a$ and $b$. To this end, we have to define a loss function. We will here use the binary cross entropy

$$ BCE = -\frac{1}{N}\sum_{i=1}^N \Big[ y_i \log [\sigma(x_i,a,b)] +(1-y_i)\log[1-\sigma(x_i,a,b)] \Big].$$

Let us have a look at the loss landscape of the binary cross entropy with respect to the parameters $a$ and $b$, as depicted in @fig-loss_landscape. We observe that the landscape is convex for this choice of the loss function. In this particular case, the convexity can be proven.

In [None]:
#| code-fold: true
#| code-summary: Code to generate the data for the plot

# Parameter grid: a and b each sweep from -5 (included) to 5 (excluded) in steps of 0.1.
vec_a = np.arange(-5,5,0.1)
vec_b = np.arange(-5,5,0.1)

# matz[i, j] stores the loss evaluated at (a=vec_a[i], b=vec_b[j]).
matz = np.zeros((vec_a.size,vec_b.size))
for i, j in np.ndindex(matz.shape):
    matz[i,j] = BCE(x, y, sigmoid, params=dict(a=vec_a[i], b=vec_b[j]))

In [None]:
#| label: fig-loss_landscape
#| fig-cap: Binary Cross Entropy
#| code-fold: true

# Hover text showing both parameters and the loss value at the cursor.
hover = 'a:%{y:.2f}<br>b:%{x:.2f}</br>f:%{z:.2f}<extra></extra>'

# Rows of matz are indexed by a and columns by b, hence b on the x-axis and a on the y-axis.
fig = go.Figure()
fig.add_contour(x=vec_b, y=vec_a, z=matz, hovertemplate=hover)

# Square canvas with the axes named after the parameters.
fig.update_layout(
    width=600,
    height=600,
    xaxis={'title':'b'},
    yaxis={'title':'a'},
)
fig.show()

::: {.callout-tip}

## Exercise 

What is the value of the loss for your guess? Is it close to the minimum?
:::

#  Gradient of the Loss function

Let us compute the gradient with respect to $a$ and $b$.

::: {.callout-tip}

## Exercise

What is the gradient of the sigmoid function $\sigma(x)$? 
:::

Although the problem is convex, there is no easy way to derive an analytical closed-form solution. This comes from the non-linearity introduced by the sigmoid. Libraries like `scikit-learn` propose different algorithms, e.g., conjugate gradient. In the next section, we will consider the stochastic gradient descent approach.

# Stochastic gradient descent

::: {.callout-tip}
## Exercise

Implement the function `grad_BCE(x, y, p)` where `p=dict(a=a, b=b)` is a dictionary containing the parameters $a$ and $b$. The function should return an array of the form `np.array([grad_a, grad_b])`. 

:::

Having computed the gradients, we can now apply the stochastic gradient descent algorithm.

In [None]:
# Starting point of the optimisation in parameter space.
a0, b0 = 2, 1
pini = {'a': a0, 'b': b0}

# Callables handed to the optimiser: the loss, its gradient and the model function.
# `grad_BCE` is the function the reader is asked to implement in the exercise above.
ll = {'loss': BCE, 'grads': grad_BCE, 'fun': sigmoid}

# `trackers` is indexed below with the keys 'a', 'b' and 'loss'.
trackers = sgd(x, y, pini, ll, niter=100)

@fig-sgd shows the trajectory of the stochastic gradient descent algorithm. 

In [None]:
#| label: fig-sgd
#| fig-cap: Trajectory in the parameter space of the stochastic gradient descent algorithm.
#| code-fold: true

# Bounding box of the recorded trajectory in parameter space.
amin, amax = np.min(trackers['a']), np.max(trackers['a'])
bmin, bmax = np.min(trackers['b']), np.max(trackers['b'])

# Grid resolution: the bounding box is divided into n steps along each axis.
n = 100
stepa = (amax-amin)/n
stepb = (bmax-bmin)/n

# Pad the box on both sides; np.arange excludes its stop value, hence 19 vs 20.
vec_a = np.arange(amin-19*stepa, amax+20*stepa, stepa)
vec_b = np.arange(bmin-19*stepb, bmax+20*stepb,stepb)

# matz[i, j] stores the loss evaluated at (a=vec_a[i], b=vec_b[j]).
matz = np.zeros((vec_a.size,vec_b.size))
for i, j in np.ndindex(matz.shape):
    matz[i,j] = BCE(x, y, sigmoid, params=dict(a=vec_a[i], b=vec_b[j]))

contour_hover = 'a:%{y:.2f}<br>b:%{x:.2f}</br>f:%{z:.2f}<extra></extra>'
path_hover = 'a:%{y:.2f}<br>b:%{x:.2f}</br>f:%{text:.2f}<extra></extra>'

# Loss landscape as background.
fig = go.Figure()
fig.add_contour(z=matz,x=vec_b, y=vec_a, hovertemplate=contour_hover)

# Overlay every 100th recorded step of the optimisation history.
mask = np.arange(0,len(trackers['a']),100)
veca, vecb, vecl = (np.array(trackers[key])[mask] for key in ('a', 'b', 'loss'))
fig.add_scatter(
    x=vecb,
    y=veca,
    name='SGD',
    text=vecl,
    mode='lines+markers',
    hovertemplate=path_hover,
)

fig.show()

# Accuracy

So far, we have not discussed the choice of the metric. The first metric that comes to mind is the accuracy, i.e.

$$ \text{acc}= \frac{\# \text{correct predictions}}{\#\text{dataset}},$$

which considers the ratio between the number of correct predictions and the number of elements in our training set.

Let us compute the accuracy over the course of the training.

In [None]:
# Keep every 100th recorded step of the optimisation history.
mask = np.arange(0, len(trackers['a']), 100)
veca, vecb, vecl = (np.array(trackers[key])[mask] for key in ('a', 'b', 'loss'))

::: {.callout-tip}
## Exercise

Write the function `accuracy(x,y,a,b)`, which returns the accuracy for a given dataset and for the parameters $a$ and $b$. Choose the label with the following rule: $0$ if $\sigma_i<0.5$ else $1$.

:::

In [None]:
#| code-fold: true
def accuracy(x,y,a,b):
    """Return the fraction of samples whose predicted label equals the true label.

    Parameters
    ----------
    x : array
        Inputs of the dataset.
    y : array
        True labels (0 or 1), same length as `x`.
    a, b : scalar
        Parameters passed to `sigmoid`.

    Returns
    -------
    float
        Number of correct predictions divided by the number of samples.

    The predicted label follows the rule stated in the exercise: 0 if
    sigmoid(x, a, b) < 0.5, else 1. In particular a value of exactly 0.5 is
    assigned to class 1.
    """
    # Threshold the probabilities; cast to float so labels compare as 0.0 / 1.0.
    yp = (sigmoid(x, a, b) >= 0.5).astype(float)
    return np.sum(yp==y)/y.size

In [None]:
# One accuracy value per retained (a, b) pair of the optimisation history.
vec_acc = np.zeros_like(veca)

# NOTE: the loop variables `a` and `b` are module-level names here; after the
# loop they stay bound to the last pair of (veca, vecb), overwriting any value
# `a` and `b` had before this cell.
for i,(a,b) in enumerate(zip(veca,vecb)):
    vec_acc[i] = accuracy(x,y,a,b)

In [None]:
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# `vecl` and `vec_acc` only contain every 100th recorded step. Plot them against
# the corresponding step indices stored in `mask`, so that the "iterations" axis
# shows the actual position in the history instead of 0, 1, 2, ...
# NOTE(review): assumes one tracker entry is recorded per SGD iteration - confirm
# against the `sgd` implementation.
fig.add_trace(
    go.Scatter(x=mask, y=vecl, name="Loss"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=mask, y=vec_acc, name="Accuracy"),
    secondary_y=True,
)

# Set x-axis title
fig.update_xaxes(title_text="iterations")

# Set y-axes titles: loss on the primary axis, accuracy on the secondary one
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Accuracy", secondary_y=True)

fig.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Evaluate the last retained step of the optimisation history. Selecting the
# parameters explicitly avoids depending on whatever values `a` and `b` were
# left with by previously executed cells.
a, b = veca[-1], vecb[-1]

# Predicted label: 0 if the sigmoid is below 0.5, else 1 (rule of the accuracy
# exercise). Cast to float so that predictions and labels share the same values.
yp = (sigmoid(x, a, b) >= 0.5).astype(float)

# Rows are true labels, columns are predicted labels. Fixing `labels` keeps the
# matrix 2x2 (order: 0, 1) even if one class is never predicted.
cm = confusion_matrix(y, yp, labels=[0, 1])

In [None]:
# The confusion matrix is ordered by label value: index 0 is label 0 and index 1
# is label 1. In this notebook label 1 is assigned to the individuals that died
# when the dataset is built, so label 0 is "Alive" and label 1 is "Dead".
X, Y =["Alive", "Dead"], ["Alive", "Dead"]

# Rows (y) are the true classes and columns (x) the predicted ones.
fig = px.imshow(
    cm,
    x=X,
    y=Y,
    text_auto=True,
    color_continuous_scale='Blues',
    labels=dict(x="Predicted", y="True"),
)

fig.show()

# 2D dataset

In [None]:
# Slope and offset of the straight line that separates the two classes.
a, b = 2, 1
nsamples = 200
# Points drawn uniformly from the square [-2, 2) x [-2, 2). The number of rows
# is taken from `nsamples` so that it stays consistent with later cells that
# use `nsamples` to size arrays matching `x`.
x = np.random.rand(nsamples,2)*4-2
# Boolean label: True for points lying strictly below the line x2 = a*x1 + b.
y = x[:,1] < a*x[:,0]+b

In [None]:
# Abscissae spanning the sampled range of the first coordinate, used to draw the line.
v = np.arange(x[:,0].min(), x[:,0].max(), 0.1)
# Samples coloured by their label, with the separating line on top.
points = plt.scatter(x[:,0], x[:,1], c=y)
plt.plot(v, a*v+b)
plt.colorbar(points)

In [None]:
# Gaussian noise (mean mu, standard deviation sigma) added in place to the
# second coordinate only; the labels `y` are left unchanged.
mu, sigma = 0,1
x[:,1] += np.random.normal(mu, sigma, nsamples)

In [None]:
# Same view as before, now with the perturbed second coordinate.
first_coord, second_coord = x[:,0], x[:,1]
v = np.arange(first_coord.min(), first_coord.max(), 0.1)
noisy_points = plt.scatter(first_coord, second_coord, c=y)
plt.plot(v, a*v+b)
plt.colorbar(noisy_points)