In [None]:
# Initialize OK
# Builds the OK client from the 'lab10.ok' file; the resulting `ok` object is
# what the ok.grade(...) cells and the final ok.submit() cell call into.
from client.api.notebook import Notebook
ok = Notebook('lab10.ok')

# Lab 10: Logistic Regression

**Collaboration Policy**

Data science is a collaborative activity. While you may talk with others about
the homework, we ask that you **write your solutions individually**. If you do
discuss the assignments with others please **include their names** at the top
of your solution.

## Due Date

This assignment is due at 11:59pm Monday, April 27th.

In this lab you will practice logistic regression.

# Collaborators  

Write names in this cell:

In [1]:
# Run this cell to set up your notebook
import numpy as np
import pandas as pd
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf


# Display matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn default styling, then the "talk" plotting context.
sns.set()
sns.set_context("talk")
# Initialise plotly's notebook mode before the py.iplot call later in the lab.
py.init_notebook_mode(connected=False)
# NOTE(review): cufflinks is configured with offline=False while plotly is used
# through plotly.offline, and `cf` is not used in the visible cells — confirm
# this setting is intentional.
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

In this lab we will be working on the breast cancer dataset. This dataset can be easily loaded using the `sklearn.datasets.load_breast_cancer()` method.  
The data format is not a `pandas.DataFrame` so we will create a new DataFrame from it.

In [2]:
data = sklearn.datasets.load_breast_cancer()
# `data` is a dictionary-like object: list its keys, then print the
# dataset description stored under DESCR.
print(data.keys())
print(data.DESCR)

In [3]:
# Wrap the raw feature matrix in a DataFrame, using the dataset's feature
# names as column labels, and preview the first rows.
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()

Let us try to fit a simple model with only one feature.

In [4]:
# Define our features/target
# X keeps a single column ("mean radius") but stays two-dimensional because of
# the double brackets.
X = df[["mean radius"]]
# In data.target, 0 is malignant and 1 is benign, so Y is a boolean array that
# is True for malignant samples (the "positive" class in this lab).
Y = (data.target == 0)

In [5]:
# Split between train and test
# test_size=0.25 holds out a quarter of the rows for testing; the fixed
# random_state makes the split reproducible across runs.
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size=0.25, random_state=42)

# Report how many rows ended up in each split.
print(f"Training Data Size: {len(x_train)}")
print(f"Test Data Size: {len(x_test)}")

### Question 1

Let's first fit a logistic regression model using the training set. 

For this problem, we will go one level of abstraction higher and simply use the existing LogisticRegression implementation in sklearn.

Fill in the code below so that you compute the training and testing accuracy, defined as:

$$
\large
\text{Training Accuracy} = \frac{1}{n_{train\_set}} \sum_{i \in \text{train\_set}} {\mathbb{1}_{y_i == \hat{y_i}}}
$$

$$
\large
\text{Testing Accuracy} = \frac{1}{n_{test\_set}} \sum_{i \in \text{test\_set}} {\mathbb{1}_{y_i == \hat{y_i}}}
$$

where $\hat y_i $ is the prediction of our model, $ y_i $ the true value, and $\mathbb{1}_{y_i == \hat{y_i}}$ an indicator function. This means that $\mathbb{1}_{y_i == \hat{y_i}} = 1 $ if $ y_i = \hat{y_i}$, and $ \mathbb{1}_{y_i == \hat{y_i}} = 0 $ if $ y_i \neq \hat{y_i}$.

<!--
BEGIN QUESTION
name: q1
-->

In [6]:
# Logistic regression with an intercept term, fit on the training split only.
lr = sklearn.linear_model.LogisticRegression(fit_intercept=True)

lr.fit(x_train,y_train) 
# TODO(student): replace each `...` with the fraction of rows in that split
# whose prediction equals the true label (see the accuracy formulas above).
# While the placeholders remain, the f-strings below raise TypeError because
# Ellipsis does not support the `.4f` format spec.
train_accuracy = ...
test_accuracy = ...

print(f"Train accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

In [None]:
# Ask the OK client to grade question q1; the trailing `;` hides the cell's return value.
ok.grade("q1");

### Question 2
It seems we can get a very high test accuracy. Then how about precision and recall?  
- Precision (also called positive predictive value) is the fraction of true positives among the number of data points predicted as positive.  
- Recall (also known as sensitivity) is the fraction of true positives among the total number of data points with positive labels.

Precision is the ability of the classifier not to label as positive a sample that is negative while recall is the ability of the classifier to find all the positive samples.

To understand the link between recall/precision on the one hand and sensitivity/specificity on the other hand, it's useful to come back to a confusion matrix:

In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the fitted model on the test split. Rows index the true
# label and columns the predicted label, matching the axis labels drawn by
# plot_confusion_matrix below.
cnf_matrix = confusion_matrix(y_test, lr.predict(x_test))

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heatmap on the
    current matplotlib figure.

    Args:
        cm: 2-D array holding the confusion matrix (rows are drawn along the
            'True label' axis, columns along the 'Predicted label' axis).
        classes: Tick labels, one per class, used on both axes.
        normalize: When True, each row is divided by its row sum before
            printing and plotting, and cells are shown with two decimals.
        title: Title placed above the heatmap.
        cmap: Matplotlib colormap used for the heatmap.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        # Divide every row by its total so each row sums to one.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than the midpoint get white text so the value stays legible.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_color = "white" if cm[row, col] > midpoint else "black"
            # imshow puts columns on the x axis and rows on the y axis.
            plt.text(col, row, format(cm[row, col], cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Tick labels for the two classes. Y is boolean, so the matrix rows/columns
# are presumably ordered False then True — confirm against confusion_matrix.
class_names = ['False', 'True']
# Plot non-normalized confusion matrix
# (each call gets its own figure so the two heatmaps do not overlap)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

Then:
$$
\text{Precision} = \frac{n_{true\_positives}}{n_{true\_positives} + n_{false\_positives}}
$$

$$
\text{Recall} = \frac{n_{true\_positives}}{n_{true\_positives} + n_{false\_negatives}}
$$

As illustrated in the figure below:
![precision_recall](precision_recall.png)

Now let's compute the precision and recall for the test set using the model we got from question 1.  
Please do not use the `sklearn.metrics` for this computation.

<!--
BEGIN QUESTION
name: q2
-->

In [11]:
# Predicted labels for the held-out test rows, from the model fit in question 1.
y_pred = lr.predict(x_test) 

# TODO(student): compute both metrics from y_test and y_pred without using
# sklearn.metrics, as the question requires. While the `...` placeholders
# remain, the f-strings below raise TypeError because Ellipsis does not
# support the `.4f` format spec.
precision = ...
recall = ...

print(f'precision = {precision:.4f}')
print(f'recall = {recall:.4f}')

In [None]:
# Ask the OK client to grade question q2; the trailing `;` hides the cell's return value.
ok.grade("q2");

Our precision is fairly high while our recall is a bit lower. Why might we observe these results? Please consider the following plots, which display the distribution of the target variable in the training and testing sets. 

In [14]:
# Side-by-side bar charts of the boolean target in the train and test splits,
# to compare how many positive (True) and negative (False) samples each holds.
fig, axes = plt.subplots(1, 2)
# Pass the target as `x=` explicitly: since seaborn 0.12 the only positional
# parameter of countplot is `data`, so a positional array is no longer
# interpreted as the variable to count.
sns.countplot(x=y_train, ax=axes[0]);
sns.countplot(x=y_test, ax=axes[1]);

axes[0].set_title('Train')
axes[1].set_title('Test')
plt.tight_layout();

*Write your answer here, replacing this text.*

###  Question 3
Now let's try to analyze the cross entropy loss from logistic regression. Recall that loss would be:
$$L(\theta) = -\dfrac{1}{n} \sum_{i=1}^{n} \left( y_i \log \left(\sigma(\phi(x_i)^T\theta)\right) + (1-y_i) \log \left(1 - \sigma(\phi(x_i)^T\theta)\right) \right) $$

where $\sigma(t) = \frac{1}{1 + \exp(-t)}$ and $\phi(x_i)$ is the feature vector corresponding to the data point $x_i$.

From lecture 23, we saw that this simplifies down to:

$$L(\theta) = -\frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i)^T \theta + \log \left(\sigma\left(-\phi(x_i)^T \theta\right) \right) \right) $$

In [15]:
# Pack the fitted parameters as [coefficient of "mean radius", intercept].
theta = np.array([lr.coef_[0][0],
                  lr.intercept_[0]])
# Append a column of ones to X so that Phi @ theta equals
# coefficient * x + intercept; the column order matches theta above.
# Note that Phi is built from all of X, not only the training split.
Phi = np.hstack([X,
                 np.ones([len(X), 1])])
print(theta)
print()
print(Phi)

<!--
BEGIN QUESTION
name: q3
-->

In [16]:
def lr_loss(theta, Phi, Y):
    r'''
    Compute the cross entropy loss using Phi, Y and theta.

    Hint: the notation B @ v means "compute the matrix multiplication Bv".

    Args:
        theta: The model parameters.
        Phi: The transformed input data \phi(X)
        Y: The label

    Return:
        The cross entropy loss.
    '''
    # The docstring is a raw string so that `\phi` is not parsed as an
    # (invalid) escape sequence.
    # TODO(student): replace the `...` placeholder with the loss computation.
    loss = ...
    return loss

In [None]:
# Ask the OK client to grade question q3; the trailing `;` hides the cell's return value.
ok.grade("q3");

In [19]:
# Evaluate the loss on a grid of parameter values: u sweeps theta[0] (the
# "mean radius" coefficient) and v sweeps theta[1] (the intercept).
uvalues = np.linspace(-8,8,70)
vvalues = np.linspace(-5,5,70)
(u,v) = np.meshgrid(uvalues, vvalues)
# One (theta[0], theta[1]) pair per column; iterate over the transpose to
# visit each pair in the same order as the flattened grid.
thetas = np.vstack((u.flatten(),v.flatten()))
lr_loss_values = np.array([lr_loss(t, Phi, Y) for t in thetas.T])
# meshgrid's default 'xy' indexing returns arrays of shape
# (len(vvalues), len(uvalues)), so reshape to u.shape; the opposite order
# would silently transpose the surface as soon as the two grids differ in size.
lr_loss_surface = go.Surface(name="Logistic Regression Loss",
        x=u, y=v, z=np.reshape(lr_loss_values, u.shape),
        contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
    )

py.iplot(go.Figure(data=[lr_loss_surface]))

What remarks can you make on this plot?

*Write your answer here, replacing this text.*

# Submit
Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output.
**Please save before submitting!**

In [None]:
# Save your notebook first, then run this cell to submit.
# Uses the `ok` client created in the first cell of the notebook.
ok.submit()