In [1]:
# Initialize Otter
# `grader` is the handle used by the grader.check(...) cells that follow each section.
import otter
grader = otter.Notebook("D6.ipynb")

# COGS 118A: Discussion Lab 6

### Instructions

You are responsible for making sure you pass all public tests on Gradescope. 


Note: D6 does not contain any quiz questions. When you submit this lab, you **must** submit your jupyter notebook only. 

**If you do not do this, you will not receive credit for the assignment.**

In this discussion lab, you will learn more about optimization methods and multi-class classification.

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import plot_confusion_matrix, f1_score, confusion_matrix

import matplotlib.pyplot as plt

## Optimization

You should think about machine learning algorithms and machine learning optimizers as two separate concepts.

A machine learning algorithm, such as lasso logistic regression, defines a hypothesis space over possible models (here a hypothesis is a particular parameterization). A machine learning optimizer is a program which tries to find the best model in the hypothesis space.

Here you will try your hand at selecting optimizers. For different problems, different optimizers might yield different results, might converge to a solution faster, or might not converge at all.

## Background:

Imagine you are a researcher investigating how genetic, environmental, and experimental conditions affect protein expression. Specifically, you want to see which protein expression levels are predictive of genomic factors (Down syndrome vs. non-Down syndrome), environmental factors (shock conditioning vs. no shock conditioning), and drug conditions (saline injection vs. drug injection).

You have collected a dataset to answer this question, and your goal now is to fit a model which can identify subsets of proteins that discriminate between classes (unique combinations of genomic, environmental, and drug-condition categories).

[UCI repository](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression)

## Problem:

Your problem is to find the best lasso logistic regression model (this is logistic regression with an l1 regularization penalty) for this dataset. Here best is defined using f1-micro. To do this, you will use grid search over optimizers. This is a good opportunity to read through the [sklearn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). (Hint: not all solvers are compatible with `l1` regularization)

NOTE: you might run into a convergence error meaning that the optimizer never reached a stable solution. This is bad but don't panic. Maybe not all optimizers are well suited to the problem :) and _maybe_ one is.

Your plot should look something like the following:

![output](./index.png)

In [3]:
# Mice Protein Expression data, read straight from the UCI repository.
df = pd.read_excel('https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls')

# The label, the three factors it is built from, and the mouse identifier are not features;
# every remaining column is kept as a predictor.
non_feature_columns = ["class", "Genotype", "Behavior", "Treatment", "MouseID"]
is_feature = ~df.columns.isin(non_feature_columns)
X = df.loc[:, is_feature]
y = df["class"]

# Hold out a third of the rows for testing, keeping the class proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [4]:
# for debugging purposes!
# don't use this for the final submission, but this can help you iterate faster
#
# Draw 10% of the rows from every (class, MouseID) group so that each mouse is still
# represented. A fixed random_state keeps the debugging subset the same between runs.
subsample_df = df.groupby(by=["class", "MouseID"]).apply(lambda gdf: gdf.sample(frac=0.1, random_state=0))

# Build the debugging features/labels from the subsample itself. Slicing the full `df`
# here would silently hand back the complete dataset and give no speed-up.
sample_y = subsample_df.loc[:, "class"]
sample_X = subsample_df.loc[:, ~subsample_df.columns.isin(["class", "Genotype", "Behavior", "Treatment", "MouseID"])]
#sample_X = sample_X.iloc[:, :10]

In [None]:
def plot_results(gridsearchcv):
    """Plot the mean cross-validated score of every grid point with error bars.

    Parameters
    ----------
    gridsearchcv : fitted grid-search object
        Must expose ``cv_results_`` with the keys ``"params"``,
        ``"mean_test_score"`` and ``"std_test_score"``.

    Notes
    -----
    Each x tick is one grid point, labelled with its parameter values joined by
    ``|``. The error bar is the across-fold standard deviation divided by the
    square root of the number of folds.
    """
    results = gridsearchcv.cv_results_
    labels = ['|'.join(str(v) for v in param.values()) for param in results["params"]]

    # `cv` is only a fold count when it was passed as an int; it can also be None or a
    # splitter object, and np.sqrt() fails on those. A fitted search records the fold
    # count in `n_splits_`, so prefer that and fall back to `cv`.
    n_folds = getattr(gridsearchcv, "n_splits_", gridsearchcv.cv)
    std_err = results["std_test_score"] / np.sqrt(n_folds)

    plt.errorbar(labels, results["mean_test_score"], std_err, fmt='.k')
    plt.ylabel("f1")
    plt.xlabel("params")
    
def check_for_convergence(gridsearchcv):
    """Report whether the best refit model stopped before hitting its iteration cap.

    Parameters
    ----------
    gridsearchcv : fitted grid-search object
        Its ``best_estimator_`` must be a pipeline whose last step exposes
        ``n_iter_`` and ``max_iter``.

    Returns
    -------
    bool
        True only if every reported iteration count is strictly below ``max_iter``.
    """
    final_step = gridsearchcv.best_estimator_.steps[-1][1]
    # n_iter_ is array-like and may hold several counts; comparing it directly yields an
    # array whose truth value is ambiguous when it has more than one entry. Reduce it to
    # a single plain bool instead.
    return bool(np.all(np.asarray(final_step.n_iter_) < final_step.max_iter))


# Fix NumPy's global seed so repeated runs of this cell are comparable.
np.random.seed(31415) 

# Preprocessing steps shared by every candidate model.
# NOTE(review): the scaler runs before the imputer, so it sees the missing values;
# StandardScaler is documented to ignore NaNs when fitting - confirm for your sklearn version.
scaler = StandardScaler()
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# TODO: replace the placeholder with a lasso (l1-penalised) LogisticRegression.
logistic = ...


# Step names matter: grid-search parameters address a step as "<step name>__<param>".
pipe = Pipeline(steps=[("scaler", scaler), ("imputer", imp), ("logistic", logistic)])

# you don't need to modify the max_iter param. If you do, keep it under 1000
param_grid = {
     # hint, prefix your param names with logistic__ to pass it to the logistic step
     # TODO: replace the placeholders below with the optimizers (solvers) to search over
     ...
    ...
}



# TODO: build the grid search over `pipe` and `param_grid`
# use f1_micro for scoring
# use 7 folds
gscv = ...


# Run the search on the training split only; the test split is used in the next section.
gscv.fit(X_train, y_train)

plot_results(gscv)

In [None]:
# Run the Otter checks registered under "Optimization" for the cell above.
grader.check("Optimization")

## Multi-class Metrics

You may have noticed that the above problem is a multi-class problem. There are 8 different classes being predicted.

To validate your solution from the above section, you will write your own multi-class precision function. You might have noticed that precision is defined using false positives and true positives -- both of which are binary concepts. In order to use precision (and likewise recall and f1) in multi-class problems, you need to measure these using a one-vs-rest strategy.

In [None]:
# Confusion matrix of the best grid-search pipeline on the held-out test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use ConfusionMatrixDisplay.from_estimator(estimator, X, y) instead
# (the import at the top of the notebook has to change with it).
plot_confusion_matrix(gscv.best_estimator_, X_test, y_test)
# Tilt the class labels so they do not overlap; the trailing `;` hides the text repr.
plt.xticks(rotation = -45);

In [None]:
def multi_class_precision_macro(y_true, y_pred):
    """Compute macro-averaged precision for a multi-class problem.

    How does it work?

    First, figure out the unique labels in your prediction problem
    Second, compute precision for each class in a one-vs-rest manner
    Third, take the (unweighted) average of all these precision scores

    This is inappropriate for imbalanced class settings. In those cases you would want to use a weighted average.

    Parameters
    ----------
    y_true : true class labels, one per sample
    y_pred : predicted class labels, aligned element-wise with ``y_true``

    Returns
    -------
    The unweighted mean of the per-class precision scores. (The body below is
    still a placeholder, so nothing is returned until it is implemented.)
    """
    ...
    
# Evaluate the tuned model from the Optimization section on the held-out test split.
multi_class_precision_macro(y_test, gscv.predict(X_test))

In [None]:
# Run the Otter checks registered under "Multi-class Metrics" for the cell above.
grader.check("Multi-class Metrics")

## Multi-class Classifiers

In the first section, we used an inherently multi-class classifier, logistic regression. Logistic regression accomplishes this using a generalization of the sigmoid function called the softmax function (or the Gibbs or Boltzmann distribution if you are from physics :) ). Several other classification algorithms can be inherently multi-class, such as naive Bayes, neural networks, and decision trees. However, some algorithms cannot, namely those that use linear decision boundaries, such as SVM algorithms.

For classifiers that are not inherently multi-class, you can still make multi-class predictions with a one-vs-rest strategy. This strategy involves training one classifier per class.

Here you will train a linear classifier using a one-vs-rest strategy.

When we plot the decision boundaries, your plot should look something like this:

![decision_boundaries](./index2.png)

In [None]:
# train an SGD Classifier using a one vs rest strategy
#
# To do this, create a pipeline like in the first question.
# You will probably want to rescale your data AND impute missing values like in the above problem.

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA
from sklearn import datasets
np.random.seed(0)

# NOTE: from here on X, y and the train/test splits refer to the iris data;
# they replace the variables of the same name from the first section.
iris = datasets.load_iris()
X = iris.data[:, :]  # keep all features; PCA reduces them to 2D later in the pipeline
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Initialize a StandardScaler object:
scaler = ...


# Initialize a SimpleImputer that treats np.nan as the missing value
# and fills it in with the strategy of 'mean'
imp = ...


# initialize a PCA step for dimensionality reduction
# In your pipeline add this step immediately before your SGDClassifier.
# This compresses our data to 2D for visualization purposes
# Don't worry if you aren't familiar with PCA...we're using it purely for visualization purposes
pca_step = ('pca', PCA(n_components=2))

# Initialize a stochastic gradient descent classifier with
# the class_weight set as "balanced"
classifier = ...


# Put everything into a pipeline
# Pipeline(steps=[...,pca_step,...])
pipelines = ...

# initialize the one-vs-rest classifier over the pipeline
onevrest_classifier = ...


onevrest_classifier.fit(X_train, y_train)
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer versions
# provide ConfusionMatrixDisplay.from_estimator(estimator, X, y) instead.
plot_confusion_matrix(onevrest_classifier, X_test, y_test)
plt.xticks(rotation = -45);

In [None]:
# Run the Otter checks registered under "Multi-class Classifiers" for the cell above.
grader.check("Multi-class Classifiers")

## Visualize Your Multi-class Classifier

In [None]:
def plot_hyperplane(clf, min_x, max_x, linestyle):
    """Draw the decision line of a fitted linear classifier on the current axes.

    Only the first row of ``clf.coef_`` and the first entry of ``clf.intercept_``
    are used, and only the first two coefficients of that row. The line
    ``w0*x + w1*y + b = 0`` is drawn as ``y = -(w0/w1)*x - b/w1``.

    Parameters
    ----------
    clf : fitted estimator exposing ``coef_`` and ``intercept_``
    min_x, max_x : smallest and largest x value of the data being plotted
    linestyle : matplotlib format string for the line, e.g. ``'b--'``
    """
    weights = clf.coef_[0]
    bias = clf.intercept_[0]
    slope = -weights[0] / weights[1]

    # Extend the x-range by 5 on both sides so the line runs past the data.
    xs = np.linspace(min_x - 5, max_x + 5)
    ys = slope * xs - bias / weights[1]
    plt.plot(xs, ys, linestyle)
    


# Project the test points into the 2D space the hyperplanes below live in.
# Use the preprocessing steps that were already fitted inside the one-vs-rest classifier.
# The section-1 `pipe` has no PCA step, and refitting on the test set would give a
# different projection than the one the classifiers were trained on.
# Assumes each one-vs-rest estimator is the pipeline built above (preprocessing steps
# followed by the SGD classifier); the preprocessing does not use the labels, so the
# first estimator's steps stand in for all three.
fitted_preprocessing = onevrest_classifier.estimators_[0][:-1]
_X_reduced = fitted_preprocessing.transform(X_test)

pc1 = _X_reduced[:, 0]
pc2 = _X_reduced[:, 1]

# One decision line per class, drawn in that class's colour.
for class_pipeline, line_style in zip(onevrest_classifier.estimators_, ('b--', 'r--', 'g--')):
    plot_hyperplane(class_pipeline[-1], pc1.min(), pc1.max(), line_style)


label_map = {
    0: "blue",
    1: "red",
    2: "green"
}

# Fill colour = true class, edge colour = predicted class, so misclassified
# points show up as markers whose edge and fill differ.
plt.scatter(pc1, pc2,
            s=100,
            c=[label_map[label] for label in y_test],
            linewidths=3,
            edgecolors=[label_map[label] for label in onevrest_classifier.predict(X_test)])
# Limit the y-axis to the range of the second component (the plotted y values);
# the hyperplane lines extend far beyond the data otherwise.
plt.ylim(pc2.min(), pc2.max());

# The End of D6
Be sure to click `save` before you submit this Discussion Lab to Gradescope.