### Using regularized logistic regression to classify email

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from sklearn import linear_model

import scipy.io

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
# No modifications in this cell
# complete the functions in utils.py; then run the cell

# Read the train/test split of the spam data.
(Xtrain, Xtest, ytrain, ytest) = utils.load_spam_data()

# Build the three feature representations of the training set using the
# std / log / bin helpers from utils. std_features also returns mu and sigma,
# which are reused for the test set below.
Xtrain_std, mu, sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# Apply the matching transforms to the test set. The std variant is computed
# from the mu and sigma returned for the training set, not recomputed on Xtest.
Xtest_std = (Xtest - mu) / sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,typea,penalty):
    """Pick lambda by cross-validation, fit logistic regression, report test accuracy.

    Parameters
    ----------
    X : training features, passed to the cross-validation helper and to ``fit``.
    ytrain : training labels.
    Xt : test features, passed to ``predict``.
    ytest : test labels, compared element-wise against the predictions.
    typea : str
        Label of the feature representation; used only in the accuracy message.
    penalty : str
        Regularization type handed to ``LogisticRegression``. ``"l2"`` is fit
        with the ``lbfgs`` solver; any other value is fit with ``liblinear``.

    Returns
    -------
    None. The selected lambda, the fitted intercept and coefficients, and the
    test-set accuracy are printed.
    """
    # The numeric arguments (0.1, 5.1, 0.5) are forwarded unchanged to the
    # helper; presumably the low/high/step of the lambda grid -- confirm
    # against utils.select_lambda_crossval.
    best_lambda = utils.select_lambda_crossval(X, ytrain, 0.1, 5.1, 0.5, penalty)
    print("best_lambda = %.3f" % best_lambda)

    # Train a classifier with the selected lambda. scikit-learn takes the
    # inverse regularization strength, hence C = 1 / lambda.
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(
        penalty=penalty,
        C=1.0 / best_lambda,
        solver=solver,
        fit_intercept=True,
    )
    lreg.fit(X, ytrain)

    # Report intercept and coefficients under their own labels. Each array is
    # wrapped in a 1-tuple so `%` always formats it as a single value.
    print("Intercept = %s" % (lreg.intercept_,))
    print("Coefficients = %s" % (lreg.coef_,))

    predy = lreg.predict(Xt)
    print("Accuracy on set aside test set for %s = %.4f" % (typea, np.mean(predy == ytest)))

# Every (label, train features, test features) triple is evaluated once per
# penalty, in the order listed here.
feature_sets = (
    ("std", Xtrain_std, Xtest_std),
    ("logt", Xtrain_logt, Xtest_logt),
    ("bin", Xtrain_bin, Xtest_bin),
)

# L2 experiments run first, then L1; each group is preceded by its banner.
penalty_runs = (
    ("L2 Penalty experiments -----------", "l2"),
    ("L1 Penalty experiments -----------", "l1"),
)

for banner, penalty_kind in penalty_runs:
    print(banner)
    for set_label, train_features, test_features in feature_sets:
        run_dataset(train_features, ytrain, test_features, ytest, set_label, penalty_kind)

L2 Penalty experiments -----------
best_lambda = 0.100
Coefficients = [-4.86264099] [[-2.75863912e-02 -2.25520829e-01  1.21633316e-01  2.29715639e+00
   2.70389525e-01  2.32758703e-01  9.28696739e-01  2.95108646e-01
   1.62231043e-01  6.87327938e-02 -8.34295167e-02 -1.60453362e-01
  -4.73831093e-02  1.08099581e-02  1.89106435e-01  8.19579715e-01
   5.09743414e-01  3.96564290e-02  2.67580695e-01  3.46287865e-01
   2.60096924e-01  3.66028669e-01  7.25798414e-01  1.96647549e-01
  -3.15432669e+00 -4.03241347e-01 -1.25431026e+01 -6.03475492e-02
  -1.55900780e+00 -5.54192358e-02 -3.22608889e-02  4.09403864e-01
  -3.68453591e-01 -1.44128010e+00 -5.91171090e-01  4.43379921e-01
   4.24214409e-02 -1.56891563e-01 -4.54976317e-01 -1.02134658e-01
  -3.54371315e+00 -1.72832251e+00 -4.38062910e-01 -1.06014026e+00
  -9.18429801e-01 -1.75378492e+00 -1.67358763e-01 -9.57894856e-01
  -3.66419635e-01 -1.36363345e-01 -6.58938785e-02  2.06744541e-01
   1.70779761e+00  1.21449393e+00 -3.36700002e-01  1.56505