# Homework Lecture5

LDA and Logistic Classification and Feature Development for MNIST Image sample

## Preliminaries

### Imports

In [53]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split, KFold

import sys
sys.path.append("../..")
from E4525_ML import mnist
from E4525_ML.multiclass_logistic import LogisticGDClassifier
%matplotlib inline

### Random Seed

In [71]:
seed=458
np.random.seed(seed)

### Data Directories

In [55]:
data_dir=r"../../raw/mnist/"

<div class="alert alert-block alert-info"> Problem 0 </div>
Make sure to **update** the file `mnist.py` on the `E4525_ML` directory (new version posted on Canvas).

You will need the **updated** version of that file to complete the last section of this notebook.

## Read Data

<div class="alert alert-block alert-info"> Problem 1.0 </div>
Read MNIST data set and labels,  also read the MNMIST test data set and test labels

In [56]:
    images_filename=data_dir+"train-images-idx3-ubyte.gz"
    labels_filename=data_dir+"train-labels-idx1-ubyte.gz"

    test_images_filename=data_dir+"t10k-images-idx3-ubyte.gz"
    test_labels_filename=data_dir+"t10k-labels-idx1-ubyte.gz"

    images=mnist.read_images(images_filename)
    labels=mnist.read_labels(labels_filename)
    
    images_test=mnist.read_images(test_images_filename)
    labels_test=mnist.read_labels(test_labels_filename)
    
    print(images.shape,labels.shape)

(60000, 28, 28) (60000,)


<div class="alert alert-block alert-info"> Problem 1.2 </div>
Use `skelearn`'s `train_test_split` function to separate the MNIST samples into  a 15% validation set and a  training sample.


In [57]:
images_train,images_val,labels_train,labels_val=train_test_split(images,labels,test_size=0.15)
print(images_train.shape,images_val.shape,labels_train.shape,labels_val.shape)

(51000, 28, 28) (9000, 28, 28) (51000,) (9000,)


## LDA

<div class="alert alert-block alert-info"> Problem 2.1 </div>
fit an LDA model on the training data set using `sklearns` `LinearDiscriminantAnalysis` classifier 

In [59]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
X_train=images_train.reshape((images_train.shape[0],-1))
X_train.shape

(51000, 784)

In [60]:
sk_model=LDA()
sk_model.fit(X_train,labels_train)



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

<div class="alert alert-block alert-info"> Problem 2.2 </div>
Compute model accuracy on the training set

In [62]:
np.mean(sk_model.predict(X_train)==labels_train)

0.87211764705882355

<div class="alert alert-block alert-info"> Problem 2.3 </div>
Compute accuracy of the model on the validation set

In [63]:
X_val=images_val.reshape((images_val.shape[0],-1))
np.mean(sk_model.predict(X_val)==labels_val)

0.86477777777777776

## Logistic Regression

<div class="alert alert-block alert-info"> Problem 3.1 </div>

Use the `LogisticGDClassifier` class from `E4525_ML.multiclass_logistic` module to fit a logistic model

In [64]:
from E4525_ML.multiclass_logistic import LogisticGDClassifier as Logis
log_model=Logis()
log_model.fit(X_train,labels_train)

	 0 Loss = 2371.93402487 Train_Accuracy 0.07 
	 10 Loss = 266.953052084 Train_Accuracy 0.92 
	 20 Loss = 259.981781374 Train_Accuracy 0.924 
	 30 Loss = 254.43186044 Train_Accuracy 0.926 
	 40 Loss = 242.487337851 Train_Accuracy 0.924 
	 50 Loss = 239.245801525 Train_Accuracy 0.935 
	 60 Loss = 256.877972816 Train_Accuracy 0.924 
	 70 Loss = 284.373788324 Train_Accuracy 0.922 
	 80 Loss = 217.9984568 Train_Accuracy 0.944 
	 90 Loss = 242.064030514 Train_Accuracy 0.946 
	 99 Loss = 218.246159941 Train_Accuracy 0.936 


<div class="alert alert-block alert-info"> Problem 3.2 </div>
Compute model accuracy in the training data set

In [65]:
np.mean(log_model.predict(X_train)==labels_train)

0.93447058823529416

<div class="alert alert-block alert-info"> Problem 3.3 </div>
Compute model accuracy in the valuation data set

In [66]:
np.mean(log_model.predict(X_val)==labels_val)

0.9157777777777778

## Feature Engineering in one Dimension

In [67]:
N=50
N_val=1000

In [68]:
def f(x):
    return 10*(1-4*(np.abs(np.abs(x)-1)))

In [69]:
def generate_sample(N):
    X=np.random.uniform(-2,2,N)
    eta=f(X)
    eta.shape
    theta=1/(1+np.exp(-eta))
    Y= np.random.uniform(0,1,N)>theta
    return X,Y

In [72]:
X, Y = generate_sample(N)
X

array([ 1.22386687,  1.94009723, -1.468868  , -1.01121348,  0.90641874,
       -0.32683951, -0.28904029,  0.90133992,  1.25263169,  0.19143293,
        1.76843092,  1.17623794, -0.81947811,  0.10707658, -0.58869573,
       -1.16302342,  1.98926495, -0.71309513,  1.77739197, -0.27188633,
        1.06081216, -1.50050664,  1.18702852,  1.41192108,  1.84465139,
       -1.93555988,  1.4984523 , -0.48976963, -1.39272073,  0.20397415,
       -0.93569064, -0.77215659,  0.99958249,  0.98471474, -0.23990945,
       -1.66209593, -0.07223514,  0.8410368 , -0.91840206,  1.31399206,
       -0.58626123, -0.55390412, -0.673419  ,  1.60400415, -0.24098103,
       -0.4793224 , -0.10693736, -0.43410016, -0.29290766,  0.69823331])

<div class="alert alert-block alert-info"> Problem 4.0 </div>
Generate 
1. a training sample of variables $X$ and $Y$ with $N$ data samples
2. a valuation set with   $N_{val}$ samples
3. a test set with $N_{val}$ samples

In [73]:
x_train, Y_train = generate_sample(N)
x_val, Y_val = generate_sample(N_val)
x_test, Y_test = generate_sample(N_val)

<div class="alert alert-block alert-info"> Problem 4.1 </div>
What is the proportion of positive class ($Y=1$) samples on the training data?

In [74]:
np.mean(Y_train==1)

0.80000000000000004

<div class="alert alert-block alert-info"> Problem 4.2 </div>
Write a function able to generate the feature matrix
$$
    H_{i,d}= h_d(x_i)
$$
for $i=1,\dots N$ and $d=1,\dots D$

where the functions $h_d(x)$ are defined as 
$$
    h_d(x) = x^d
$$

[HINT] be careful to include $h_D$ in the range of functions

In [75]:
def featurematrix(data, D):
    K=data.shape[0]
    feature = np.empty((K,D))
    for i in range(K):
        for d in range(0, D):
            feature[i][d] = pow(data[i], d+1)
    
    return feature
    

<div class="alert alert-block alert-info"> Problem 4.3 </div>
1. Train  a logistic regression model (use sklearn `LogisticRegression` class) over the training data you already generated. 
2. Use the valuation set  to select the best value of $D$ using accuracy as selection criteria.
3. Plot accuracy on the  training and valuation sets as a function of $D$.

[HINT]
1. You only need to consider the range $D=1,\dots 10$.
2. Remember to disable regularization by setting the parameter $C$ of the `LogisticRegression` class to a very large number.



In [76]:
logistic_model=LogisticRegression(C=1e30, multi_class="multinomial",solver="sag")
for D in range(1,11):
    train_feature = featurematrix(x_train, D)
    logistic_model.fit(train_feature,Y_train)
    val_feature = featurematrix(x_val, D)
    val_pred=logistic_model.predict(val_feature)
    print("D: ", D, "Accuracy: ", np.mean(val_pred==Y_val))

# We can see from the result that when D = 4, the accuracy is the highest

D:  1 Accuracy:  0.736
D:  2 Accuracy:  0.736
D:  3 Accuracy:  0.736
D:  4 Accuracy:  0.855
D:  5 Accuracy:  0.736
D:  6 Accuracy:  0.736
D:  7 Accuracy:  0.736
D:  8 Accuracy:  0.736
D:  9 Accuracy:  0.736
D:  10 Accuracy:  0.736




<div class="alert alert-block alert-info"> Problem 4.4 </div>
Use the test set  to measure the accuracy for the optimal classifier you have found
(do not use data from the  valuation set to train the classifier)

In [77]:
logistic_model=LogisticRegression(C=1e30)
train_feature = featurematrix(x_train, 4)
logistic_model.fit(train_feature,Y_train)
test_feature = featurematrix(x_test, 4)
test_pred=logistic_model.predict(test_feature)
np.mean(test_pred==Y_test)

0.95599999999999996

## Feature Engineering for MNIST sample

<div class="alert alert-block alert-info"> Problem 5.1 </div>
In this problem we will use `mnist.ImageFeatureModel` class to find the optimal number of orientations $\theta$  of the oriented gradients
features for the MNIST data set.

1. use `mnist.ImageFeatureModel` to generate image oriented gradient features.
2. use  `LogisticGDClassifier` as the base model
3. set the block size to 4 ) this is to reduce memory use)
4. select the best number of orientations by performing  5-Fold cross-validation on the full MNIST data set.
5. Consider only [1,2,4,8] as possible values for the orientation
6. Plot number of orientations vs validation accuracy

[HINT] 
1. the `valiation_model` function below will be useful to perform cross-validation
2. If you run into memory trouble (your computer crashes), reduce the size of the data set.
Make sure to  indicate this clearly on your solution.
3. This problem is computationally expensive, make sure to allocate time to resolve it.

In [79]:
def validate_model(model,K,X,Y):
    folder=KFold(K,shuffle=True)
    folds=folder.split(X,Y)
    val_error=0.0
    fold_count=0
    for fold in folds:
        train_idx,val_idx=fold
        x_train=X[train_idx]
        y_train=Y[train_idx]
        x_val=X[val_idx]
        y_val=Y[val_idx]     
        model.fit(x_train,y_train)
        y_pred=model.predict(x_val)
        val_err=np.mean(y_val==y_pred)
        val_error+=val_err
        fold_count+=1
        print(fold_count,val_err)
    return val_error/K
      

In [80]:
block_size = 4
orientation = np.array([1, 2, 4, 8])
accuracy = np.zeros(4)
for i in range(len(orientation)):
    model = mnist.ImageFeatureModel(
        LogisticGDClassifier(), size=block_size, orientations=orientation[i])
    accuracy[i] = validate_model(model, 5, images, labels)

	 0 Loss = 2388.6717466 Train_Accuracy 0.123 
	 10 Loss = 101.567132172 Train_Accuracy 0.974 
	 20 Loss = 108.200190576 Train_Accuracy 0.969 
	 30 Loss = 107.034848477 Train_Accuracy 0.962 
	 40 Loss = 99.6739601263 Train_Accuracy 0.963 
	 50 Loss = 80.7404456935 Train_Accuracy 0.967 
	 60 Loss = 86.0237234208 Train_Accuracy 0.972 
	 70 Loss = 60.891146112 Train_Accuracy 0.977 
	 80 Loss = 73.0153536287 Train_Accuracy 0.974 
	 90 Loss = 93.5922380985 Train_Accuracy 0.97 
	 99 Loss = 73.9992439612 Train_Accuracy 0.978 
1 0.968166666667
	 0 Loss = 2473.78984343 Train_Accuracy 0.119 
	 10 Loss = 118.479214471 Train_Accuracy 0.967 
	 20 Loss = 120.981832936 Train_Accuracy 0.963 
	 30 Loss = 89.7642579433 Train_Accuracy 0.974 
	 40 Loss = 150.644621158 Train_Accuracy 0.959 
	 50 Loss = 86.8415686286 Train_Accuracy 0.975 
	 60 Loss = 59.3527496484 Train_Accuracy 0.985 
	 70 Loss = 58.4929466785 Train_Accuracy 0.982 
	 80 Loss = 62.0274692707 Train_Accuracy 0.975 
	 90 Loss = 73.6858430843 Tr

	 90 Loss = 90.3925540946 Train_Accuracy 0.968 
	 99 Loss = 72.7022778505 Train_Accuracy 0.977 
2 0.966166666667
	 0 Loss = 2388.16993443 Train_Accuracy 0.085 
	 10 Loss = 119.305221127 Train_Accuracy 0.966 
	 20 Loss = 107.770562063 Train_Accuracy 0.97 
	 30 Loss = 106.802842737 Train_Accuracy 0.969 
	 40 Loss = 88.4442387743 Train_Accuracy 0.975 
	 50 Loss = 79.7961908414 Train_Accuracy 0.979 
	 60 Loss = 81.6976464758 Train_Accuracy 0.977 
	 70 Loss = 120.848059898 Train_Accuracy 0.971 
	 80 Loss = 104.317817184 Train_Accuracy 0.967 
	 90 Loss = 93.9221202845 Train_Accuracy 0.97 
	 99 Loss = 93.5227499975 Train_Accuracy 0.972 
3 0.968
	 0 Loss = 2444.99264353 Train_Accuracy 0.083 
	 10 Loss = 146.111151133 Train_Accuracy 0.955 
	 20 Loss = 113.105192161 Train_Accuracy 0.96 
	 30 Loss = 86.3600833794 Train_Accuracy 0.974 
	 40 Loss = 143.773745788 Train_Accuracy 0.957 
	 50 Loss = 100.319152081 Train_Accuracy 0.97 
	 60 Loss = 112.132092715 Train_Accuracy 0.961 
	 70 Loss = 85.959811

In [None]:
accuracy

In [None]:
plt.plot(orientation,accuracy)
plt.title("Number of orientations vs Validation accuracy")
plt.xlabel("Orientation")
plt.ylabel("Accuracy")
plt.legend()

<div class="alert alert-block alert-info"> Problem 5.2 </div>

Fit the model with the optimal number of orientations to the full MNIST data set and estimate its accuracy on the MNIST test set


In [None]:
model= mnist.ImageFeatureModel(LogisticGDClassifier(),size=block_size,orientations=8)

In [None]:
model.fit(images,labels,images_test,labels_test)
Y_predicted=model.predict(images_test)
print("accuracy",np.mean(Y_predicted==labels_test))