# Supervised Learning

In supervised learning we are trying to learn a model that maps input X to output Y, Y=f(X).
Both X and Y can have many different characteristics.

## Linear Regression

We will generate a simple dataset, y=3.2*x+1.3+eps

In [None]:
import numpy as np
import random
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# 200 inputs drawn uniformly from [0, 20), stored as a column vector.
x = 20 * np.random.random((200, 1))
# Targets follow y = 3.2*x + 1.3 plus uniform noise in [-5, 5).
y = 3.2 * x + 1.3 + (np.random.random((200, 1)) * 10 - 5)

In [None]:
# Scatter plot of the generated (x, y) samples.
plt.scatter(x,y)
plt.show()

Our function has the form y=Wx+b, where x and y are the given data, and W and b are the parameters of the function that we want to find.

Let's try to estimate the parameters:
Let w=3 and b=1.5 for our first model and w=3.5 and b=0.9 for another model

In [None]:
# Predictions of the two hand-picked candidate lines:
# model 1 uses w=3, b=1.5 and model 2 uses w=3.5, b=0.9.
prediction1=3*x+1.5
prediction2=3.5*x+0.9

In [None]:
# Overlay both candidate lines on the data: model 1 in red, model 2 in green.
plt.scatter(x,y)
plt.plot(x,prediction1,'r-')
plt.plot(x,prediction2,'g-')
plt.show()

Which model is better? How can we estimate the "goodness" of a model?

MSE=Mean Square Error

In [None]:
# Mean squared error of each candidate line against the observed targets.
# print() is called with one pre-formatted string so the output is the same
# on Python 2 and the cell is also valid Python 3.
mse1 = np.mean(np.square(prediction1 - y))
print("MSE1 value= %s" % (mse1,))
mse2 = np.mean(np.square(prediction2 - y))
print("MSE2 value= %s" % (mse2,))

In [None]:
# Closed-form least-squares fit of y = w*x + b.
# A is the design matrix [x, 1]; the column of ones lets the solver fit the bias b.
A = np.concatenate([x, np.ones_like(x)], axis=1)
# lstsq returns (solution, residuals, rank, singular values); only the solution is used.
w, b = np.linalg.lstsq(A, y)[0]
print("%s %s" % (w, b))

In [None]:
# Shape of the design matrix: one row per sample, columns are [x, 1].
A.shape

In [None]:
# Predictions of the line found by the least-squares fit.
prediction3=w*x+b

In [None]:
# Mean squared error of the least-squares line, for comparison with the hand-picked models.
MSE_MIN = np.mean(np.square(prediction3 - y))
print("MSE_MIN value= %s" % (MSE_MIN,))

## Cats Vs. Dogs

### Data Preparation

In [None]:
from skimage import io, transform,color,img_as_ubyte
import glob
import pickle
from sklearn import linear_model
import os


# When True, cached pickle files are reused instead of re-processing the images.
LOAD_DATA = True

# Every image is reduced to (rows, cols, channels) = IMG_SHAPE;
# IMG_SIZE is the length of the flattened image vector.
IMG_SHAPE = (50, 50, 3)
IMG_SIZE = int(np.prod(IMG_SHAPE))

### Examine Data

In [None]:
# List the training images and inspect a random sample of 10 of them.
# random.sample raises ValueError when fewer than 10 files are found.
files = glob.glob('../data/cats_Vs_dogs/train/*jpg')
sub = random.sample(files, 10)

for i in sub:
    img = io.imread(i)
    # Each value is wrapped in a 1-tuple so that tuples such as img.shape
    # are formatted as a single value.
    print("img type %s" % (img.dtype,))
    print("file name %s" % (i,))
    print("image shape %s" % (img.shape,))

In [None]:
# Display the first sampled image at its original size.
io.imshow(io.imread(sub[0]))

In [None]:
# Display the second sampled image at its original size.
io.imshow(io.imread(sub[1]))

In [None]:
def resize_and_crop(img):
    """Scale an image so it covers IMG_SHAPE, then crop it to IMG_SHAPE.

    The image is scaled uniformly so that both of its first two dimensions
    are at least as large as the target; the crop keeps the top-left corner
    and the first IMG_SHAPE[2] channels.

    Args:
        img: 3-D image array whose first two axes are the spatial axes and
            whose last axis holds the channels.

    Returns:
        (crop, flat): the cropped image and the same pixels flattened to a
        1-D vector, both converted with img_as_ubyte.

    Raises:
        ValueError: if `img` is not 3-D, or if the crop does not contain
            exactly IMG_SHAPE[0]*IMG_SHAPE[1]*IMG_SHAPE[2] values.
    """
    target_rows, target_cols, target_channels = IMG_SHAPE
    rows, cols, _ = img.shape
    scale_ratio = max(float(target_rows) / rows, float(target_cols) / cols)
    # Resize to an explicit 2-D output shape instead of calling
    # transform.rescale(img, scale): with a (rows, cols) output shape the
    # channel axis is left untouched.
    # NOTE(review): newer scikit-image releases appear to rescale the channel
    # axis as well unless told which axis holds the channels - confirm
    # against the installed version.
    output_shape = (int(np.round(scale_ratio * rows)),
                    int(np.round(scale_ratio * cols)))
    resized_img = transform.resize(img, output_shape)
    crop = resized_img[:target_rows, :target_cols, :target_channels]
    flat = np.reshape(crop, [target_rows * target_cols * target_channels])
    return img_as_ubyte(crop), img_as_ubyte(flat)

In [None]:
# Prepare training set (data & labels).
# A cached pickle is reused when LOAD_DATA is set; otherwise every path in
# `files` is resized and flattened into one row of `data`, labelled, and cached.
if LOAD_DATA and os.path.exists('../data/cats_Vs_dogs/training.pkl'):
    print("Found pickle file, loading data")
    # Pickle data is binary, so the file is opened in 'rb' mode; `with`
    # closes it even if loading fails.
    with open('../data/cats_Vs_dogs/training.pkl', 'rb') as fin:
        training_set = pickle.load(fin)
    data = training_set['data']
    labels = training_set['labels']
else:
    data = np.zeros((len(files), IMG_SIZE), dtype=np.uint8)
    labels = np.zeros(len(files), dtype=np.uint8)

    for idx, f in enumerate(files):
        if idx % 1000 == 0:
            print("idx= %d" % idx)
        crop, flat = resize_and_crop(io.imread(f))
        data[idx, :] = flat
        # Paths containing 'dog.' get label 1; all other entries keep label 0.
        if 'dog.' in f:
            labels[idx] = 1
    print("finished")
    training_set = {'data': data, 'labels': labels}
    # Both arrays are uint8, so the element count equals the size in bytes.
    s = (data.size + labels.size) / (1024.) ** 3
    print("dataset size=%.2fG" % s)
    print("dumping dataset")
    with open('../data/cats_Vs_dogs/training.pkl', 'wb') as fo:
        pickle.dump(training_set, fo)

Let's look at our transformed images

In [None]:
# Pick one random row of `data` and reshape the flat vector back to a 50x50x3 image for display.
io.imshow(np.reshape(random.choice(data),[50,50,3]))

In [None]:
# Prepare test set (data & labels).
# NOTE: this rebinds `files`, `data` and `labels` to the test split.
files = glob.glob('../data/cats_Vs_dogs/test/*jpg')
if LOAD_DATA and os.path.exists('../data/cats_Vs_dogs/test.pkl'):
    print("Found pickle file, loading data")
    # Pickle data is binary, so the file is opened in 'rb' mode; `with`
    # closes it even if loading fails.
    with open('../data/cats_Vs_dogs/test.pkl', 'rb') as fin:
        test_set = pickle.load(fin)
    data = test_set['data']
    labels = test_set['labels']
else:
    data = np.zeros((len(files), IMG_SIZE), dtype=np.uint8)
    labels = np.zeros(len(files), dtype=np.uint8)

    for idx, f in enumerate(files):
        if idx % 1000 == 0:
            print("idx= %d" % idx)
        crop, flat = resize_and_crop(io.imread(f))
        data[idx, :] = flat
        # Paths containing 'dog.' get label 1; all other entries keep label 0.
        if 'dog.' in f:
            labels[idx] = 1
    print("finished")
    test_set = {'data': data, 'labels': labels}
    print("dumping dataset")
    with open('../data/cats_Vs_dogs/test.pkl', 'wb') as fo:
        pickle.dump(test_set, fo)

# Logistic Regression (The Easy Way)

In [None]:
# Train a logistic regression classifier for C = 10**-2, 10**-1 and 10**0.
# To save time we train only on the first 5000 training samples.
# After the loop `logreg` holds the last model (C = 1).
import time
for i in range(-2, 1):
    start = time.time()
    logreg = linear_model.LogisticRegression(C=10**i, tol=1e-5, verbose=0)
    logreg.fit(training_set['data'][:5000], training_set['labels'][:5000])
    print("i=%d, accuracy=%.4f" % (i, logreg.score(test_set['data'], test_set['labels'])))
    # Elapsed wall-clock seconds for fitting and scoring this model.
    print(time.time() - start)
print("finished")

In [None]:
# Split the test predictions into misclassified and correctly classified
# indices, then show the first misclassified image with the predicted label.
p = logreg.predict(test_set['data'])
accurate = p == test_set['labels']
wrong, = np.where(accurate == False)
correct, = np.where(accurate == True)
# Assumes the order of `files` matches the row order of test_set.
io.imshow(io.imread(files[wrong[0]]))
print("Model Prediction: %s" % (p[wrong[0]],))

In [None]:
# Show the first correctly classified test image with the predicted label.
io.imshow(io.imread(files[correct[0]]))
print("Model Prediction: %s" % (p[correct[0]],))

## Logistic Regression (The Hard Way)

Let's go back to basics and look at linear regression again

In [None]:
data = training_set['data']

# Let's look at a single image
img = random.choice(data)
io.imshow(np.reshape(img, [50, 50, 3]))
# Values are wrapped in 1-tuples so that shape tuples are formatted as one value.
print("img shape: %s" % (img.shape,))

# We want our output to be a class score for each of the two classes, meaning shape (2,)
# So if we have Wx=y we need W to be in shape (2,7500) (we neglect the bias in this example)
w = np.random.random((2, 7500))
print("w shape: %s" % (w.shape,))

# Multiply Wx: row 0 of w produces the cat score, row 1 the dog score
cat_score, dog_score = w.dot(img)
print("cat score= %s" % (cat_score,))
print("dog score= %s" % (dog_score,))



In [None]:
# Score another random training image with the same random weights.
img = random.choice(data)
io.imshow(np.reshape(img, [50, 50, 3]))
cat_score, dog_score = w.dot(img)
print("cat score= %s" % (cat_score,))
print("dog score= %s" % (dog_score,))

## How unhappy are we with our model? (Loss)

We need to define a loss function that will tell us how bad our model is.<br>
The total loss is the average loss of all instances, where a specific loss is a function of the model's prediction and the actual label:<br>
$$L=\frac{1}{N} \sum_i L_i(f(x_i,W),y_i)$$

The Sigmoid function

$$f(x)=\frac{1}{1+e^{-x}}$$

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of a scalar or, element-wise, of a NumPy array."""
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator

In [None]:
# Not vectorized sigmoid: the function is called once per element.
# A list is built explicitly because map() returns a lazy iterator on
# Python 3, which plt.plot cannot draw.
x = np.arange(-10, 10, 0.05)
y = [sigmoid(value) for value in x]
plt.plot(x, y)
plt.show()

In [None]:
#vectorized sigmoid: a single call evaluates the function on the whole array
y=sigmoid(x)
plt.plot(x,y)
plt.show()

# Softmax

$$P(Y=k|X=x_i)=\frac{e^{s_k}}{\sum_j{e^{s_j}}}$$

In [None]:
def softmax(x):
    """Return e^x normalised by the sum of e^x over all elements of x.

    This is the direct form of the formula; the scores are not shifted
    before exponentiation.
    """
    exponentials = np.exp(x)
    return exponentials / np.sum(exponentials)

In [None]:
# Softmax of a small example score vector; the outputs sum to 1.
v = [0.5, 0.9, 2.1, -0.2]
print(softmax(v))

In [None]:
# 200 values drawn uniformly from [-3, 3), sorted in place and plotted against their rank.
x=np.random.rand(200)*6-3
x.sort()
plt.plot(range(200),x)
plt.show()

In [None]:
# Softmax of the sorted values, plotted against the same rank axis.
plt.plot(range(200),softmax(x))
plt.show()

In [None]:
# Let's check our model: raw class scores for a random image and their softmax.
img = random.choice(data)
io.imshow(np.reshape(img, [50, 50, 3]))
cat_score, dog_score = w.dot(img)
s_cat, s_dog = softmax([cat_score, dog_score])
print("cat score= %s" % (cat_score,))
print("dog score= %s" % (dog_score,))
print("cat softmax score= %s" % (s_cat,))
print("dog softmax score= %s" % (s_dog,))

In [None]:
def softmax(x):
    """Numerically stable softmax along axis 0.

    For a 1-D input the result is a probability vector. For a 2-D input of
    shape (classes, samples) every column is normalised independently.

    Args:
        x: array-like of scores (list or NumPy array, 1-D or higher).

    Returns:
        NumPy float array with the same shape as `x`, summing to 1 along axis 0.
    """
    scores = np.asarray(x, dtype=float)
    # Subtract the max along axis 0 into a new array: softmax is unchanged by
    # the shift, exp() cannot overflow, and the caller's array is not modified.
    # Using the per-column max (rather than the global max) also keeps columns
    # whose scores are far below the global max from underflowing to 0/0.
    shifted = scores - np.max(scores, axis=0, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=0)

Softmax represents the probability of the image belonging to each class.<br>
To turn it into a loss, we take the negative log of the softmax probability of the actual class:<br>
$$L_i=-\log\bigg(\frac{e^{s_{y_i}}}{\sum_j{e^{s_j}}}\bigg)$$


## Regularization

To support better generalization we try to enforce simplicity on the model.<br>
There are a few ways to do this; one of the most common is to force the weights of the model to remain small, which we achieve by adding a loss term on the weights:

$$L=\frac{1}{N} \sum_i L_i(f(x_i,W),y_i)+\lambda R(W)$$

In [None]:
# There are many regularization functions & methods:
# For a 2-D weight matrix np.linalg.norm(w, ord=2) is the spectral norm
# (largest singular value). The default (Frobenius) norm is the square root
# of the sum of squared weights, i.e. the L2 norm of all entries.
print("L2 norm %s" % (np.linalg.norm(w),))
# L1 norm: sum of the absolute values of all weights.
print("L1 norm %s" % (np.sum(np.abs(w)),))

## Optimization

In [None]:
# Reload both cached datasets. Pickle data is binary, so the files are opened
# in 'rb' mode; `with` closes them even if loading fails.
with open('../data/cats_Vs_dogs/training.pkl', 'rb') as fin:
    training_set = pickle.load(fin)
train_data = training_set['data']
train_labels = training_set['labels']
with open('../data/cats_Vs_dogs/test.pkl', 'rb') as fin:
    test_set = pickle.load(fin)
test_data = test_set['data']
test_labels = test_set['labels']

# Append a constant-1 column to every sample so the bias is learned as one
# more weight. np.ones is float64, so the concatenated arrays are float64 too.
train_data = np.concatenate([train_data, np.ones([train_data.shape[0], 1])], axis=1)
test_data = np.concatenate([test_data, np.ones([test_data.shape[0], 1])], axis=1)
print(train_data.shape)

### Random search

In [None]:
# Random search: draw 1000 independent random weight matrices and keep the
# one with the lowest softmax cross-entropy loss on the training set.
# train_data has one row per image (pixels plus the constant-1 bias column);
# train_labels is indexed as one integer class label per row.
# The sizes are read from the data instead of being hard-coded.
n_samples, n_features = train_data.shape
sample_idx = np.arange(n_samples)

bestloss = float("inf")  # start at +infinity so the first finite loss is accepted
for num in range(1000):
    w = np.random.randn(2, n_features) * 0.000001  # generate random parameters
    prediction = w.dot(train_data.T)  # class scores, shape (2, n_samples)
    probs = softmax(prediction)
    # Probability the model assigns to the true class of each sample.
    class_prob = probs.T[sample_idx, train_labels]
    total_loss = np.sum(-np.log(class_prob))  # loss summed over the entire training set
    if total_loss < bestloss:  # keep track of the best solution
        bestloss = total_loss
        bestW = w
    if num % 100 == 0:
        print('in attempt %d the loss was %f, best %f' % (num, total_loss, bestloss))
print('in attempt %d the loss was %f, best %f' % (num, total_loss, bestloss))

In [None]:
# Let's check our model: predict the class with the highest score for every
# test image and count how many predictions match the labels.
output = bestW.dot(test_data.T)
prediction = np.argmax(output.T, axis=1)
correct = np.sum((prediction == test_labels))
print("classified correctly %d images out of %d, accuracy= %.4f" % (correct, test_labels.shape[0], float(correct) / test_labels.shape[0]))

### Local Random Search

In [None]:
# Local random search: start from random weights and repeatedly try a small
# random perturbation, moving to it only when it lowers the training loss.
n_samples, n_features = train_data.shape
sample_idx = np.arange(n_samples)

w = np.random.randn(2, n_features) * 0.000001
bestloss = float("inf")
step_size = 0.000001
for num in range(1000):
    Wtry = w + np.random.randn(2, n_features) * step_size
    prediction = Wtry.dot(train_data.T)  # class scores, shape (2, n_samples)
    probs = softmax(prediction)
    # Probability the model assigns to the true class of each sample.
    class_prob = probs.T[sample_idx, train_labels]
    total_loss = np.sum(-np.log(class_prob))  # loss summed over the entire training set
    if total_loss < bestloss:  # keep track of the best solution
        bestloss = total_loss
        # Move to the improved candidate. Storing the old `w` here would leave
        # bestW at the starting point and the search would never advance.
        w = Wtry
        bestW = w
    if num % 100 == 0:
        print('in attempt %d the loss was %f, best %f' % (num, total_loss, bestloss))
print('in attempt %d the loss was %f, best %f' % (num, total_loss, bestloss))

In [None]:
# Let's check our new model: predict the class with the highest score for
# every test image and count how many predictions match the labels.
output = bestW.dot(test_data.T)
prediction = np.argmax(output.T, axis=1)
correct = np.sum((prediction == test_labels))
print("classified correctly %d images out of %d, accuracy= %.4f" % (correct, test_labels.shape[0], float(correct) / test_labels.shape[0]))