The methods 
We will be using squared error as our cost function.

$$
V(s) \leftarrow  V(s) + \alpha \left( G_s - V(s) \right)
$$

kNN is simple.  Once a metric is chosen, you examine all points in the training set to determine which k are closest to the test point.  Then choose the label most prevalent among those k.

In [3]:
import numpy as np
from sortedcontainers import SortedList



#Defining the class to have fit and predict functions like scikit-learn
class KNN(object):
    """k-nearest-neighbours classifier with fit / predict / score methods.

    Training points are ranked by squared Euclidean distance to the query and
    the predicted label is the one carried by most of the k closest points.
    """

    def __init__(self,k):
        """Remember ``k``, the number of neighbours that vote on each prediction."""
        self.k=k

    def fit(self,x,y):
        """Store the training points ``x`` and their labels ``y``.

        No computation happens here; the arrays are only kept so that
        ``predict`` can scan them.
        """
        self.x=x
        self.y=y

    def predict(self,Xtest):
        """Return the majority label among the k nearest training points.

        Parameters
        ----------
        Xtest : iterable of 1-D arrays
            Query points, one per row.

        Returns
        -------
        numpy.ndarray
            Float array of length ``len(Xtest)`` holding the predicted labels.
            Ties between labels go to the label whose neighbour is closest.
        """
        # Function-scope import so the block does not rely on a third-party
        # sorted container: SortedList(load=...) is rejected by
        # sortedcontainers 2.x, which dropped the ``load`` keyword.
        from bisect import insort

        y=np.zeros(len(Xtest))
        for i,x in enumerate(Xtest):
            # Ascending list of at most k (squared distance, label) pairs.
            nearest=[]
            for j,xt in enumerate(self.x):
                diff=x-xt
                d=diff.dot(diff) # squared Euclidean distance
                if len(nearest)<self.k:
                    insort(nearest,(d,self.y[j]))
                elif d<nearest[-1][0]:
                    # Strictly closer than the current k-th neighbour: replace it.
                    del nearest[-1]
                    insort(nearest,(d,self.y[j]))

            # Tally the labels once, in order of increasing distance.
            votes={}
            for _,v in nearest:
                votes[v]=votes.get(v,0)+1

            # max() returns the first key with the highest count, i.e. the
            # label first seen at the smallest distance, matching the strict
            # ">" comparison of a manual arg-max loop.
            if votes:
                y[i]=max(votes,key=votes.get)
        return(y)

    def score(self,x,y):
        """Return the fraction of rows in ``x`` whose prediction equals ``y``."""
        p=self.predict(x)

        return( np.mean(p==y))
            
            
            
    

In [1]:
# Location of the MNIST CSV files. NOTE(review): this is an absolute path on
# one specific machine; adjust it before running elsewhere.
data_folder = "C:/Users/craig_arl/Documents/GitHub/machine_learning_examples/mnist_csv/"

# Full paths of the feature and label files for the train and test splits.
xtrn, xtest, ytrn, ytest = (
    data_folder + file_name
    for file_name in ("Xtrain.txt", "Xtest.txt", "label_train.txt", "label_test.txt")
)


import pandas as pd
def get_data(f_name_X,f_name_Y,limit=None):
    """Read a feature CSV and a label CSV, shuffle the rows and scale the features.

    Parameters
    ----------
    f_name_X : str
        Path of the CSV file holding the features.
    f_name_Y : str
        Path of the CSV file holding the labels; its first column is used as
        the label.
    limit : int or None
        When given, only the first ``limit`` rows (after shuffling) are returned.

    Returns
    -------
    X : numpy.ndarray
        Every column after the first, divided by 255.
    Y : numpy.ndarray
        The first column of the joined table.
    """
    print("Reading in and transforming data...")
    # NOTE(review): read_csv takes the first line of each file as column
    # names; if the files have no header row that sample is lost - confirm.
    df_x = pd.read_csv(f_name_X)
    df_y = pd.read_csv(f_name_Y)
    df=df_y.join(df_x)

    # DataFrame.as_matrix() was removed in pandas 1.0. Copying .values gives a
    # writable array that is safe to shuffle in place.
    data = np.array(df.values)
    np.random.shuffle(data)
    X = data[:, 1:] / 255.0 # data is from 0..255
    Y = data[:, 0]
    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y

#copied:
def get_xor():
    """Generate 200 random 2-D points arranged in an XOR pattern.

    Each block of 50 points is drawn uniformly from one quadrant of the unit
    square. The two diagonal quadrants (upper-right, lower-left) are class 0
    and the two off-diagonal quadrants are class 1.

    Returns
    -------
    X : numpy.ndarray of shape (200, 2)
    Y : numpy.ndarray of shape (200,), 100 zeros followed by 100 ones
    """
    per_quadrant = 50
    # Lower-left corner of each quadrant, in the order the blocks are filled.
    corners = [(0.5, 0.5), (0.0, 0.0), (0.0, 0.5), (0.5, 0.0)]

    X = np.zeros((per_quadrant * len(corners), 2))
    for q, corner in enumerate(corners):
        start = q * per_quadrant
        half_square = np.random.random((per_quadrant, 2)) / 2
        X[start:start + per_quadrant] = half_square + np.array([corner])

    Y = np.repeat([0, 1], 2 * per_quadrant)
    return X, Y

def get_donut():
    """Generate 200 random 2-D points lying on two concentric noisy rings.

    The first 100 points (class 0) sit around radius 5 and the last 100
    (class 1) around radius 10. Each radius is the ring radius plus standard
    normal noise, and each angle is uniform on [0, 2*pi).

    Returns
    -------
    X : numpy.ndarray of shape (200, 2)
    Y : numpy.ndarray of shape (200,), 100 zeros followed by 100 ones
    """
    points_per_ring = 200 // 2

    def _ring(radius):
        # Radii are drawn before angles, and the inner ring before the outer
        # one, so the random stream is consumed in a fixed order.
        r = np.random.randn(points_per_ring) + radius
        angle = 2*np.pi*np.random.random(points_per_ring)
        return np.column_stack((r * np.cos(angle), r * np.sin(angle)))

    inner = _ring(5)
    outer = _ring(10)

    X = np.vstack((inner, outer))
    Y = np.repeat([0, 1], points_per_ring)
    return X, Y

In [4]:
# Load both splits. get_data shuffles the rows and no seed is set in this
# cell, so row order can differ between runs.
x_train, y_train = get_data(f_name_X=xtrn, f_name_Y=ytrn)
x_test, y_test = get_data(f_name_X=xtest, f_name_Y=ytest)

Reading in and transforming data...
Reading in and transforming data...


In [37]:
# 5-nearest-neighbour classifier, scored on the same data it was fitted on.
knn = KNN(k=5)
knn.fit(x=x_train, y=y_train)
knn.score(x=x_train, y=y_train)

Naive Bayes

In [5]:
import numpy as np
from scipy.stats import norm 
from scipy.stats import multivariate_normal as mvn

class NaiveBayes(object):
    """Gaussian naive Bayes classifier.

    Each class is modelled by a Gaussian with a per-feature variance (a
    diagonal covariance), which encodes the assumption that features are
    conditionally independent given the class.
    """

    def fit(self, x,y,smoothing=10e-3):
        """Estimate the per-class mean, variance and prior.

        Parameters
        ----------
        x : numpy.ndarray
            Training samples, one per row.
        y : numpy.ndarray
            Labels; they are cast to int to enumerate the classes.
        smoothing : float
            Constant added to every variance so that no feature ends up with
            zero variance (the default 10e-3 equals 0.01).
        """
        self.gaussians=dict()
        self.priors=dict()
        labels=set(y.astype(int))

        for c in labels:
            current_x=x[y==c]
            self.gaussians[c]={'mean':current_x.mean(axis=0),
                              'var': current_x.var(axis=0)+smoothing} # axis=0: statistics per feature
            self.priors[c]=float(len(y[y==c]))/len(y)

    def score(self,x,y):
        """Return the fraction of rows in ``x`` whose prediction equals ``y``."""
        p=self.predict(x)
        return( np.mean(p==y))

    def predict(self,x):
        """Return the most probable class label for every row of ``x``.

        Labels are mapped to columns of the score matrix by their sorted
        position, so they do not have to be the integers 0..K-1. The returned
        values are the labels themselves, not column indices.
        """
        N,_=x.shape
        classes=sorted(self.gaussians)
        P=np.zeros((N,len(classes))) # log posterior (up to a constant) per sample and class
        for col,c in enumerate(classes):
            g=self.gaussians[c]
            # A 1-D ``cov`` is interpreted by scipy as the diagonal of the covariance.
            P[:,col]=mvn.logpdf(x,mean=g['mean'],cov=g['var'])+np.log(self.priors[c])
        return(np.asarray(classes)[np.argmax(P,axis=1)])
    
    

In [6]:
# Read both splits again; every get_data call reshuffles the rows it returns.
x_train, y_train = get_data(f_name_X=xtrn, f_name_Y=ytrn)
x_test, y_test = get_data(f_name_X=xtest, f_name_Y=ytest)


Reading in and transforming data...
Reading in and transforming data...


In [30]:
# Fit Gaussian naive Bayes on the training split and score it.
model = NaiveBayes()
model.fit(x=x_train, y=y_train)
# NOTE(review): a cell displays only its last expression, so this test-set
# accuracy is computed and then discarded; only the training accuracy shows.
model.score(x=x_test, y=y_test)
model.score(x=x_train, y=y_train)

0.71254250850170031

A generic (non-naive) Bayesian classifier removes the conditional independence assumption.  In the code, we replace the per-feature variance with the full covariance matrix in `fit`.

In [36]:
import numpy as np
from scipy.stats import norm 
from scipy.stats import multivariate_normal as mvn

class Bayes(object):
    """Gaussian Bayes classifier with a full covariance matrix per class.

    Unlike naive Bayes, the features are not assumed to be conditionally
    independent: every class gets its own mean vector and full covariance.
    """

    def fit(self, x,y,smoothing=10e-3):
        """Estimate the per-class mean, covariance and prior.

        Parameters
        ----------
        x : numpy.ndarray
            Training samples, shape (N, D).
        y : numpy.ndarray
            Labels; they are cast to int to enumerate the classes.
        smoothing : float
            Multiple of the identity added to each covariance matrix (the
            default 10e-3 equals 0.01).
        """
        N,D=x.shape
        print(N,D)
        self.gaussians=dict()
        self.priors=dict()
        labels=set(y.astype(int))

        for c in labels:
            current_x=x[y==c]
            # np.cov expects variables in rows, hence the transpose.
            self.gaussians[c]={'mean':current_x.mean(axis=0),
                              'cov': np.cov(current_x.T)+np.eye(D)*smoothing}

            self.priors[c]=float(len(y[y==c]))/len(y)

    def score(self,x,y):
        """Return the fraction of rows in ``x`` whose prediction equals ``y``."""
        p=self.predict(x)
        return( np.mean(p==y))

    def predict(self,x):
        """Return the most probable class label for every row of ``x``.

        Labels are mapped to columns of the score matrix by their sorted
        position, so they do not have to be the integers 0..K-1. The returned
        values are the labels themselves, not column indices.
        """
        N,_=x.shape
        classes=sorted(self.gaussians)
        P=np.zeros((N,len(classes))) # log posterior (up to a constant) per sample and class
        for col,c in enumerate(classes):
            g=self.gaussians[c]
            P[:,col]=mvn.logpdf(x,mean=g['mean'],cov=g['cov'])+np.log(self.priors[c])
        return(np.asarray(classes)[np.argmax(P,axis=1)])
    
    

In [37]:
# Fit the full-covariance Bayes classifier and report test-set accuracy.
modelb = Bayes()
modelb.fit(x=x_train, y=y_train)
modelb.score(x=x_test, y=y_test)
#modelb.score(x_train,y_train)

4999 20


0.71342685370741488

In [102]:
# Synthetic 3-class problem: 1000 samples from each of three 3-D Gaussians.
# Classes 0 and 1 share one covariance matrix; class 2 uses a second one.
mn1=np.array([-1,2,3])
mn2=np.array([1,-1,3.5])
mn3=np.array([0,2,-1])
# Plain ndarrays rather than np.matrix, which NumPy no longer recommends.
cov=np.array([[1,-.2,.35],[-.2,1.4,1],[.35,1,1.3]])
cov2=np.array([[1.9,.2,.35],[.2,3.4,1],[.35,1,2.3]])
# Single random draw. The original referenced an undefined name `mn`;
# mn1 is assumed to be the intended mean.
rv=mvn.rvs(mn1,cov)
X1=np.random.multivariate_normal(mn1,cov,1000)
X2=np.random.multivariate_normal(mn2,cov,1000)
X3=np.random.multivariate_normal(mn3,cov2,1000)
y1=np.zeros(1000)
y2=np.ones(1000)
y3=np.ones(1000)*2

X=np.concatenate((X1,X2,X3),axis=0)
assert X.shape==(3000,3)
Y=np.concatenate((y1,y2,y3),axis=0)


In [103]:
# Naive Bayes on the synthetic Gaussians, scored on the data it was fitted on.
model = NaiveBayes()
model.fit(x=X, y=Y)
model.score(x=X, y=Y)

0.93166666666666664

In [104]:
# Full-covariance Bayes on the same synthetic data, again scored on its own
# training set.
model = Bayes()
model.fit(x=X, y=Y)
model.score(x=X, y=Y)

3000 3


0.98133333333333328