In [46]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [27]:
# opening data and cleaning it into usable format
def _load_dta(path):
    """Parse a whitespace-delimited data file into a list of points.

    Each non-blank line must contain at least three numeric fields.  The
    first two are stored as a length-2 float array and the third as a float,
    giving one ``[np.array([a, b]), c]`` entry per line.

    Blank lines are skipped.  Fields may be separated by any run of
    whitespace (spaces or tabs).

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if one of the first three fields is not a valid float.
    """
    points = []
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # e.g. an empty line at the end of the file
                continue
            points.append([np.array([float(fields[0]), float(fields[1])]), float(fields[2])])
    return points


in_data = _load_dta('in.dta')
out_data = _load_dta('out.dta')

In [33]:
# non linear transform
def phi(pt):
    """Apply the nonlinear feature transform to a single data point.

    ``pt`` is indexed as ``pt[0]`` (the coordinates, of which elements 0 and 1
    are used) and ``pt[1]`` (the value stored alongside them, passed through
    unchanged).

    Returns the two-element list ``[z, pt[1]]`` where ``z`` is the NumPy array
    ``(1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|)``.
    """
    coords, target = pt[0], pt[1]
    x1, x2 = coords[0], coords[1]
    z = np.array([
        1,
        x1,
        x2,
        x1**2,
        x2**2,
        x2*x1,
        np.abs(x1 - x2),
        np.abs(x1 + x2),
    ])
    return [z, target]

In [91]:
#transform pts
trans_data = [phi(sample) for sample in in_data]

# separate the transformed features (rows of X) from their targets (y)
X = np.array([z for z, _ in trans_data])
y = np.array([target for _, target in trans_data])

# standard linear regression: weights from the pseudo-inverse, w = pinv(X) y
xp = np.linalg.pinv(X)
w = xp @ y


In [92]:
# in-sample error: fraction of transformed training points whose predicted
# sign under w disagrees with the sign of the target
errors = sum(
    1
    for z, target in trans_data
    if np.sign(w.dot(z)) != np.sign(target)
)
print("E_in: " + str(errors / len(trans_data)))

E_in: 0.02857142857142857


In [93]:
# out-of-sample error: transform each test point with phi, then count the
# points whose predicted sign under w disagrees with the sign of the target
errors = sum(
    1
    for z, target in map(phi, out_data)
    if np.sign(w.dot(z)) != np.sign(target)
)

print("E_out: " + str(errors / len(out_data)))

E_out: 0.084


In [120]:
# regularized linear regression
# Closed form: w_reg = (X^T X + lam * I)^(-1) X^T y
#
# Plain ndarrays are used instead of np.matrix (which NumPy no longer
# recommends), and X / y are NOT rebound here, so re-running this cell does
# not change the types or shapes that other cells see.
X_arr = np.asarray(X)
y_vec = np.asarray(y).ravel()  # accepts y as (N,) or as a (1, N) row
I = np.identity(X_arr.shape[1])
lam = 10**(-3)
L = X_arr.T @ X_arr + lam * I
# solve the linear system instead of forming an explicit inverse
w_reg = np.linalg.solve(L, X_arr.T @ y_vec)
# keep w_reg as a (1, d) row, the shape this cell has always produced
w_reg = w_reg[np.newaxis, :]

In [121]:
# in-sample error of the current w_reg: collect the transformed training
# points whose predicted sign disagrees with the sign of the target
misclassified = [
    z for z, target in trans_data
    if np.sign(w_reg.dot(z)) != np.sign(target)
]
errors = len(misclassified)
print("E_in: " + str(errors / len(trans_data)))

E_in: 0.02857142857142857


In [122]:
# out-of-sample error of the current w_reg
errors = 0
for sample in out_data:
    z, target = phi(sample)
    predicted = np.sign(w_reg.dot(z))
    if predicted != np.sign(target):
        errors += 1

print("E_out: " + str(errors / len(out_data)))

E_out: 0.08


In [123]:
# regularized linear regression with a much larger penalty, lam = 10^3
# Closed form: w_reg = (X^T X + lam * I)^(-1) X^T y
#
# Plain ndarrays are used instead of np.matrix (which NumPy no longer
# recommends), and X / y are NOT rebound here, so re-running this cell does
# not change the types or shapes that other cells see.
X_arr = np.asarray(X)
y_vec = np.asarray(y).ravel()  # accepts y as (N,) or as a (1, N) row
I = np.identity(X_arr.shape[1])
lam = 10**(3)
L = X_arr.T @ X_arr + lam * I
# solve the linear system instead of forming an explicit inverse
w_reg = np.linalg.solve(L, X_arr.T @ y_vec)
# keep w_reg as a (1, d) row, the shape this cell has always produced
w_reg = w_reg[np.newaxis, :]

In [124]:
# in-sample error of the current w_reg
errors = 0
for z, target in trans_data:
    agrees = np.sign(w_reg.dot(z)) == np.sign(target)
    if not agrees:
        errors += 1
print("E_in: " + str(errors / len(trans_data)))

E_in: 0.37142857142857144


In [125]:
# out-of-sample error of the current w_reg: transform every test point with
# phi and count sign disagreements between prediction and target
errors = sum(
    1
    for z, target in (phi(sample) for sample in out_data)
    if np.sign(w_reg.dot(z)) != np.sign(target)
)

print("E_out: " + str(errors / len(out_data)))

E_out: 0.436


In [133]:
# regularized linear regression for various k values: get min E_out
# For each k, lam = 10^k and w_reg = (X^T X + lam * I)^(-1) X^T y.
ks = [2,1,0,-1,-2]

# Everything that does not depend on k is computed once, outside the loop.
# Plain ndarrays replace np.matrix (which NumPy no longer recommends) and
# X / y are not rebound.
X_arr = np.asarray(X)
y_vec = np.asarray(y).ravel()  # accepts y as (N,) or as a (1, N) row
gram = X_arr.T @ X_arr
moment = X_arr.T @ y_vec
I = np.identity(X_arr.shape[1])
Z_out = np.array([phi(pt)[0] for pt in out_data])  # transformed test points
y_out = np.array([pt[1] for pt in out_data])       # test targets

E_min = float('inf')
k_min = None
for k in ks:
    lam = 10**(k)
    L = gram + lam * I
    # solve the linear system instead of forming an explicit inverse;
    # w_reg keeps its (1, d) row shape
    w_reg = np.linalg.solve(L, moment)[np.newaxis, :]

    # number of test points whose predicted sign differs from the target's
    errors = int(np.count_nonzero(np.sign(Z_out @ w_reg[0]) != np.sign(y_out)))

    E = float(errors)/float(len(out_data))
    if E < E_min:  # strict '<' keeps the first k in ks on ties
        E_min = E
        k_min = k



In [134]:
# report the exponent k selected by the sweep
message = "the k with the smallest E_out is: " + str(k_min)
print(message)

the k with the smallest E_out is: -1


In [130]:
# Display E_min, the smallest E value recorded by the sweep over k.
# NOTE(review): this cell's execution count (130) is lower than that of the
# sweep cell above it (133), so the recorded output may come from an earlier
# run; re-run the notebook top to bottom to confirm.
E_min

0.056

In [145]:
# same sweep over a wider range of exponents: k = -20, ..., 19
# For each k, lam = 10^k and w_reg = (X^T X + lam * I)^(-1) X^T y.
ks = list(range(-20,20))

# Everything that does not depend on k is computed once, outside the loop.
# Plain ndarrays replace np.matrix (which NumPy no longer recommends) and
# X / y are not rebound.
X_arr = np.asarray(X)
y_vec = np.asarray(y).ravel()  # accepts y as (N,) or as a (1, N) row
gram = X_arr.T @ X_arr
moment = X_arr.T @ y_vec
I = np.identity(X_arr.shape[1])
Z_out = np.array([phi(pt)[0] for pt in out_data])  # transformed test points
y_out = np.array([pt[1] for pt in out_data])       # test targets

E_min = float('inf')
k_min = None
for k in ks:
    lam = 10**(k)
    L = gram + lam * I
    # solve the linear system instead of forming an explicit inverse;
    # w_reg keeps its (1, d) row shape
    w_reg = np.linalg.solve(L, moment)[np.newaxis, :]

    # number of test points whose predicted sign differs from the target's
    errors = int(np.count_nonzero(np.sign(Z_out @ w_reg[0]) != np.sign(y_out)))

    E = float(errors)/float(len(out_data))
    if E < E_min:  # strict '<' keeps the first k in ks on ties
        E_min = E
        k_min = k

In [146]:
# Display k_min, the exponent k (lam = 10**k) with the smallest E recorded
# by the sweep; ties go to the first such k in ks.
k_min

-1