In [1]:
from random import sample
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import numba as nb

#@nb.njit
def my_sign(num):
    if num > 0:
        return 1
    else:
        return -1

@nb.jit
def mistake(w, x):
    return np.dot(w, x[:-1])*x[-1] <= 0

def PLA(data, dimension):
    w = np.zeros(dimension) #(w1,w2), w0=0
    #print(w)
    run = True
    updates = 0
    while run:
        run = False
        for x in data:
            #if my_sign(np.dot(w, x[0:dimension])) != my_sign(x[dimension]):
            if mistake(w, x):
                w +=  x[0:dimension]*x[dimension]
                run = True
                updates += 1
    return w, updates

def gen_pic(samples, num=0, savefile=False, show=False):
    data_set = []
    for i in range(samples):
        x1, x2 = sample(list(np.linspace(-25,25,10000)), 2)
        y = 0
        if x1 > x2:
            y = 1
        else:
            y = -1
        data_set.append(np.array([x1,x2,y]))

    wf = PLA(data_set)
    
    pd_data = pd.DataFrame(data_set, columns = ["x1", "x2", "y"])
    #print(pd_data)
    
    my_dpi = 250
    plt.figure(figsize=(1000/my_dpi, 1000/my_dpi), dpi=my_dpi)
    
    g = pd_data.loc[pd_data['y'] == 1]
    plt.scatter(g["x1"], g["x2"], marker="o", color="blue")

    b = pd_data.loc[pd_data['y'] == -1]
    plt.scatter(b["x1"], b["x2"], marker="x", color="red")

    x = np.linspace(-30,30,1000)
    plt.plot(-wf[1]*x, wf[0]*x, c="purple")

    plt.xlim(-30, 30)
    plt.ylim(-30, 30)
    plt.title(f"PLA found line {wf[0]:.3f}x+{wf[1]:.3f}y=0")
    
    if savefile:
        plt.savefig(f'number {num}.png', bbox_inches='tight', dpi=my_dpi)
    if show:
        plt.show()
    plt.clf()



In [2]:
@nb.jit
def arr_swap(arr, i1, i2):
    r1, r2 = arr[i1].copy(), arr[i2].copy()
    arr[i1], arr[i2] = r2, r1
@nb.jit
def shuffle(arr, times=1000):
    for i in range(times):
        indexs = sample(range(0,len(arr)), 2)
        arr_swap(arr, indexs[0], indexs[1])

In [3]:
import csv
file_name = "hw1_15_train.txt"
df = pd.read_csv(file_name, sep=" |\t", engine="python", header=None)
w0 = [1.0 for i in range(400)]
df.insert(0, 5, w0, True)
df.columns = range(0,6)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.97681,0.10723,0.64385,0.29556,1
1,1.0,0.67194,0.2418,0.83075,0.42741,1
2,1.0,0.20619,0.23321,0.81004,0.98691,1
3,1.0,0.51583,0.055814,0.92274,0.75797,1
4,1.0,0.70893,0.10836,0.33951,0.77058,1


In [9]:
start = time.time()

w, updates = PLA(df.to_numpy(), 5)

end = time.time()
print(f"PLA time: {(end-start):.3f}s")

#w = w.astype("float32")
print(f"updates:{updates}")
print(w)

PLA time: 0.001s
updates:45
[-3.         3.0841436 -1.583081   2.391305   4.5287635]


In [11]:
update_list = []
start = time.time()

cycles = 2000
for i in range(cycles):
    df1 = df.copy().to_numpy()
    shuffle(df1)
    
    w, updates = PLA(df1, 5)
    #print(f"on cycle {i+1}")
    update_list.append(updates)

end = time.time()
print(f"{cycles} cycles took {end-start:.3f}s")
print(f"average updates: {np.average(update_list)} times")



2000 cycles took 10.408s
average updates: 39.7565 times


In [26]:
def pkt_PLA(data, dimension, update_limit):
    #data = df_data.to_numpy()
    w = np.zeros(dimension)
    pkt = w.copy()
    pkt_error = check_error(data, pkt)
    
    updates = 0
    while True:
        #for i, x in data.iterrows():
        for x in data:
            #start = time.time()
            if mistake(w,x):
                
                w +=  x[0:dimension]*x[dimension]

                new_error = check_error(data, w)
                if  new_error < pkt_error:
                    pkt = w.copy()
                    pkt_error = new_error
                    #print(f"errors: {new_error}")
                
                updates += 1
                if updates >= update_limit:
                    return pkt
            #end = time.time()
            #print(f"1 iteration took {end-start:.3f}s")
            
# @nb.jit(nopython=True)
def check_error(data, w):
    error = 0
    dimension = len(w)
    #start = time.time()

    #for i, x in data.iterrows():
    for x in data:
        if mistake(w, x):
            error += 1
    #end = time.time()
    #print(f"checking error took {end-start:.3f}s")
    return error


In [27]:
file_name = "hw1_18_train.txt"
df_new = pd.read_csv(file_name, sep=" |\t", engine="python", header=None)
w0 = np.ones(df_new.shape[0])
df_new.insert(0, 5, w0, True)
df_new.columns = range(0,6)
df_new.head()

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.94544,0.42842,0.79833,0.16244,-1
1,1.0,0.85365,0.084168,0.5682,0.49221,-1
2,1.0,0.17095,0.82127,0.98444,0.51486,-1
3,1.0,0.51412,0.92124,0.42323,0.097934,-1
4,1.0,0.28147,0.71434,0.075309,0.9116,1


In [28]:
file_name = "hw1_18_train.txt"
df_veri = pd.read_csv(file_name, sep=" |\t", engine="python", header=None)
w0 = np.ones(df_veri.shape[0])
df_veri.insert(0, 5, w0, True)
data_veri = df_veri.to_numpy()

start = time.time()

w = pkt_PLA(df_new.to_numpy(), 5, 100)

end = time.time()
print(f"pocket PLA took {end-start:.3f} s")
print(f"error rate: {check_error(data_veri, w)/len(data_veri):.3f}")

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mDirect iteration is not supported for arrays with dimension > 1. Try using indexing instead.[0m
[0m[1m[1] During: typing of intrinsic-call at <ipython-input-26-b8b3811fc43b> (35)[0m
[1m
File "<ipython-input-26-b8b3811fc43b>", line 35:[0m
[1mdef check_error(data, w):
    <source elided>
    #for i, x in data.iterrows():
[1m    for x in data:
[0m    [1m^[0m[0m


In [15]:
file_name = "hw1_18_train.txt"
df_veri = pd.read_csv(file_name, sep=" |\t", engine="python", header=None)
w0 = np.ones(df_veri.shape[0])
df_veri.insert(0, 5, w0, True)
data_veri = df_veri.to_numpy()

error_list = []
t_start = time.time()

cycles = 2000
for i in range(cycles):
    #print(f"on cycle {i+1}")
    df1 = df_new.copy().to_numpy()
    shuffle(df1)
    #start = time.time()
    w = pkt_PLA(df1, 5, 100)
    #end = time.time()
    #print(f"PLA took {end-start:.3f}s")
    error_list.append(check_error(data_veri, w))

t_end = time.time()
print(f"{cycles} cycles took {t_end-t_start:.3f}s")
print(f"average error rate: {np.average(error_list)/len(data_veri):.4f} %")

2000 cycles took 78.737s
average error rate: 0.1064 %
