In [1]:
import simpleml
import timeit
import time
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

### 1. linear regression

In [3]:
# simpleml toolkit: fit the regression model and time the training call.
model = simpleml.Regression()
model.load_data('test_data/x.csv', 'test_data/y_bios.csv')

# Only train() is timed; the data loading above is excluded.
# perf_counter_ns() is monotonic and uses the highest-resolution clock
# available, which matters for a sub-millisecond interval. time_ns() is
# wall-clock time: it can be coarse and can jump if the system clock changes.
time_start = time.perf_counter_ns()
model.train()
time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")

# model.test()
weight = model.weight

Duration: 3.0643e-05 seconds
Loading data success!
Training success!


In [128]:
# Regression inputs and targets as NumPy arrays; the CSV files have no header row.
X = pd.read_csv('test_data/x.csv', header=None).to_numpy()
# Targets are flattened to a 1-D vector.
y = pd.read_csv('test_data/y_bios.csv', header=None).to_numpy().flatten()

In [134]:
# pure NumPy: closed-form least squares via the normal equations
# Prepend a column of ones so the first weight acts as the intercept.
phi_train_x = np.concatenate((np.ones((len(X), 1)), X), axis=1)

# Time only the solve. perf_counter_ns() is the monotonic, high-resolution
# clock intended for measuring short intervals (time_ns() is wall-clock time).
time_start = time.perf_counter_ns()
# w = (phi^T phi)^-1 phi^T y
weight_m1 = np.dot(np.dot(np.linalg.inv(np.dot(phi_train_x.T, phi_train_x)), phi_train_x.T), y)
time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")

Duration: 0.00022519 seconds


In [138]:
# sklearn: ordinary least squares with a fitted intercept
model_lr = LinearRegression(fit_intercept=True)

# Time only fit(). perf_counter_ns() is monotonic and high-resolution, unlike
# the wall-clock time_ns(), so short durations are measured reliably.
time_start = time.perf_counter_ns()
model_lr.fit(X, y)
time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")

# model_lr.predict(X)
print("weight: ", model_lr.intercept_, model_lr.coef_)

Duration: 0.001341748 seconds
weight:  -10.013136241488397 [3.9928595  6.96655997]


### 2. logistic regression (classification)

In [8]:
# simpleml toolkit: train the logistic-regression classifier and time it.
model = simpleml.Log_regression()
# NOTE(review): the trailing 10 is presumably the number of classes — confirm
# against the simpleml API.
model.load_data('test_data/x_train_cls.csv', 'test_data/t_train_cls.csv', 10)

# Time only train(). perf_counter_ns() is the monotonic, high-resolution clock
# meant for interval timing; time_ns() is wall-clock and may jump.
time_start = time.perf_counter_ns()
# NOTE(review): 50 presumably is the epoch count (the NumPy cell below uses
# epoch = 50) — confirm.
model.train(50)
time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")

# Reshape the weights returned by simpleml to (10, 784) so they can be
# compared element-wise with the NumPy implementation's weight matrix.
weight_lrml = np.array(model.weight).reshape((10, 784))

Loading data success!
Duration: 0.35549526 seconds
Training success!


In [3]:
# Evaluate the trained simpleml classifier on the test split.
# NOTE(review): the features are flattened to 1-D before being handed to
# model.test() — presumably the layout simpleml expects; confirm.
test_x, test_y = (
    pd.read_csv(csv_path, header=None).to_numpy().flatten()
    for csv_path in ('test_data/x_test_cls.csv', 'test_data/t_test_cls.csv')
)
pred = np.asarray(model.test(test_x))
# Accuracy = fraction of predictions equal to the reference labels.
n_correct = np.count_nonzero(test_y == pred)
print("Test acc:", n_correct / len(test_y))

Test acc:Testing success!
 0.8875


In [6]:
train_x = pd.read_csv('test_data/x_train_cls.csv', header=None).to_numpy()
# train_y = pd.read_csv('test_data/t_train_cls.csv', header=None).to_numpy()
n_cls = 10
# One-hot targets with one column per training sample, grouped by class:
# each class contributes 128 - 32 = 96 consecutive copies of its unit vector,
# giving an array of shape (n_cls, 96 * n_cls).
train_label = np.repeat(np.eye(n_cls), 128 - 32, axis=1)
train_y = train_label

In [4]:
import cv2
# parameters
n_cls = 10
n_img = 128   # images read per class
n_test = 32   # the first n_test images of every class go to the test split
img_h = 28
img_w = 28
train_test_ratio = (n_img - n_test) / n_test

# load image data
# Per-class chunks are collected in lists and stacked once at the end, instead
# of growing arrays with vstack from a dummy zero row that is sliced off later.
train_chunks = []
test_chunks = []
for digit in range(n_cls):
    digit_images = []
    for idx in range(n_img):
        path = 'MNIST/{}/{}.jpg'.format(digit, idx)
        # Named digit_img on purpose: the k-means cells keep their pixel array
        # in the global `img`, which a loop variable of that name would clobber.
        digit_img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals an unreadable or missing file by returning None.
        if digit_img is None:
            raise FileNotFoundError("could not read image: {}".format(path))
        if digit_img.size != img_h * img_w:
            raise ValueError("unexpected image size in {}: {}".format(path, digit_img.shape))
        # Scale pixel values to [0, 1] and flatten to one row vector.
        digit_images.append(digit_img.flatten() / 255.0)
    digit_images = np.array(digit_images)
    test_chunks.append(digit_images[:n_test, :])
    train_chunks.append(digit_images[n_test:, :])

# Shapes: (n_cls * (n_img - n_test), img_h * img_w) and (n_cls * n_test, img_h * img_w)
train_image = np.vstack(train_chunks)
test_image = np.vstack(test_chunks)

# One-hot labels with one column per sample, in the same class-by-class order
# as the image rows: shape (n_cls, samples).
class_onehot_basis = np.eye(n_cls)
train_label = np.repeat(class_onehot_basis, n_img - n_test, axis=1)
test_label = np.repeat(class_onehot_basis, n_test, axis=1)

In [9]:
# pure NumPy: softmax regression trained by full-batch gradient descent
epoch = 50
lr = 0.0005

# perf_counter_ns() is the monotonic, high-resolution clock for interval timing.
time_start = time.perf_counter_ns()

weight = np.zeros([n_cls, img_h*img_w])
# The loop variable is not called `iter`: at cell level that would rebind the
# builtin iter() for every later cell in the notebook.
for epoch_idx in range(epoch):
    # Class scores, shape (n_cls, n_samples).
    train_a = np.dot(weight, train_image.T)
    # Exponentiate once and reuse it for numerator and denominator.
    exp_a = np.exp(train_a)
    # Softmax over the class axis gives the predicted probabilities.
    train_y = exp_a / np.sum(exp_a, axis=0)
    # Gradient step on the cross-entropy loss.
    weight -= lr * np.dot((train_y - train_label), train_image)

time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")

Duration: 1.143997697 seconds


In [13]:
# True only if every simpleml weight matches the NumPy weight within tolerance.
np.all(np.isclose(weight_lrml, weight))

True

In [40]:
# sklearn: unregularised logistic regression, capped at 50 iterations to match
# epoch = 50 in the NumPy cell.
model_lr = LogisticRegression(penalty=None, max_iter=50)
# sklearn wants integer class ids, so collapse the one-hot columns with argmax.
train_label_sk = np.argmax(train_label, axis=0)

# Time only fit(). perf_counter_ns() is monotonic and high-resolution, unlike
# the wall-clock time_ns().
time_start = time.perf_counter_ns()
model_lr.fit(train_image, train_label_sk)
time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")

# model_lr.predict(X)
print("weight: ", model_lr.coef_)

Duration: 0.110849601 seconds
weight:  [[-0.00325456 -0.00264466 -0.05871713 ...  0.          0.
   0.        ]
 [-0.01173491 -0.01438416 -0.01554023 ...  0.          0.
   0.        ]
 [ 0.03735465 -0.07179375  0.0390509  ...  0.          0.
   0.        ]
 ...
 [ 0.02773822  0.01843406 -0.01666703 ...  0.          0.
   0.        ]
 [-0.03077941  0.02429813 -0.01246601 ...  0.          0.
   0.        ]
 [-0.04236132 -0.04137545 -0.02110152 ...  0.          0.
   0.        ]]


### 3. k-means

In [2]:
# hyperparameters: number of clusters and number of k-means iterations
K, iteration = 20, 100

In [5]:
# simpleml toolkit: run k-means and time the training call.
model = simpleml.Kmeans()
model.load_data('test_data/cat.csv', K)

# Time only train(). perf_counter_ns() is the monotonic, high-resolution clock
# meant for interval timing; time_ns() is wall-clock and may jump.
time_start = time.perf_counter_ns()
model.train(iteration)
time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")

# model.test()
center = model.center
classes = model.classes

Loading data success!
Duration: 1.416096252 seconds
Training success!


In [59]:
# Data for the k-means cells, read from a header-less CSV.
# NOTE(review): presumably one pixel per row with 3 channel values, as produced
# by the data-generation cell — confirm.
cat_frame = pd.read_csv('test_data/cat.csv', header=None)
img = cat_frame.to_numpy()

In [128]:
# pure NumPy k-means
# perf_counter_ns() is the monotonic, high-resolution clock for interval timing.
time_start = time.perf_counter_ns()

# Draw K distinct rows as initial centres. Sampling with replacement can pick
# the same row twice, and a duplicated centre never wins argmin, so its cluster
# stays empty. Replacement is only allowed when there are fewer rows than K.
center = img[np.random.choice(len(img), size=K, replace=len(img) < K)]
for i in range(iteration):
    # Squared Euclidean distance from every pixel to every centre: (n_pixels, K).
    distance = np.sum((img.reshape(-1, 1, 3) - center)**2, axis=2)
    classes = np.argmin(distance, axis=1)
    class_onehot = np.eye(K)[classes]
    # Per-cluster coordinate sums and member counts.
    temp_sum = np.sum(class_onehot.reshape(-1, K, 1) * img.reshape(-1, 1, 3), axis=0)
    count = np.sum(class_onehot, axis=0).reshape(K, 1)
    # A cluster with no members keeps its previous centre instead of becoming
    # 0/0 = NaN, which would poison every later distance computation.
    center = np.where(count > 0, temp_sum / np.maximum(count, 1), center)

time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")


Duration: 5.862847863 seconds


In [127]:
# sklearn: one randomly initialised run, using the same K and iteration budget
# as the other implementations (max_iter follows `iteration` rather than a
# separately hard-coded 100).
model = KMeans(n_clusters=K, n_init=1, max_iter=iteration, init='random')

# Time only fit(). perf_counter_ns() is monotonic and high-resolution, unlike
# the wall-clock time_ns().
time_start = time.perf_counter_ns()
model.fit(img)
time_end = time.perf_counter_ns()
reg_time = time_end - time_start
print("Duration:", reg_time / 1e9, "seconds")
print("Iterations:", model.n_iter_)
center = model.cluster_centers_
classes = model.predict(img)

Duration: 0.248065873 seconds
Iterations: 100


### data generation

In [None]:
import cv2
import time
import numpy as np
image_path = "./hw3.jpg"
image = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read;
# fail here with a clear message rather than on the reshape below.
if image is None:
    raise FileNotFoundError("could not read image: {}".format(image_path))
# One row per pixel with 3 channel values, scaled to [0, 1].
img = image.reshape((-1, 3)) / 255.0
# np.savetxt('cat.csv', img, delimiter=',')

In [8]:
import cv2
# parameters
n_cls = 10
n_img = 128   # images read per class
n_test = 32   # the first n_test images of every class go to the test split
img_h = 28
img_w = 28
train_test_ratio = (n_img - n_test) / n_test

# load image data
# Per-class chunks are collected in lists and stacked once at the end, instead
# of growing arrays with vstack from a dummy zero row that is sliced off later.
train_chunks = []
test_chunks = []
for digit in range(n_cls):
    digit_images = []
    for idx in range(n_img):
        path = 'MNIST/{}/{}.jpg'.format(digit, idx)
        # Named digit_img on purpose: the k-means cells keep their pixel array
        # in the global `img`, which a loop variable of that name would clobber.
        digit_img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals an unreadable or missing file by returning None.
        if digit_img is None:
            raise FileNotFoundError("could not read image: {}".format(path))
        if digit_img.size != img_h * img_w:
            raise ValueError("unexpected image size in {}: {}".format(path, digit_img.shape))
        # Scale pixel values to [0, 1] and flatten to one row vector.
        digit_images.append(digit_img.flatten() / 255.0)
    digit_images = np.array(digit_images)
    test_chunks.append(digit_images[:n_test, :])
    train_chunks.append(digit_images[n_test:, :])

# Shapes: (n_cls * (n_img - n_test), img_h * img_w) and (n_cls * n_test, img_h * img_w)
train_image = np.vstack(train_chunks)
test_image = np.vstack(test_chunks)

# Flat class-id labels in the same class-by-class order as the image rows.
# Kept as float64, matching the dtype the saved CSV files were built from.
class_ids = np.arange(n_cls, dtype=float)
train_label = np.repeat(class_ids, n_img - n_test)
test_label = np.repeat(class_ids, n_test)

In [9]:
# Write the classification splits as comma-separated files in the current
# working directory.
for csv_name, array in (
    ('x_train_cls.csv', train_image),
    ('t_train_cls.csv', train_label),
    ('x_test_cls.csv', test_image),
    ('t_test_cls.csv', test_label),
):
    np.savetxt(csv_name, array, delimiter=',')

### test

In [41]:
# k-means
def kmeans(data, K, iter):
    """Cluster ``data`` into ``K`` groups with a fixed number of k-means iterations.

    Parameters
    ----------
    data : array-like, shape (n_points, n_features)
        Points to cluster. Any feature count is accepted. Leading axes are
        flattened so the last axis is always the feature axis.
    K : int
        Number of clusters.
    iter : int
        Number of assignment/update iterations. The name shadows the builtin
        inside this function only; it is kept for caller compatibility.

    Returns
    -------
    center : ndarray, shape (K, n_features)
        Final cluster centres.
    class_onehot : ndarray, shape (n_points, K)
        One-hot assignment from the last assignment step, i.e. relative to the
        centres before the final update. With ``iter <= 0`` it is the
        assignment to the initial centres.

    Raises
    ------
    ValueError
        If ``data`` contains no points.

    Notes
    -----
    Initial centres come from the global NumPy RNG (``np.random.seed`` makes
    runs reproducible).
    """
    points = np.asarray(data, dtype=float)
    points = points.reshape(-1, points.shape[-1])
    n_points = len(points)
    if n_points == 0:
        raise ValueError("kmeans requires at least one data point")

    eye = np.eye(K)

    def _one_hot_assignment(current_center):
        # Assign every point to its nearest centre (squared Euclidean distance).
        sq_dist = np.sum((points[:, None, :] - current_center[None, :, :]) ** 2, axis=2)
        return eye[np.argmin(sq_dist, axis=1)]

    # initialize
    # Draw distinct rows where possible: sampling with replacement can duplicate
    # a centre, and a duplicate never wins argmin, so its cluster stays empty.
    start_rows = np.random.choice(n_points, size=K, replace=n_points < K)
    center = points[start_rows]

    class_onehot = None
    for _ in range(iter):
        class_onehot = _one_hot_assignment(center)
        # Update each cluster's centre to the mean of its members. The matrix
        # product yields the per-cluster sums without an (n_points, K,
        # n_features) temporary.
        counts = class_onehot.sum(axis=0).reshape(K, 1)
        sums = class_onehot.T @ points
        # Empty clusters keep their previous centre instead of becoming 0/0 = NaN.
        center = np.where(counts > 0, sums / np.maximum(counts, 1), center)

    if class_onehot is None:
        # No iteration ran: still return a valid assignment.
        class_onehot = _one_hot_assignment(center)

    return center, class_onehot

In [46]:
# print center
def print_means(mean, k, model):
    """Print the first ``k`` rows of ``mean`` as a three-column table.

    ``mean`` is indexed as ``mean[row, col]`` for columns 0..2, and each value
    is multiplied by 255 before printing. ``model`` is a label inserted into
    the banner line. The column header labels the values B, G and R.
    """
    banner = "---------- %s K = %d ----------" %(model, k)
    print(banner)
    print("         B          G          R   ")
    for row in range(k):
        blue, green, red = (mean[row, col] * 255 for col in range(3))
        print("%2d: %10f %10f %10f" %(row, blue, green, red))

In [52]:
# NOTE: this rebinds the notebook-wide K and iteration used by the earlier
# k-means cells.
K = 7
iteration = 100

# perf_counter() is the monotonic, high-resolution clock for measuring elapsed
# time; time.time() is wall-clock time and can jump if the clock is adjusted.
time_start = time.perf_counter()
center, class_onehot = kmeans(img, K, iteration)
time_end = time.perf_counter()
print("time cost:", time_end-time_start)
print_means(center, K, 'k-means')

time cost: 3.7064788341522217
---------- k-means K = 7 ----------
         B          G          R   
 0:  16.688316  40.974668  18.574869
 1: 105.116698 139.539326  81.580168
 2: 199.202706 208.755111 229.836969
 3: 102.572856 123.817821 148.772353
 4: 149.783546 165.975425 190.046789
 5:  30.091969  72.410471  32.778405
 6:  72.017115 101.387775  59.115892
