In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
# cost or loss function
def cost(Y, Yhat):
    return np.mean((Yhat - Y) ** 2)

In [3]:
path = '../../data/KNN_Linear_Regression/SAT_GPA.csv'

df = pd.read_csv(path)
df.head()

Unnamed: 0,SAT,GPA
0,1714,2.4
1,1664,2.52
2,1760,2.54
3,1685,2.74
4,1693,2.83


In [4]:
X = df["SAT"]
y = df["GPA"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=30/84, random_state=1)

In [5]:
X_train = X_train.to_numpy().reshape(-1, 1)
X_test = X_test.to_numpy().reshape(-1, 1)

y_train = y_train.to_numpy().reshape(-1, 1)
y_test = y_test.to_numpy().reshape(-1, 1)

In [6]:
d0 = 1
d1 = h = 100 # size of hidden layer
d2 = C = 1
# initialize parameters randomly
W1 = 0.01*np.random.randn(d0, d1)
b1 = np.zeros((d1, 1))
W2 = 0.01*np.random.randn(d1, d2)
b2 = np.zeros((d2, 1))

N = X_train.T.shape[1]
eta = 0.0005 # learning rate

start_time = time.time()

for i in range(10000):
    ## Feedforward
    Z1 = np.dot(W1.T, X_train.T) + b1
    A1 = np.where(Z1 > 0, Z1, 0.01 * Z1)  # LeakyReLU
    Z2 = np.dot(W2.T, A1) + b2
    Yhat = Z2

    # print loss after each 1000 iterations
    if i %1000 == 0:
        # compute the loss: average cross-entropy loss
        loss = cost(y_train, Yhat)
        print("iter %d, loss: %f" %(i, loss))

    # backpropagation
    E2 = (Yhat - y_train.T)/N
    dW2 = np.dot(A1, E2.T)
    db2 = np.sum(E2, axis = 1, keepdims = True)
    E1 = np.dot(W2, E2)
    E1 = np.where(Z1 > 0, E1, 0.01 * E1)  # Gradient of LeakyReLU
    dW1 = np.dot(X_train.T, E1.T)
    db1 = np.sum(E1, axis = 1, keepdims = True)
    
    # Gradient clipping
    clip_value = 1.0
    dW1 = np.clip(dW1, -clip_value, clip_value)
    dW2 = np.clip(dW2, -clip_value, clip_value)
    db1 = np.clip(db1, -clip_value, clip_value)
    db2 = np.clip(db2, -clip_value, clip_value)
    
    # Gradient Descent update
    W1 += -eta*dW1
    b1 += -eta*db1
    W2 += -eta*dW2
    b2 += -eta*db2

training_time = time.time() - start_time
print(f"\nTraining Time: %f s" % training_time)

iter 0, loss: 4.356787
iter 1000, loss: 0.257125
iter 2000, loss: 0.236572
iter 3000, loss: 0.224616
iter 4000, loss: 0.213480
iter 5000, loss: 0.213319
iter 6000, loss: 0.213300
iter 7000, loss: 0.213351
iter 8000, loss: 0.213305
iter 9000, loss: 0.213324

Training Time: 0.512405 s


In [7]:
Z1 = np.dot(W1.T, X_train.T) + b1
A1 = np.maximum(Z1, 0)
Z2 = np.dot(W2.T, A1) + b2

Z2

array([[4.1165482 , 3.98285102, 3.67785434, 4.04343255, 3.96613887,
        4.20846501, 3.76977115, 3.54206814, 3.98494004, 3.46477447,
        3.81990759, 4.02463139, 3.75514802, 4.05178863, 3.87004403,
        4.06850077, 3.68829943, 3.44179526, 3.525356  , 3.98911808,
        3.66532023, 3.62980692, 3.72590176, 4.04761059, 3.52117796,
        3.41881606, 3.65278612, 3.64025201, 3.88675618, 4.07894587,
        3.8094625 , 3.89929029, 3.4689525 , 3.48148661, 4.16668464,
        3.64860808, 3.69247747, 4.16459562, 3.65278612, 3.79275035,
        3.62353986, 3.70918961, 3.75932606, 3.62980692, 3.65905317,
        3.42090508, 3.61100575, 3.89929029, 3.43343919, 4.15415053,
        3.8094625 , 4.07058979, 3.56922538, 4.02672041]])

In [8]:
num_runs = 10
prediction_times = []

for _ in range(num_runs):
    start_time = time.time()
    
    Z1 = np.dot(W1.T, X_train.T) + b1
    A1 = np.maximum(Z1, 0)
    Z2 = np.dot(W2.T, A1) + b2
    
    prediction_time = time.time() - start_time
    prediction_times.append(prediction_time)

avg_prediction_time = np.mean(prediction_times)
print("Average Predict Time: %f s" % avg_prediction_time)

Average Predict Time: 0.000100 s


In [9]:
print("R square:", r2_score(y_train, Z2[0]))
print("MSE:", mean_squared_error(y_train, Z2[0]))

R square: -2.5791586907433435
MSE: 0.2843202163193636


In [10]:
d0 = 1
d1 = h = 100 # size of hidden layer
d2 = C = 1
# initialize parameters randomly
W1 = 0.01*np.random.randn(d0, d1)
b1 = np.zeros((d1, 1))
W2 = 0.01*np.random.randn(d1, d2)
b2 = np.zeros((d2, 1))

N = X_test.T.shape[1]
eta = 0.0005 # learning rate

start_time = time.time()

for i in range(10000):
    ## Feedforward
    Z1 = np.dot(W1.T, X_test.T) + b1
    A1 = np.where(Z1 > 0, Z1, 0.01 * Z1)  # LeakyReLU
    Z2 = np.dot(W2.T, A1) + b2
    Yhat = Z2

    # print loss after each 1000 iterations
    if i %1000 == 0:
        # compute the loss: average cross-entropy loss
        loss = cost(y_test, Yhat)
        print("iter %d, loss: %f" %(i, loss))

    # backpropagation
    E2 = (Yhat - y_test.T)/N
    dW2 = np.dot(A1, E2.T)
    db2 = np.sum(E2, axis = 1, keepdims = True)
    E1 = np.dot(W2, E2)
    E1 = np.where(Z1 > 0, E1, 0.01 * E1)  # Gradient of LeakyReLU
    dW1 = np.dot(X_test.T, E1.T)
    db1 = np.sum(E1, axis = 1, keepdims = True)
    
    # Gradient clipping
    clip_value = 1.0
    dW1 = np.clip(dW1, -clip_value, clip_value)
    dW2 = np.clip(dW2, -clip_value, clip_value)
    db1 = np.clip(db1, -clip_value, clip_value)
    db2 = np.clip(db2, -clip_value, clip_value)
    
    # Gradient Descent update
    W1 += -eta*dW1
    b1 += -eta*db1
    W2 += -eta*dW2
    b2 += -eta*db2

training_time = time.time() - start_time
print(f"\nTraining Time: %f s" % training_time)

iter 0, loss: 17.853502
iter 1000, loss: 0.135222
iter 2000, loss: 0.138426
iter 3000, loss: 0.135594
iter 4000, loss: 0.138681
iter 5000, loss: 0.138793
iter 6000, loss: 0.138769
iter 7000, loss: 0.138743
iter 8000, loss: 0.138717
iter 9000, loss: 0.138692

Training Time: 0.427711 s


In [11]:
Z1 = np.dot(W1.T, X_test.T) + b1
A1 = np.maximum(Z1, 0)
Z2 = np.dot(W2.T, A1) + b2

Z2

array([[3.19381602, 3.55500691, 3.47696064, 3.41161957, 3.76555035,
        3.71109946, 3.45699531, 3.27004726, 3.59493756, 3.18474087,
        3.38983922, 3.42250975, 3.32631318, 3.40254442, 3.01049803,
        3.36987389, 3.55500691, 3.32631318, 3.34990857, 3.13392004,
        3.42976987, 3.23919176, 3.65120348, 3.4007294 , 3.41524963,
        3.44247508, 3.54956182, 3.50600111, 3.28638253, 3.65664857]])

In [12]:
num_runs = 10
prediction_times = []

for _ in range(num_runs):
    start_time = time.time()
    
    Z1 = np.dot(W1.T, X_test.T) + b1
    A1 = np.maximum(Z1, 0)
    Z2 = np.dot(W2.T, A1) + b2
    
    prediction_time = time.time() - start_time
    prediction_times.append(prediction_time)

avg_prediction_time = np.mean(prediction_times)
print("Average Predict Time: %f s" % avg_prediction_time)

Average Predict Time: 0.000000 s


In [13]:
print("R square:", r2_score(y_test, Z2[0]))
print("MSE:", mean_squared_error(y_test, Z2[0]))

R square: 0.1464407477745484
MSE: 0.04607740459313581


In [17]:
linR = LinearRegression()

start_time = time.time()

linR.fit(X_train, y_train)

training_time = time.time() - start_time
print("Training Time: %f s" % training_time)


num_runs = 10
prediction_times = []

for _ in range(num_runs):
    start_time = time.time()
    y_pred = linR.predict(X_test)
    prediction_time = time.time() - start_time
    prediction_times.append(prediction_time)
    
avg_prediction_time = np.mean(prediction_times)
print("Average Predict Time: %f s" % avg_prediction_time)

print("R square:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

Training Time: 0.002999 s
Average Predict Time: 0.000102 s
R square: 0.05950572997060655
MSE: 0.05077038868090673


In [21]:
d0 = 1
d1 = h = 75 # size of hidden layer
d2 = C = 1
# initialize parameters randomly
W1 = 0.01*np.random.randn(d0, d1)
b1 = np.zeros((d1, 1))
W2 = 0.01*np.random.randn(d1, d2)
b2 = np.zeros((d2, 1))

N = X_train.T.shape[1]
eta = 0.0005 # learning rate

start_time = time.time()

for i in range(10000):
    ## Feedforward
    Z1 = np.dot(W1.T, X_train.T) + b1
    A1 = np.where(Z1 > 0, Z1, 0.01 * Z1)  # LeakyReLU
    Z2 = np.dot(W2.T, A1) + b2
    Yhat = Z2

    # print loss after each 1000 iterations
    if i %1000 == 0:
        # compute the loss: average cross-entropy loss
        loss = cost(y_train, Yhat)
        print("iter %d, loss: %f" %(i, loss))

    # backpropagation
    E2 = (Yhat - y_train.T)/N
    dW2 = np.dot(A1, E2.T)
    db2 = np.sum(E2, axis = 1, keepdims = True)
    E1 = np.dot(W2, E2)
    E1 = np.where(Z1 > 0, E1, 0.01 * E1)  # Gradient of LeakyReLU
    dW1 = np.dot(X_train.T, E1.T)
    db1 = np.sum(E1, axis = 1, keepdims = True)
    
    # Gradient clipping
    clip_value = 1.0
    dW1 = np.clip(dW1, -clip_value, clip_value)
    dW2 = np.clip(dW2, -clip_value, clip_value)
    db1 = np.clip(db1, -clip_value, clip_value)
    db2 = np.clip(db2, -clip_value, clip_value)
    
    # Gradient Descent update
    W1 += -eta*dW1
    b1 += -eta*db1
    W2 += -eta*dW2
    b2 += -eta*db2

training_time = time.time() - start_time
print(f"\nTraining Time: %f s" % training_time)


Z1 = np.dot(W1.T, X_test.T) + b1
A1 = np.maximum(Z1, 0)
Z2 = np.dot(W2.T, A1) + b2

print("R square:", r2_score(y_test, Z2[0]))
print("MSE:", mean_squared_error(y_test, Z2[0]))

iter 0, loss: 15.533006
iter 1000, loss: 0.160218
iter 2000, loss: 0.159284
iter 3000, loss: 0.157566
iter 4000, loss: 0.155446
iter 5000, loss: 0.158287
iter 6000, loss: 0.156989
iter 7000, loss: 0.157734
iter 8000, loss: 0.158166
iter 9000, loss: 0.158173

Training Time: 0.569203 s
R square: 0.017371492386684118
MSE: 0.053044907183653754


In [22]:
d0 = 1
d1 = h = 50 # size of hidden layer
d2 = C = 1
# initialize parameters randomly
W1 = 0.01*np.random.randn(d0, d1)
b1 = np.zeros((d1, 1))
W2 = 0.01*np.random.randn(d1, d2)
b2 = np.zeros((d2, 1))

N = X_train.T.shape[1]
eta = 0.0005 # learning rate

start_time = time.time()

for i in range(10000):
    ## Feedforward
    Z1 = np.dot(W1.T, X_train.T) + b1
    A1 = np.where(Z1 > 0, Z1, 0.01 * Z1)  # LeakyReLU
    Z2 = np.dot(W2.T, A1) + b2
    Yhat = Z2

    # print loss after each 1000 iterations
    if i %1000 == 0:
        # compute the loss: average cross-entropy loss
        loss = cost(y_train, Yhat)
        print("iter %d, loss: %f" %(i, loss))

    # backpropagation
    E2 = (Yhat - y_train.T)/N
    dW2 = np.dot(A1, E2.T)
    db2 = np.sum(E2, axis = 1, keepdims = True)
    E1 = np.dot(W2, E2)
    E1 = np.where(Z1 > 0, E1, 0.01 * E1)  # Gradient of LeakyReLU
    dW1 = np.dot(X_train.T, E1.T)
    db1 = np.sum(E1, axis = 1, keepdims = True)
    
    # Gradient clipping
    clip_value = 1.0
    dW1 = np.clip(dW1, -clip_value, clip_value)
    dW2 = np.clip(dW2, -clip_value, clip_value)
    db1 = np.clip(db1, -clip_value, clip_value)
    db2 = np.clip(db2, -clip_value, clip_value)
    
    # Gradient Descent update
    W1 += -eta*dW1
    b1 += -eta*db1
    W2 += -eta*dW2
    b2 += -eta*db2

training_time = time.time() - start_time
print(f"\nTraining Time: %f s" % training_time)

Z1 = np.dot(W1.T, X_test.T) + b1
A1 = np.maximum(Z1, 0)
Z2 = np.dot(W2.T, A1) + b2

print("R square:", r2_score(y_test, Z2[0]))
print("MSE:", mean_squared_error(y_test, Z2[0]))

iter 0, loss: 5.748342
iter 1000, loss: 0.148538
iter 2000, loss: 0.140860
iter 3000, loss: 0.140944
iter 4000, loss: 0.142364
iter 5000, loss: 0.143029
iter 6000, loss: 0.143265
iter 7000, loss: 0.143344
iter 8000, loss: 0.143370
iter 9000, loss: 0.143341

Training Time: 0.467271 s
R square: 0.04518765001488845
MSE: 0.05154331681846294
