In [5]:
import numpy as np 
import pandas as pd 
import math 

In [6]:
# sampling x 
x1 = np.random.normal(3 , 2, (1000000,) ) 
x2 = np.random.normal(-1, 2 , (1000000,)) 
x = np.zeros((1000000, 3)) 

x[:, 0] = 1 # adding intercept term 
x[:, 1] = x1 
x[:, 2] = x2 
# print(np.var(x[:, 1]))  

# sampling y 

original_hypothesis = np.array([3,1,2]) 
y = np.matmul(x, original_hypothesis) 
y += np.random.normal(0, math.sqrt(2) , y.shape)  
# print(np.mean(y)) 



In [7]:
def compute_batch_error(X, Y, theta, batch_size, batch_number):
    X_batch = X[batch_size  * (batch_number - 1)   : batch_size * (batch_number - 1) + batch_size, :] 
    Y_batch = Y[ batch_size  * (batch_number - 1)   : batch_size * (batch_number - 1) + batch_size] 

    Z = Y_batch - np.matmul(X_batch, theta)
    error = np.matmul(np.transpose(Z), Z) / (2 * batch_size) 
    return error  

In [8]:
def compute_batch_gradient(X, Y, theta, batch_size, batch_number):
    X_batch = X[batch_size  * (batch_number - 1)   : batch_size * (batch_number - 1) + batch_size, :] 
    Y_batch = Y[ batch_size  * (batch_number - 1)   : batch_size * (batch_number - 1) + batch_size] 
    Z_batch = Y_batch - np.matmul(X_batch, theta) 
    # print(Z)
    gradient = np.zeros(theta.size) 
    for j in range(theta.size):
        X_j = X_batch[:, j] 
        gradient[j] = np.sum(Z_batch * X_j)/ batch_size 
    return gradient

In [29]:
def minibatch_gradient_descent(X, Y, batch_size, k, gamma):
    current_batch_number = 1 
    theta = np.zeros(X[0].size) 
    # print(theta) 
    initial_error = compute_batch_error(X, Y, theta,  batch_size, int(current_batch_number) ) 
    epsilon = 0.000001 * initial_error 

    # print(f"factor is {pow(batch_size, -0.5)}")
    #we decrease epsilon for large batch sizes, since smaller batch sizes converge more noisily to the 
    # optima, thus they require a larger epsilon 
     
    epsilon = pow(batch_size, -0.5) * epsilon 
    max_count = int(1/epsilon) 
    # to avoid oscillations, we introduce this cap  


    learning_parameter = 0.001
    MOD = Y.size / batch_size 
    max_count = max(max_count, MOD)  
    print(f"maxcount is {max_count}")
    no_of_iterations = 0 


    error_arr = []
    running_avg = 0 
    previous_running_avg = 0 

    for i in range(k):
        grad = compute_batch_gradient(X, Y, theta, batch_size, int(current_batch_number) ) 
        theta = theta + learning_parameter * grad 
        curr_error = compute_batch_error(X, Y,theta,  batch_size, int(current_batch_number) )
        error_arr.append(curr_error) 
        running_avg += curr_error 
        current_batch_number = (current_batch_number + 1) 
        if (current_batch_number > MOD):  current_batch_number -= MOD 
        # no_of_iterations += 1


    running_avg /= k
    last_count = 0  # no of consecutive times difference in averages is less than epsilon  
    # epsilon = 0.000001 * running_avg 
    
    while (last_count < gamma and no_of_iterations < max_count):
        
        
        previous_running_avg = running_avg 
        previous_error = error_arr.pop(0) 

        # print(f" batch size, batch num {batch_size} {current_batch_number}") 
        grad = compute_batch_gradient(X, Y, theta, batch_size, int(current_batch_number) ) 
        theta = theta + learning_parameter * grad 
        
        curr_error = compute_batch_error(X, Y,theta,  batch_size, int(current_batch_number) ) 
        error_arr.append(curr_error) 

        running_avg = running_avg + (curr_error - previous_error)/k  

        # print(f"current theta is {theta}") 
        # print(f"last count {last_count}")
        # print(f"average error is {running_avg}") 
        
        if (abs(previous_running_avg - running_avg) < epsilon) :  last_count += 1 
        else: last_count = 0
        
        no_of_iterations += 1 
        current_batch_number = (current_batch_number + 1) 
        if (current_batch_number > MOD):  current_batch_number -= MOD 

    print(f"no of iterations is {no_of_iterations}") 
    # print(f"learned theta is {theta}") 
    return theta 
    

In [10]:
def batch_gradient_descent(X, Y):
    theta = np.zeros(X[0].size) 
    print(theta) 
    initial_error = compute_batch_error(X, Y, theta,  Y.size , 1 ) 
    error = initial_error 
    epsilon = 0.0001 * initial_error 
    # print(f"intial error is {initial_error}")
    learning_parameter = 0.001
    no_of_iterations = 0 

    previous_error = 2*error 
    while ( previous_error - error > epsilon) : 

        previous_error = error 
        grad = compute_batch_gradient(X, Y, theta, Y.size, 1 ) 
        theta = theta + learning_parameter * grad 
        
        error = compute_batch_error(X, Y,theta,  Y.size , 1 ) 


        # print(f"current theta is {theta}") 

        no_of_iterations += 1 

    print(f"no of iterations is {no_of_iterations}") 
    print(f"learned theta is {theta}") 
    return theta 
    

In [24]:
def compute_error(X, Y, hypothesis):
    Z = (Y - np.matmul(X, hypothesis)) 
    m = Y.size 
    error = np.matmul(np.transpose(Z), Z)/ (2 * m) 
    return error 

In [31]:
test_set = pd.read_csv("./ass1_data/data/q2/q2test.csv") 
test_set = test_set.to_numpy() 
temp = np.zeros((10000, 4)) 
temp[:,0] = 1 
temp[:, 1] = test_set[:, 0] 
temp[:, 2] = test_set[:, 1] 
temp[:, 3] = test_set[:, 2]

test_set = temp 

# print(test_set) 

test_input = test_set[:, 0 : 3] 
test_output = test_set[:, 3] 

model1 = minibatch_gradient_descent(x , y , 1, 10,2) 
error1 = compute_error(test_input, test_output, model1) 
print("model1 error1", model1, error1 )
model2 = minibatch_gradient_descent(x, y, 100, 10,1)  
model3 = minibatch_gradient_descent(x, y , 10000, 5,1) 
model4 = minibatch_gradient_descent(x, y, 1000000, 1,1) 

maxcount is 1000000.0
no of iterations is 90164
model1 error1 [3.01095502 0.97275792 1.96257408] 1.097125074271597
maxcount is 516992
no of iterations is 2400
maxcount is 5211175
no of iterations is 4522
maxcount is 52476196
no of iterations is 19078


In [32]:
error1 = compute_error(test_input, test_output, model1) 
error2 = compute_error(test_input, test_output, model2) 
error3 = compute_error(test_input, test_output, model3) 
error4 = compute_error(test_input, test_output, model4) 

print(model1) 
print(model2) 
print(model3)
print(model4)

print("error wrt learned models") 
print(error1) 
print(error2)
print(error3) 
print(error4)

og_error1 = compute_error(test_input, test_output, original_hypothesis) 

print("error wrt original model") 
print(og_error1) 


[3.01095502 0.97275792 1.96257408]
[1.5520758  1.31782229 1.89540753]
[2.18568377 1.17847891 1.94092125]
[2.98348266 1.00425365 1.99914597]
error wrt learned models
1.097125074271597
7.067409341304615
2.9051798719931075
0.9839013787157844
error wrt original model
0.9829469215
