In [0]:
import numpy as np
import pandas as pd

In [0]:
#Metrics
def mean_square_error(y, y_pred):
    """Return the mean of the squared differences between y and y_pred."""
    residuals = y - y_pred
    return np.mean(np.square(residuals))

def root_mean_square_error(y, y_pred):
    """Return the square root of mean_square_error(y, y_pred)."""
    mse = mean_square_error(y, y_pred)
    return mse ** 0.5
  
def r2_score(y, y_pred):
    """Return the coefficient of determination, 1 - SS_residual / SS_total.

    SS_residual is the sum of squared differences between y and y_pred;
    SS_total is the sum of squared deviations of y from its own mean.
    """
    residual_ss = np.sum(np.square(y - y_pred))
    total_ss = np.sum(np.square(y - y.mean()))
    return 1 - residual_ss / total_ss

In [0]:
#Utils
def add_biases(X):
    """Return X with an extra last column of ones (the bias feature)."""
    n_rows = X.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.hstack([X, bias_column])
  
def linear(X, w):
    """Evaluate the linear model: matrix product of w with the transpose of X."""
    X_transposed = np.transpose(X)
    return np.matmul(w, X_transposed)
  
def preprocess(X_train, X_test):
    """Standardize both splits with training statistics and append a bias column.

    Parameters
    ----------
    X_train, X_test : arrays of samples in rows and features in columns.

    Returns
    -------
    (X_train, X_test) : standardized copies, each with a trailing column of
    ones added by ``add_biases``.

    The mean and standard deviation are computed on ``X_train`` only and then
    applied to both splits, so no statistic of the held-out split is used.
    """
    #Standardize
    X_mean, X_std = X_train.mean(axis=0), X_train.std(axis=0)

    # A feature that is constant in the training split has zero std. Dividing
    # by it yields 0/0 = NaN on the training rows and +/-inf on any held-out
    # row that differs from the training mean; nan_to_num would then turn that
    # inf into the largest finite float. Scaling such columns by 1 instead
    # keeps them at exactly 0 for training data and merely centred for test
    # data, and avoids the divide-by-zero RuntimeWarning.
    X_scale = np.where(X_std == 0, 1.0, X_std)
    X_train = (X_train - X_mean) / X_scale
    X_test = (X_test - X_mean) / X_scale

    # Still neutralise NaN/inf values that come from the input data itself.
    X_train = np.nan_to_num(X_train)
    X_test = np.nan_to_num(X_test)

    #Biases
    X_train = add_biases(X_train)
    X_test = add_biases(X_test)

    return X_train, X_test

In [0]:
#SGD  
def compute_gradient_on_batch(X, y, w):
    """Return the per-sample gradients for a linear model on one batch.

    Row ``i`` of the result is ``X[i] * (w @ X[i] - y[i])``, i.e. the feature
    vector scaled by that sample's prediction residual.

    Parameters
    ----------
    X : 2-D array, one sample per row.
    y : targets for the rows of X (flattened before use, so a column vector
        is accepted as well).
    w : weight vector with one entry per column of X.

    Returns
    -------
    Float array with the same shape as X. The caller is expected to reduce it
    over axis 0 (e.g. with a mean) to obtain the batch gradient.
    """
    # One matrix-vector product replaces the per-row Python loop.
    residuals = np.asarray(X @ np.ravel(w) - np.ravel(y), dtype=float)
    # Broadcast each residual across the features of its own row.
    return X * residuals[:, np.newaxis]

def stochastic_gradient_descent(X, y, lr=1e-2, max_epoch=10, 
                                batch_size=1, random_state=None):
    """Fit linear-model weights with mini-batch gradient descent.

    Parameters
    ----------
    X : 2-D array, one sample per row (include a bias column if one is wanted).
    y : targets for the rows of X.
    lr : step size applied to the mean gradient of each batch.
    max_epoch : number of full passes over the data.
    batch_size : number of consecutive rows per update; must be >= 1. The
        last batch of an epoch is shorter when the row count is not a
        multiple of batch_size.
    random_state : optional seed for the weight initialisation. ``None``
        (default) draws from NumPy's global random state, as before.

    Returns
    -------
    (w, errors) : the final weight vector and a list holding the mean squared
    error over the full (X, y) after each epoch.

    Raises
    ------
    ValueError : if batch_size is smaller than 1.

    Batches are consecutive slices visited in the same order every epoch; no
    reshuffling is done here, so shuffle the data beforehand if needed.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    #init w
    if random_state is None:
        w = np.random.randn(X.shape[1])
    else:
        w = np.random.RandomState(random_state).randn(X.shape[1])
    errors = []

    # Ceiling division: a trailing partial batch still gets its own update.
    iters = -(-X.shape[0] // batch_size)

    for epoch in range(max_epoch):
        for i in range(iters):
            start = i * batch_size
            stop = start + batch_size

            X_batch = X[start:stop]
            y_batch = y[start:stop]

            grad = compute_gradient_on_batch(X_batch, y_batch, w)
            w = w - lr * np.mean(grad, axis=0)

        # Track the training error on the whole data set once per epoch.
        errors.append(mean_square_error(y, linear(X, w)))

    return w, errors


In [0]:
#Folds
class KFolds:
    """Split (X, y) into k consecutive folds for cross-validation.

    Fold sizes differ by at most one row: the first ``n % k`` folds receive
    one extra row. (Sizing every fold as ceil(n / k) could leave the last
    folds empty, e.g. n=6, k=4 gave sizes 2, 2, 2, 0.)

    The data is sliced in its current order; shuffle it beforehand if the
    row order is meaningful.
    """

    def __init__(self, X, y, k=5):
        """Slice X and y into k folds.

        Parameters
        ----------
        X : array of samples, sliced along its first axis.
        y : targets aligned with X, sliced with the same boundaries.
        k : number of folds, with 2 <= k <= number of rows in X.

        Raises
        ------
        ValueError : if k is outside that range.
        """
        n_samples = X.shape[0]
        if k < 2:
            raise ValueError("k must be at least 2, got {}".format(k))
        if k > n_samples:
            raise ValueError(
                "k={} is larger than the number of samples ({})".format(k, n_samples))

        self.k = k

        # Cumulative fold boundaries; the remainder is spread over the first folds.
        base_size, remainder = divmod(n_samples, k)
        bounds = [0]
        for fold_i in range(k):
            bounds.append(bounds[-1] + base_size + (1 if fold_i < remainder else 0))

        self.X_folds = [X[bounds[i]:bounds[i + 1]] for i in range(k)]
        self.y_folds = [y[bounds[i]:bounds[i + 1]] for i in range(k)]

    def get_fold(self, fold_i):
        """Return (X_train, y_train, X_cv, y_cv) with fold ``fold_i`` held out.

        The training arrays are the concatenation of every other fold, in
        fold order.

        Raises
        ------
        IndexError : if fold_i is not in ``range(k)``. Negative indices are
            rejected because they would select a validation fold without
            excluding it from the training data.
        """
        if not 0 <= fold_i < self.k:
            raise IndexError(
                "fold_i must be in range({}), got {}".format(self.k, fold_i))

        X_cv = self.X_folds[fold_i]
        y_cv = self.y_folds[fold_i]

        X_train = np.concatenate([self.X_folds[i] for i in range(self.k) if i != fold_i])
        y_train = np.concatenate([self.y_folds[i] for i in range(self.k) if i != fold_i])

        return X_train, y_train, X_cv, y_cv

In [6]:
# Fetch and unpack the dataset archive from the UCI repository.
# -nc: skip the download when Dataset.zip already exists (otherwise a re-run
#      saves a duplicate Dataset.zip.1).
# -n:  never overwrite files that were already extracted (otherwise unzip
#      stops to ask for confirmation on a re-run).
!wget -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
!unzip -q -n Dataset.zip

--2019-09-11 11:52:41--  https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19055526 (18M) [application/x-httpd-php]
Saving to: ‘Dataset.zip’


2019-09-11 11:52:47 (22.5 MB/s) - ‘Dataset.zip’ saved [19055526/19055526]



In [0]:
# Load the training split; the CSV has no header row, so columns are numbered.
train_df = pd.read_csv('Dataset/Training/Features_Variant_1.csv',header=None)

# Column 53 is used as the regression target; every other column is a feature.
y = train_df[53].values
X = train_df.drop(columns=[53]).values

#Shuffle
# A dedicated, seeded generator makes the row order (and therefore the fold
# membership downstream) identical on every run without touching NumPy's
# global random state.
SHUFFLE_SEED = 42
p = np.random.RandomState(SHUFFLE_SEED).permutation(X.shape[0])
X, y = X[p], y[p]

In [55]:
# 5-fold cross-validation: train on four folds, score on the held-out one.
kfold = KFolds(X, y, k=5)
rmse_list, r2_list, weights_list = [], [], []

for i in range(kfold.k):
    # Hold out fold i and standardize both parts with the training statistics.
    X_train, y_train, X_cv, y_cv = kfold.get_fold(i)
    X_train, X_cv = preprocess(X_train, X_cv)

    # Fit the weights on the training folds only.
    w, errors = stochastic_gradient_descent(
        X_train,
        y_train,
        lr=1e-2,
        max_epoch=20,
        batch_size=1000,
    )
    weights_list.append(w)

    # Score the fitted model on the held-out fold.
    y_pred = linear(X_cv, w)
    r2_list.append(r2_score(y_cv, y_pred))
    rmse_list.append(root_mean_square_error(y_cv, y_pred))

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [40]:
# Print the per-fold scores with their mean and standard deviation.
# The rows are built from the actual number of folds, so the table stays
# correct when k differs from 5 (fixed five-slot format strings would raise
# for k < 5 and print fold scores in the mean/std columns for k > 5).
fold_ids = "\t".join(str(fold_i) for fold_i in range(kfold.k))
print("Folds\t│{}\t│mean\tstd".format(fold_ids))
# Each fold column is one 8-character tab stop wide; one cell is taken by "│".
print("────────┼{}┼─────────────".format("─" * (8 * kfold.k - 1)))

for label, scores, score_fmt, stat_fmt in (
    ("RMSE", rmse_list, "{:.1f}", "{:.2f}"),
    ("R2", r2_list, "{:.3f}", "{:.3f}"),
):
    scores = np.asarray(scores)
    cells = "\t".join(score_fmt.format(score) for score in scores)
    print("{}\t│{}\t│{}\t{}".format(
        label,
        cells,
        stat_fmt.format(scores.mean()),
        stat_fmt.format(scores.std()),
    ))

Folds	│0	1	2	3	4	│mean	std
────────┼───────────────────────────────────────┼─────────────
RMSE	│28.0	26.2	32.3	33.6	28.2	│29.66	2.80
R2	│0.333	0.324	0.207	0.264	0.367	│0.299	0.057
