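Added two functions: `normal_equation()` (closed-form least-squares fit) and `x_power_with_ones()` (polynomial feature expansion with a leading column of ones).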

In [46]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from csv import reader
from sklearn.preprocessing import StandardScaler
from random import randrange
import operator
import functools


def linear_grad_func(theta, x, y):
    """Gradient of the squared-error loss with respect to ``theta``.

    A column of ones is prepended to ``x`` so that the first entry of
    ``theta`` is the intercept, exactly as ``linear_val_func`` does.

    Parameters
    ----------
    theta : ndarray, shape (n_features + 1,) or (n_features + 1, k)
        Current weights (intercept first).
    x : ndarray, shape (n_samples, n_features)
        Feature matrix without the bias column.
    y : ndarray
        Targets, shaped like the predictions ``design.dot(theta)``.

    Returns
    -------
    ndarray
        ``X^T (X theta - y) / n_samples`` with the same shape as ``theta``,
        so that ``theta - lr * grad`` is an element-wise update.

    Notes
    -----
    This is the gradient of *half* the mean squared error; the factor 2 of
    the plain MSE reported by ``linear_cost_func`` is absorbed into the
    learning rate.
    """
    n_samples = x.shape[0]
    # Build the bias-augmented design matrix once and reuse it.
    design = np.c_[np.ones(n_samples), x]
    residual = np.dot(design, theta) - y
    # X^T r keeps theta's orientation. The previous r^T X returned a row
    # vector for a column theta, which made ``theta - lr * grad`` broadcast
    # into a square matrix.
    return np.dot(design.T, residual) / n_samples


def linear_val_func(theta, x):
    """Forward pass: linear-model predictions for every row of ``x``.

    A column of ones is prepended to ``x`` so that the first entry of
    ``theta`` plays the role of the intercept. ``theta`` must therefore have
    ``x.shape[1] + 1`` leading entries.
    """
    bias = np.ones(x.shape[0])
    design = np.c_[bias, x]
    return design.dot(theta)


def linear_cost_func(theta, x, y):
    """Mean squared error of the linear model ``theta`` on ``(x, y)``.

    Predictions come from ``linear_val_func``; the squared residuals are
    averaged over every element.
    """
    residual = linear_val_func(theta, x) - y
    return np.mean(residual ** 2)


def linear_grad_desc(theta, X_train, Y_train, lr, max_iter, tolerance):
    """Fit ``theta`` by batch gradient descent.

    Starting from the given ``theta``, repeatedly steps against the gradient
    from ``linear_grad_func`` with step size ``lr``. Iteration stops once the
    absolute change in cost is no longer greater than ``tolerance`` or the
    step counter (which starts at 1) reaches ``max_iter``.

    Returns
    -------
    (theta, RMSE_iter)
        The final weights and the list of training RMSE values: one for the
        initial ``theta`` plus one per update performed.
    """
    def current_rmse(weights):
        # Summed squared residuals divided by the number of target rows.
        residual = linear_val_func(weights, X_train) - Y_train
        return np.sqrt(np.sum(residual ** 2) / Y_train.shape[0])

    cost = linear_cost_func(theta, X_train, Y_train)
    RMSE_iter = [current_rmse(theta)]
    cost_change = 1
    step = 1

    while cost_change > tolerance and step < max_iter:
        previous_cost = cost

        # One descent step along the negative gradient.
        theta = theta - lr * linear_grad_func(theta, X_train, Y_train)

        # Re-evaluate the loss and record progress.
        cost = linear_cost_func(theta, X_train, Y_train)
        RMSE_iter.append(current_rmse(theta))
        cost_change = abs(cost - previous_cost)
        step += 1

    return theta, RMSE_iter


def load_dataset(filename):
    '''Read a comma-separated file into a NumPy array.

    Fields are split on ',' with '"' as the quote character. Every field is
    kept as a string; callers convert with ``.astype(float)`` as needed.

    Returns: a NumPy array built from the list of parsed rows.
    '''
    with open(filename, 'r') as csv_file:
        rows = list(reader(csv_file, delimiter=',', quotechar='"'))
        table = np.asarray(rows)

    return table


# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Rows are drawn without replacement from a copy of ``dataset`` (the input
    is not modified). Each fold holds ``int(len(dataset) / n_folds)`` rows,
    so any remainder rows are left out of every fold.

    Returns a list of ``n_folds`` lists of rows.
    """
    pool = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    folds = list()
    for _fold_no in range(n_folds):
        # Pop a random remaining row fold_size times.
        picked = [pool.pop(randrange(len(pool))) for _draw in range(fold_size)]
        folds.append(picked)
    return folds

def normal_equation(X_train, Y_train, power):
    """Closed-form least-squares fit on polynomial features.

    ``X_train`` is expanded with ``x_power_with_ones`` (bias column plus
    powers ``1..power`` of every feature) and the weights minimising the
    squared error against ``Y_train`` are returned.

    Parameters
    ----------
    X_train : ndarray, shape (n_samples, n_features)
    Y_train : ndarray, shape (n_samples,) or (n_samples, k)
    power : int
        Highest power of each feature included in the design matrix.

    Returns
    -------
    ndarray
        Weights with one row per column of the expanded design matrix.
    """
    X = x_power_with_ones(X_train, power)
    # pinv(X) equals inv(X^T X) X^T when X has full column rank, but it also
    # copes with singular / collinear designs (for example a constant feature
    # that standardises to all zeros), where an explicit inverse would raise.
    return np.dot(np.linalg.pinv(X), Y_train)

def x_power_with_ones(X, power):
    """Polynomial design matrix with a leading bias column.

    The result starts with a column of ones, followed, for each input column
    in order, by that column raised to the powers ``1, 2, ..., power``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
    power : int
        Highest power to include; ``0`` yields only the bias column.

    Returns
    -------
    ndarray, shape (n_samples, 1 + n_features * power)
        Always two-dimensional, also when only the bias column is present.
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    # Collect the columns first and stack once, rather than re-copying the
    # growing matrix for every added column.
    columns = [np.ones(n_rows)]
    for i in range(n_cols):
        for degree in range(1, power + 1):
            columns.append(X[:, i] ** degree)

    return np.column_stack(columns)


def linear_regression(dataset, n_folds, lr, max_iter, tolerance, power):
    """Cross-validated polynomial regression solved with the normal equation.

    The rows of ``dataset`` (features in all but the last column, target in
    the last column) are split into ``n_folds`` random folds. For each fold
    the model is fitted on the remaining folds and evaluated on the held-out
    one. Per-fold train/test RMSE and the overall mean RMSE, mean SSE and SSE
    standard deviation are printed; nothing is returned.

    Parameters
    ----------
    dataset : ndarray, shape (n_samples, n_features + 1)
    n_folds : int
    lr, max_iter, tolerance :
        Hyper-parameters of the gradient-descent solver (``linear_grad_desc``).
        They are unused while the closed-form solver is active and are kept
        so existing calls continue to work.
    power : int
        Highest polynomial power passed to ``normal_equation``.
    """
    def squared_errors(features, targets, weights):
        # Predict through the same polynomial expansion used for fitting;
        # evaluating on the raw features only matches the weights for power == 1.
        design = x_power_with_ones(features, power)
        return (np.dot(design, weights) - targets) ** 2

    # split dataset into training and testing
    dataset_split = cross_validation_split(dataset, n_folds)
    RMSE_train = []
    RMSE_test = []
    SSE_train = []
    SSE_test = []

    for i in range(n_folds):
        test = np.array(dataset_split[i])
        # All rows of every fold except the held-out one.
        train = np.array([row
                          for k, fold in enumerate(dataset_split) if k != i
                          for row in fold])

        # Fit the scaler on the training features only, then apply the same
        # mean/std to the held-out features.
        scaler = StandardScaler().fit(train[:, :-1])
        X_train = scaler.transform(train[:, :-1])
        X_test = scaler.transform(test[:, :-1])

        # Targets as column vectors.
        Y_train = train[:, -1][:, None]
        Y_test = test[:, -1][:, None]

        fitted_theta = normal_equation(X_train, Y_train, power)

        train_sse = np.sum(squared_errors(X_train, Y_train, fitted_theta))
        test_sse = np.sum(squared_errors(X_test, Y_test, fitted_theta))

        SSE_train.append(train_sse)
        SSE_test.append(test_sse)
        RMSE_train.append(np.sqrt(train_sse / Y_train.shape[0]))
        RMSE_test.append(np.sqrt(test_sse / Y_test.shape[0]))

        print('Train RMSE: {}'.format(RMSE_train[i]))
        print('Test RMSE: {}'.format(RMSE_test[i]))
    print('Overall Mean Train RMSE: {}'.format(np.mean(RMSE_train)))
    print('Overall Mean Test RMSE: {}'.format(np.mean(RMSE_test)))
    print('Overall Mean Train SSE: {}'.format(np.mean(SSE_train)))
    print('Overall Mean Test SSE: {}'.format(np.mean(SSE_test)))
    print('std of train SSE: {}'.format(np.std(np.array(SSE_train), axis=0)))
    print('std of test SSE: {}'.format(np.std(np.array(SSE_test), axis=0)))


# def sklearn_linear_regression(dataset, n_folds):
#     # split dataset into training and testing

#     X_train = X[:-20, :]
#     X_test = X[-20:, :]

#     Y_train = Y[:-20, None]
#     Y_test = Y[-20:, None]

#     # Linear regression
#     regressor = linear_model.LinearRegression()
#     regressor.fit(X_train, Y_train)
#     print('Coefficients: {}'.format(regressor.coef_))
#     print('Intercept: {}'.format(regressor.intercept_))
#     print('MSE:{}'.format(np.mean((regressor.predict(X_test) - Y_test) ** 2)))


def main():
    """Run the cross-validated regression on the housing, yacht and concrete CSV files.

    For each experiment the file is loaded and converted to float, a header
    line is printed, ``linear_regression`` is run with 10 folds and power 1,
    and a blank line separates the reports.
    """
    experiments = (
        ('Housing dataset Linear Regression', "housing.csv",
         dict(lr=0.0004, max_iter=1000, tolerance=0.005)),
        ('Yacht dataset Linear Regression', "yachtData.csv",
         dict(lr=0.001, max_iter=1000, tolerance=0.001)),
        ('Concrete dataset Linear Regression', "concreteData.csv",
         dict(lr=0.0007, max_iter=1000, tolerance=0.0001)),
    )

    for header, csv_name, solver_args in experiments:
        dataset = load_dataset(csv_name).astype(float)
        print(header)
        linear_regression(dataset, n_folds=10, power=1, **solver_args)
        print('')


#    print('sklearn Linear Regression Example')
#    sklearn_linear_regression(dataset, n_folds=10)

# Script entry point. In a notebook kernel __name__ is also "__main__", so
# executing this cell runs main() immediately.
if __name__ == "__main__":
    main()

Housing dataset Linear Regression
Train RMSE: 4.80009547819
Test RMSE: 3.7543550955
Train RMSE: 4.62072294055
Test RMSE: 5.48236539308
Train RMSE: 4.6050940701
Test RMSE: 5.69388475359
Train RMSE: 4.55196857585
Test RMSE: 5.97710794646
Train RMSE: 4.72833364322
Test RMSE: 4.59546524217
Train RMSE: 4.65704941198
Test RMSE: 5.15721097804
Train RMSE: 4.65375192764
Test RMSE: 5.2363752184
Train RMSE: 4.75917245416
Test RMSE: 4.20131756538
Train RMSE: 4.73666553542
Test RMSE: 4.40507279861
Train RMSE: 4.762078298
Test RMSE: 4.20411316236
Overall Mean Train RMSE: 4.68749323351
Overall Mean Test RMSE: 4.87072681536
Overall Mean Train SSE: 9890.32834051
Overall Mean Test SSE: 1210.81354949
std of train SSE: 323.863180461
std of test SSE: 343.107909111

Yacht dataset Linear Regression
Train RMSE: 9.01954505127
Test RMSE: 7.95523199933
Train RMSE: 8.8198033178
Test RMSE: 9.82110838563
Train RMSE: 8.91119839628
Test RMSE: 9.0183455602
Train RMSE: 8.64581160997
Test RMSE: 11.1964980526
Train RMSE:

In [45]:
# Same cross-validated pipeline on the sine training data. The header now names
# the file that is actually loaded (it previously said "Concrete").
dataset = load_dataset("sinData_Train.csv")
dataset = dataset.astype(float)
print('Sin dataset Linear Regression')
linear_regression(dataset, n_folds=10, lr=0.0007, max_iter=1000, tolerance=0.0001, power=1)

Concrete dataset Linear Regression
Train RMSE: 4.9769080253
Test RMSE: 3.96602822622
Train RMSE: 4.72693626786
Test RMSE: 6.13234759156
Train RMSE: 4.85088743215
Test RMSE: 5.55525182328
Train RMSE: 4.85630339254
Test RMSE: 5.16539524127
Train RMSE: 4.99409416884
Test RMSE: 3.80345854118
Train RMSE: 4.83615662675
Test RMSE: 5.3729712614
Train RMSE: 4.88198999256
Test RMSE: 4.95386858338
Train RMSE: 4.91361748667
Test RMSE: 4.68165749362
Train RMSE: 4.85565594259
Test RMSE: 5.2407223289
Train RMSE: 4.87273472309
Test RMSE: 5.03706613166
Overall Mean Train RMSE: 4.87652840583
Overall Mean Test RMSE: 4.99087672225
Overall Mean Train SSE: 2140.70508347
Overall Mean Test SSE: 253.508251316
std of train SSE: 62.4870327665
std of test SSE: 65.0111709395


In [35]:
# Tiny 2x2 integer matrix used below to sanity-check x_power_with_ones().
aa = np.array([[1, 3], [2, 5]])
aa

array([[1, 3],
       [2, 5]])

In [36]:
# Expect one bias column of ones followed by powers 1..3 of each input column.
x_power_with_ones(aa, 3)

array([[   1.,    1.,    1.,    1.,    3.,    9.,   27.],
       [   1.,    2.,    4.,    8.,    5.,   25.,  125.]])