In [36]:
import numpy as np
import scipy.io as scio
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Read the Boston housing table stored under the 'boston' key of the .mat file
# and wrap it in a DataFrame with one named column per variable.
data_path = "boston.mat"
mat_contents = scio.loadmat(data_path)
data = np.array(mat_contents['boston'])
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
df = pd.DataFrame(data, columns=column_names)

# Candidate values for the ridge parameter: 2**-40, 2**-39, ..., 2**-26 (15 values).
gamma = [2**exponent for exponent in range(-40, -25)]

# Candidate values for the kernel width: 2**7, 2**7.5, ..., 2**13 (13 values).
sigma = [2**(7 + half_step*0.5) for half_step in range(0, 13)]

# Variables in order:
#  CRIM     per capita crime rate by town
#  ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
#  INDUS    proportion of non-retail business acres per town
#  CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#  NOX      nitric oxides concentration (parts per 10 million)
#  RM       average number of rooms per dwelling
#  AGE      proportion of owner-occupied units built prior to 1940
#  DIS      weighted distances to five Boston employment centres
#  RAD      index of accessibility to radial highways
#  TAX      full-value property-tax rate per $10,000
#  PTRATIO  pupil-teacher ratio by town
#  B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#  LSTAT    % lower status of the population
#  MEDV     Median value of owner-occupied homes in $1000's
  

In [37]:
def Gaussian_kernel_matrix(x_train, sigma):# arguments: array and scalar
    """Build the symmetric kernel matrix for every pair of samples in x_train.

    Entry (i, j) is exp(-||x_i - x_j|| / (2 * sigma)), where ||.|| is the
    Euclidean norm of the difference of samples i and j; the diagonal is 1.

    NOTE(review): the textbook Gaussian (RBF) kernel uses the squared norm and
    sigma**2, i.e. exp(-||x_i - x_j||**2 / (2 * sigma**2)). The formula is
    deliberately left unchanged here so that it keeps matching the kernel
    evaluated at prediction time -- confirm which form is intended.

    Parameters
    ----------
    x_train : array-like
        Samples indexed along the first axis (one sample per row).
    sigma : scalar
        Kernel width.

    Returns
    -------
    numpy.matrix of shape (l, l), with l = len(x_train).
    """
    l = len(x_train)
    if l == 0:
        return np.asmatrix(np.zeros((0, 0)))

    # One flattened feature vector per sample, so the pairwise differences
    # can be formed by broadcasting instead of a Python double loop.
    points = np.asarray(x_train)
    points = points.reshape(l, points.size // l)
    differences = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    distances = np.linalg.norm(differences, axis=-1)

    K = np.exp(-distances / (2 * sigma))
    # The diagonal is 1 by definition, whatever the value of sigma.
    np.fill_diagonal(K, 1.0)
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(K)

         CRIM    ZN  INDUS  CHAS     NOX     RM    AGE      DIS   RAD    TAX  \
425  15.86030   0.0  18.10   0.0  0.6790  5.896   95.4   1.9096  24.0  666.0   
280   0.03578  20.0   3.33   0.0  0.4429  7.820   64.5   4.6947   5.0  216.0   
422  12.04820   0.0  18.10   0.0  0.6140  5.648   87.6   1.9512  24.0  666.0   
504   0.10959   0.0  11.93   0.0  0.5730  6.794   89.3   2.3889   1.0  273.0   
216   0.04560   0.0  13.89   1.0  0.5500  5.888   56.0   3.1121   5.0  276.0   
88    0.05660   0.0   3.41   0.0  0.4890  7.007   86.3   3.4217   2.0  270.0   
258   0.66351  20.0   3.97   0.0  0.6470  7.333  100.0   1.8946   5.0  264.0   
167   1.80028   0.0  19.58   0.0  0.6050  5.877   79.2   2.4259   5.0  403.0   
197   0.04666  80.0   1.52   0.0  0.4040  7.107   36.6   7.3090   2.0  329.0   
34    1.61282   0.0   8.14   0.0  0.5380  6.096   96.9   3.7598   4.0  307.0   
184   0.08308   0.0   2.46   0.0  0.4880  5.604   89.8   2.9879   3.0  193.0   
343   0.02543  55.0   3.78   0.0  0.4840

In [None]:
def Gaussian_kernel(Xi,Xj,sigma): # arguments: arrays and scalar
    """Evaluate the kernel between two samples.

    Returns exp(-||Xi - Xj|| / (2 * sigma)), where ||.|| is the Euclidean norm
    computed by np.linalg.norm.

    NOTE(review): the textbook Gaussian (RBF) kernel squares both the norm and
    sigma; this function uses neither square -- confirm that is intended.
    """
    distance = np.linalg.norm(Xi - Xj)
    scale = 2 * sigma
    return np.exp(-distance / scale)

In [None]:
def alpha_star(K, gamma, y): # arguments: matrix, scalar and array
    """Compute the dual coefficients of kernel ridge regression.

    Solves the linear system (K + gamma * l * I) a = y, where l = len(K) and
    I is the l-by-l identity.

    Parameters
    ----------
    K : (l, l) numpy.matrix or ndarray
        Kernel matrix of the training samples.
    gamma : scalar
        Ridge (regularisation) parameter.
    y : array-like
        Training targets, as an (l, 1) column or a 1-D sequence of length l.

    Returns
    -------
    numpy.matrix with one row per training sample, so callers can index the
    coefficients as a[j, 0].

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix is singular.
    """
    l = len(K)
    regularised = np.asarray(K) + gamma * l * np.identity(l)
    targets = np.asarray(y)
    if targets.ndim == 1:
        # Treat a flat target vector as a single column.
        targets = targets[:, np.newaxis]
    # Solving the system directly is cheaper and numerically safer than
    # forming the explicit inverse, which matters for the tiny gamma values
    # used here. np.asmatrix replaces np.mat, removed in NumPy 2.0.
    a = np.asmatrix(np.linalg.solve(regularised, targets))
    return a

In [None]:
def Cross_validation_parameter_search(X_train, Y_train, gamma, sigma):
    """Select the (gamma, sigma) pair with the lowest 5-fold cross-validation error.

    For every candidate pair the regressor is fitted on the training part of
    each fold (Gaussian_kernel_matrix + alpha_star) and used to predict the
    held-out part. The fold error is the Euclidean norm of the residual vector
    (i.e. the square root of the sum of squared errors); fold errors are summed
    per pair and the pair with the smallest total wins. On ties the first pair
    in gamma-major order is kept.

    Parameters
    ----------
    X_train : ndarray
        Training samples, one per row.
    Y_train : ndarray
        Training targets as an (n, 1) column, aligned with X_train.
    gamma : sequence of scalars
        Candidate ridge parameters.
    sigma : sequence of scalars
        Candidate kernel widths.

    Returns
    -------
    (gamma_star, sigma_star) : the selected candidates.

    Raises
    ------
    ValueError
        If either candidate list is empty.
    """
    if len(gamma) == 0 or len(sigma) == 0:
        raise ValueError("gamma and sigma must each contain at least one candidate")

    kf = KFold(n_splits=5)
    # KFold without shuffling is deterministic, so the folds are computed once
    # and reused for every parameter pair.
    folds = list(kf.split(X_train))

    sum_SSE = np.zeros((len(gamma), len(sigma)))
    for p2 in range(len(sigma)):
        for train_index, test_index in folds:
            x_train, x_test = X_train[train_index], X_train[test_index]
            y_train, y_test = Y_train[train_index], Y_train[test_index]

            # Both kernels depend only on sigma and the fold, not on gamma,
            # so they are evaluated once here rather than once per gamma.
            K = Gaussian_kernel_matrix(x_train, sigma[p2])
            # Row i holds the kernel between test sample i and EVERY training
            # sample: a prediction sums over all training points (the original
            # loop stopped after len(x_test) of them).
            K_test = np.array([
                [Gaussian_kernel(x_train[j], x_test[i], sigma[p2])
                 for j in range(len(x_train))]
                for i in range(len(x_test))
            ])

            for p1 in range(len(gamma)):
                a = alpha_star(K, gamma[p1], y_train)
                y_predicted = np.dot(K_test, np.asarray(a))
                sum_SSE[p1, p2] += np.linalg.norm(y_predicted - y_test)

    # Scan in gamma-major order with a strict comparison so the first minimum
    # encountered is the one returned.
    best_error = None
    for p1 in range(len(gamma)):
        for p2 in range(len(sigma)):
            if best_error is None or sum_SSE[p1, p2] < best_error:
                best_error = sum_SSE[p1, p2]
                gamma_star, sigma_star = gamma[p1], sigma[p2]

    return gamma_star, sigma_star

In [None]:
def split_into_train_test(df, test_size=0.33, random_state=None):
    """Randomly split the data frame into training and test arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 13 feature columns listed below and the 'MEDV'
        target column.
    test_size : float, optional
        Passed to train_test_split; defaults to 0.33 as before.
    random_state : int or None, optional
        Passed to train_test_split. Give an integer to make the split
        reproducible; the default None keeps the previous unseeded behaviour.

    Returns
    -------
    X_train, Y_train, X_test, Y_test : ndarrays
        Feature arrays have one column per feature; target arrays are 2-D
        with a single 'MEDV' column.
    """
    feature_columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                       'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    target_columns = ['MEDV']

    train, test = train_test_split(df, test_size=test_size,
                                   random_state=random_state)

    # Convert data-frames into arrays. The row index is discarded by the
    # conversion, so no in-place reset_index on the split frames is needed.
    X_train = np.array(train[feature_columns])
    Y_train = np.array(train[target_columns])
    X_test = np.array(test[feature_columns])
    Y_test = np.array(test[target_columns])
    return X_train, Y_train, X_test, Y_test

In [None]:
# main: split the data, select hyper-parameters by cross-validation, fit on
# the full training set and predict the test targets.
X_train, Y_train, X_test, Y_test = split_into_train_test(df)

gamma_star, sigma_star = Cross_validation_parameter_search(X_train, Y_train, gamma, sigma)
# gamma_star = 2**-25, sigma_star = 2**7
# NOTE(review): the line above is kept from the original; presumably values
# noted from an earlier run -- confirm.

# Fit: kernel matrix of the training samples and the dual coefficients.
K = Gaussian_kernel_matrix(X_train, sigma_star)
a = alpha_star(K, gamma_star, Y_train)

# Predict: each test value is the coefficient-weighted sum of the kernel
# between that test sample and every training sample.
y_predicted = np.zeros((len(Y_test), 1))
for i in range(len(Y_test)):
    y_predicted[i, 0] = sum(
        a[j, 0] * Gaussian_kernel(X_train[j], X_test[i], sigma_star)
        for j in range(len(X_train))
    )