#### Import Libraries

In [1]:
import pandas as pd
import random
import numpy as np

#### Download the data

In [2]:
# Location of the filtered Boston CSV; the frame is loaded straight from this URL.
BOSTON_CSV_URL = 'http://www0.cs.ucl.ac.uk/staff/M.Herbster/boston-filter/Boston-filtered.csv'
df = pd.read_csv(BOSTON_CSV_URL)  # keep the whole file as a DataFrame

In [3]:
n = len(df)  # total number of rows in the data
train_size = round(2 * n / 3)  # two thirds of the rows are used for training
test_size = n - train_size  # the remaining third is used for testing

#### The function below takes a DataFrame and a training-set size and randomly splits the rows into a training part and a test part (here 2/3 and 1/3)

In [4]:
def sample_gen(data,train_size): #input data and train size
    """Randomly split the rows of ``data`` into a training and a test frame.

    The split is drawn from NumPy's global random generator, so calling
    ``np.random.seed(...)`` once before a series of calls makes the whole
    series repeatable. The generator is deliberately NOT reseeded here:
    reseeding on every call would return the same split every time and make
    repeated runs pointless.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are split.
    train_size : int
        Number of rows to put in the training frame.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` holds ``train_size`` rows in random order; ``test`` holds
        the remaining rows in their original order.

    Raises
    ------
    ValueError
        If ``train_size`` is negative or larger than the number of rows.
    """
    n = len(data) # number of rows
    if not 0 <= train_size <= n:
        raise ValueError("train_size must be between 0 and the number of rows in data")

    shuffled = np.random.permutation(n) # random ordering of all row positions
    train_ind = shuffled[:train_size] # first train_size positions go to training
    test_ind = sorted(shuffled[train_size:]) # everything else is test, kept in original row order
    return data.iloc[train_ind,:] , data.iloc[test_ind, : ] #generate train and test data
train, test = sample_gen(data=df, train_size=train_size)  # one initial train/test split of the full frame

In [5]:
# Naive regression: fit a constant (a single column of ones) to MEDV over 20
# random train/test splits and record the train MSE, test MSE and fitted
# constant of every run.
np.random.seed(20)  # seeds NumPy's global generator; the splits are only repeatable if sample_gen draws from np.random

mse_train = []  # training-set MSE of each run
mse_test = []   # test-set MSE of each run
bias = []       # fitted constant of each run
for i in range(0,20):
    train , test = sample_gen(df,train_size) # fresh random split for this run
    y_train = train[['MEDV']].copy() # dependent column of training set
    y_test  = test[['MEDV']].copy() # dependent column of testing set

    # The design matrices are a single column of ones. Their sizes come from
    # the split itself rather than from the global train_size/test_size.
    X = np.ones((len(train), 1))
    X_test = np.ones((len(test), 1))
    Y = y_train.values # target as an (n, 1) array

    # Least-squares solution of X w = Y; avoids forming and inverting X^T X.
    weight = np.linalg.lstsq(X, Y, rcond=None)[0] # shape (1, 1)

    y_train['pred'] = (X @ weight).ravel() # prediction on training data
    y_test['pred'] = (X_test @ weight).ravel() # prediction on testing data

    mse_train.append(np.mean((y_train['MEDV']-y_train['pred'])**2)) # MSE of training data
    mse_test.append(np.mean((y_test['MEDV']-y_test['pred'])**2)) # MSE of testing data
    bias.append(weight[0, 0]) # fitted constant as a scalar
    


In [6]:
# One-row summary of the naive regression: averages over all recorded runs.
summary_columns = ["Model", "MSE train mean", "MSE test mean", 'Bias', 'Weights']
summary_values = [
    'Naive Regression',
    np.mean(mse_train),
    np.mean(mse_test),
    np.mean(bias),
    'NA',  # the naive model has no feature weights
]
data = {column: [value] for column, value in zip(summary_columns, summary_values)}
results = pd.DataFrame(data)
results # result summarised in data frame

Unnamed: 0,Model,MSE train mean,MSE test mean,Bias,Weights
0,Naive Regression,83.85424,85.757485,22.413086,


In [7]:
# Linear regression with a bias term on one attribute at a time: for every
# feature, fit on 20 random splits and append one summary row (mean train MSE,
# mean test MSE, mean bias, mean weight) to `results`.
# NOTE: this cell appends to `results`, so re-running it without first
# re-running the cell that creates `results` adds the rows a second time.
np.random.seed(20)  # seeds NumPy's global generator; the splits are only repeatable if sample_gen draws from np.random

features = list(df.columns[:-1]) # all columns except last
train_size = round(2*len(df)/3) # train size
test_size = len(df)-train_size #test size
n = len(df) # length of data

rows = [] # one summary row per feature, added to `results` in a single concat below
for index,feature in enumerate(features): #iterate through index and values of features
    mse_train =[]
    mse_test =[]
    w=[]
    bias =[]
    for i in range(20): # repeat 20 times
        train , test = sample_gen(df,train_size) # fresh random split for this run
        X_train = train[[feature]].copy() # training data
        X_test =  test[[feature]].copy() # test data
        X_train.insert(loc = 0,column = 'bias',value = 1) # introduce bias column of 1's in train data
        X_test.insert(loc = 0,column = 'bias',value = 1) # introduce bias column of 1's in test  data
        y_train = train[['MEDV']].copy() #y train data
        y_test  = test[['MEDV']].copy() # y test data
        X = X_train.values
        Y = y_train.values

        # Least-squares solution of X w = Y; avoids forming and inverting X^T X.
        weight = np.linalg.lstsq(X, Y, rcond=None)[0] # shape (2, 1): [bias, slope]

        y_train['pred'] = (X @ weight).ravel() #prediction of training data
        y_test['pred'] = (X_test.values @ weight).ravel() #prediction of testing data
        mse_train.append(np.mean((y_train['MEDV']-y_train['pred'])**2)) # append MSE of train data
        mse_test.append(np.mean((y_test['MEDV']-y_test['pred'])**2)) # append MSE of test data
        bias.append(weight[0]) # append fitted bias term (1-element array)
        w.append(weight[1]) # append fitted feature weight (1-element array)
    rows.append(['Linear Regression Attribute '+str(index+1)+' ('+feature+')', np.mean(mse_train),  np.mean(mse_test),np.mean(bias),np.mean(w)])

new_df = pd.DataFrame(rows, columns=results.columns)
results = pd.concat([results, new_df], axis=0, ignore_index=True)
results # final results
   

Unnamed: 0,Model,MSE train mean,MSE test mean,Bias,Weights
0,Naive Regression,83.85424,85.757485,22.413086,
1,Linear Regression Attribute 1 (CRIM),71.753371,72.753983,24.073311,-0.4266
2,Linear Regression Attribute 2 ( ZN ),73.09337,74.495918,20.953716,0.138712
3,Linear Regression Attribute 3 (INDUS ),61.970392,70.614088,29.5935,-0.643551
4,Linear Regression Attribute 4 (CHAS),82.75619,80.516174,22.14895,6.229866
5,Linear Regression Attribute 5 (NOX),69.005939,69.450155,41.73108,-34.486055
6,Linear Regression Attribute 6 (RM),45.380517,40.391782,-33.980492,9.000418
7,Linear Regression Attribute 7 (AGE),71.702562,74.270703,30.921553,-0.121337
8,Linear Regression Attribute 8 (DIS),79.340185,79.146762,18.468042,1.087821
9,Linear Regression Attribute 9 (RAD),72.633918,71.411576,26.472865,-0.410863


In [8]:
# Linear regression with a bias term on all attributes at once: fit on 20
# random splits and append one summary row to `results`.
np.random.seed(20)  # seeds NumPy's global generator; the splits are only repeatable if sample_gen draws from np.random

# Start from empty accumulators. Without this reset the lists still hold the
# runs of whatever model was fitted last, and the means below would mix them in.
mse_train = []
mse_test = []
bias = []
for i in range(20):
    train , test = sample_gen(df,train_size) # fresh random split for this run
    X_train = train[features].copy()
    X_test =  test[features].copy()
    X_train.insert(loc = 0,column = 'bias',value = 1) # bias column of 1's in train data
    X_test.insert(loc = 0,column = 'bias',value = 1) # bias column of 1's in test data
    y_train = train[['MEDV']].copy()
    y_test  = test[['MEDV']].copy()
    X = X_train.values
    Y = y_train.values

    # Least-squares solution of X w = Y; avoids forming and inverting X^T X.
    weight = np.linalg.lstsq(X, Y, rcond=None)[0] # shape (len(features) + 1, 1), bias first
    y_train['pred'] = (X @ weight).ravel()
    y_test['pred'] = (X_test.values @ weight).ravel()
    mse_train.append(np.mean((y_train['MEDV']-y_train['pred'])**2))
    mse_test.append(np.mean((y_test['MEDV']-y_test['pred'])**2))
    bias.append(weight[0])
# NOTE(review): the label hard-codes 12 weights; confirm it matches len(features).
row = pd.Series(['Linear Regression (All Attributes)', np.mean(mse_train),  np.mean(mse_test),np.mean(bias),'Total 12 weights '], index=results.columns)
new_df = pd.DataFrame([row])
results = pd.concat([results, new_df], axis=0, ignore_index=True)
results

Unnamed: 0,Model,MSE train mean,MSE test mean,Bias,Weights
0,Naive Regression,83.85424,85.757485,22.413086,
1,Linear Regression Attribute 1 (CRIM),71.753371,72.753983,24.073311,-0.4266
2,Linear Regression Attribute 2 ( ZN ),73.09337,74.495918,20.953716,0.138712
3,Linear Regression Attribute 3 (INDUS ),61.970392,70.614088,29.5935,-0.643551
4,Linear Regression Attribute 4 (CHAS),82.75619,80.516174,22.14895,6.229866
5,Linear Regression Attribute 5 (NOX),69.005939,69.450155,41.73108,-34.486055
6,Linear Regression Attribute 6 (RM),45.380517,40.391782,-33.980492,9.000418
7,Linear Regression Attribute 7 (AGE),71.702562,74.270703,30.921553,-0.121337
8,Linear Regression Attribute 8 (DIS),79.340185,79.146762,18.468042,1.087821
9,Linear Regression Attribute 9 (RAD),72.633918,71.411576,26.472865,-0.410863


In [10]:
# Write the summary table, rounded to two decimals, without the row index.
rounded_results = results.round(2)
rounded_results.to_csv('Question4.csv', index=False)