In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Load the Boston housing data as a mapping with 'data', 'target' and
# 'feature_names' entries.
try:
    from sklearn.datasets import load_boston

    boston_dataset = load_boston()
except ImportError:
    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where
    # importing it raises ImportError. Rebuild the same arrays from the original
    # StatLib file, following the recipe in scikit-learn's deprecation notice.
    # NOTE: this fallback needs network access.
    raw_df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                         sep=r'\s+', skiprows=22, header=None)
    boston_dataset = {
        # Every record spans two physical lines: 11 feature values on the first,
        # then 2 more feature values followed by the target on the second.
        'data': np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        'target': raw_df.values[1::2, 2],
        'feature_names': np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    }

In [3]:
# Pull the feature matrix, the target vector and the column labels out of the
# dataset mapping; the labels are lower-cased before being used as column names.
X, y = boston_dataset['data'], boston_dataset['target']
feature_names = [column.lower() for column in boston_dataset['feature_names']]

In [4]:
# Keep features and target in a single frame so row-wise splits keep them aligned.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out exactly 100 rows (an integer test_size is a row count, not a fraction);
# random_state pins the shuffle so the split is reproducible.
train_val, test = train_test_split(df, test_size=100, random_state=42)

In [7]:
# Sanity check: (rows, columns) of the two partitions.
train_val.shape, test.shape

((406, 14), (100, 14))

In [8]:
# Preview the shuffled partition; the index still carries the row labels from df.
train_val.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target
86,0.05188,0.0,4.49,0.0,0.449,6.015,45.1,4.4272,3.0,247.0,18.5,395.99,12.86,22.5
75,0.09512,0.0,12.83,0.0,0.437,6.286,45.0,4.5026,5.0,398.0,18.7,383.23,8.94,21.4
477,15.0234,0.0,18.1,0.0,0.614,5.304,97.3,2.1007,24.0,666.0,20.2,349.48,24.91,12.0
15,0.62739,0.0,8.14,0.0,0.538,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47,19.9
332,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83,19.4


In [9]:
# Preview the held-out partition (the target column is still attached here).
test.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target
173,0.09178,0.0,4.05,0.0,0.51,6.416,84.1,2.6463,5.0,296.0,16.6,395.5,9.04,23.6
274,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.9,3.53,32.4
491,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07,13.6
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52,22.8
452,5.09017,0.0,18.1,0.0,0.713,6.297,91.8,2.3682,24.0,666.0,20.2,385.09,17.27,16.1


In [10]:
# Export the splits as CSV. train.csv holds the whole train_val partition
# including the target; test.csv holds the feature columns only.
# Create the output directory first: to_csv does not create missing parents.
os.makedirs('./data', exist_ok=True)
train_val.to_csv('./data/train.csv', index=False)
test[feature_names].to_csv('./data/test.csv', index=False)

In [11]:
# shuffle=False keeps the row order: the leading rows of train_val stay in train
# and the last 100 become the validation set, so no random_state is needed here.
train, val = train_test_split(train_val, test_size=100, shuffle=False)

In [12]:
# Sanity check: (rows, columns) of the training and validation partitions.
train.shape, val.shape

((306, 14), (100, 14))

In [13]:
# Same leading rows as train_val, because the unshuffled split preserved order.
train.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target
86,0.05188,0.0,4.49,0.0,0.449,6.015,45.1,4.4272,3.0,247.0,18.5,395.99,12.86,22.5
75,0.09512,0.0,12.83,0.0,0.437,6.286,45.0,4.5026,5.0,398.0,18.7,383.23,8.94,21.4
477,15.0234,0.0,18.1,0.0,0.614,5.304,97.3,2.1007,24.0,666.0,20.2,349.48,24.91,12.0
15,0.62739,0.0,8.14,0.0,0.538,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47,19.9
332,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83,19.4


In [14]:
# First rows of the validation partition.
val.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target
462,6.65492,0.0,18.1,0.0,0.713,6.317,83.0,2.7344,24.0,666.0,20.2,396.9,13.99,19.5
251,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59,24.8
494,0.27957,0.0,9.69,0.0,0.585,5.926,42.6,2.3817,6.0,391.0,19.2,396.9,13.59,24.5
464,7.83932,0.0,18.1,0.0,0.655,6.209,65.4,2.9634,24.0,666.0,20.2,396.9,13.22,21.4
303,0.1,34.0,6.09,0.0,0.433,6.982,17.7,5.4917,7.0,329.0,16.1,390.43,4.86,33.1


In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least squares with a bias term (fit_intercept=True is also the default).
model = LinearRegression(fit_intercept=True)
# Fit on the training partition only; val and test are kept for evaluation.
model.fit(train[feature_names], train.target)

LinearRegression()

In [16]:
def print_metrics(target, pred, dataset_name):
    """Print the MSE and R^2 of ``pred`` measured against ``target``.

    Args:
        target: Ground-truth values.
        pred: Predicted values, aligned element-wise with ``target``.
        dataset_name: Label placed at the start of each printed line.
    """
    mse = mean_squared_error(target, pred)
    print(f'{dataset_name} MSE: {mse}')
    r2 = r2_score(target, pred)
    print(f'{dataset_name} R^2: {r2}')

    
def eval_model(model, train, val, test, feature_names=feature_names, target_name='target'):
    """Print MSE and R^2 of ``model`` on the train, validation and test frames.

    Args:
        model: Fitted estimator exposing ``predict``.
        train: Frame reported under the label 'Train'.
        val: Frame reported under the label 'Validation'.
        test: Frame reported under the label 'Test'.
        feature_names: Columns passed to ``model.predict``. The default is the
            module-level ``feature_names`` as it was when this function was defined.
        target_name: Column holding the ground-truth values.
    """
    labelled_splits = (('Train', train), ('Validation', val), ('Test', test))
    for label, frame in labelled_splits:
        predictions = model.predict(frame[feature_names])
        print_metrics(frame[target_name], predictions, label)

In [17]:
# Report MSE and R^2 of the fitted model on each of the three partitions.
eval_model(model, train, val, test)

Train MSE: 21.453116655188396
Train R^2: 0.7644237898563074
Validation MSE: 22.535747337107104
Validation R^2: 0.6881599636505362
Test MSE: 24.48859175953373
Test R^2: 0.6725688936876795


In [18]:
# Read the predictions file (presumably written by the Scala implementation):
# it has no header row, and the values are flattened into a 1-D array.
scala_test_pred = (
    pd.read_csv('./data/predictions.csv', header=None)
    .values
    .ravel()
)

In [19]:
# Score the externally produced predictions against the held-out targets.
# NOTE(review): assumes predictions.csv rows are in the same order as test — confirm.
print_metrics(test['target'], scala_test_pred, 'Test')

Test MSE: 24.488591759533584
Test R^2: 0.6725688936876814


In [20]:
# Read the weights file (presumably written by the Scala implementation):
# it has no header row, and the values are flattened into a 1-D array.
scala_weights = (
    pd.read_csv('./data/weights.csv', header=None)
    .values
    .ravel()
)

In [21]:
# Display the loaded weight vector for comparison with the sklearn fit.
# NOTE(review): it has one more entry than model.coef_; the last entry looks like the intercept — confirm.
scala_weights

array([-1.08344326e-01,  3.67314948e-02, -2.94490067e-03,  2.31707646e+00,
       -1.57982855e+01,  4.26203177e+00,  3.73473767e-03, -1.52205641e+00,
        2.93579041e-01, -1.38676831e-02, -8.66920693e-01,  1.22967018e-02,
       -5.37287450e-01,  3.11595927e+01])

In [22]:
# Coefficients learned by the sklearn model, one per entry of feature_names.
model.coef_

array([-1.08344326e-01,  3.67314948e-02, -2.94490067e-03,  2.31707646e+00,
       -1.57982855e+01,  4.26203177e+00,  3.73473767e-03, -1.52205641e+00,
        2.93579041e-01, -1.38676831e-02, -8.66920693e-01,  1.22967018e-02,
       -5.37287450e-01])

In [23]:
# Bias term learned by the sklearn model.
model.intercept_

31.159592730426894