In [2]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import json
import pandas as pd

In [3]:
data = sio.loadmat("boston.mat")

In [4]:
data["boston"]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 3.9690e+02, 4.9800e+00,
        2.4000e+01],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 3.9690e+02, 9.1400e+00,
        2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 3.9283e+02, 4.0300e+00,
        3.4700e+01],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 5.6400e+00,
        2.3900e+01],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 3.9345e+02, 6.4800e+00,
        2.2000e+01],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 7.8800e+00,
        1.1900e+01]])

In [5]:
df = pd.DataFrame.from_records(data["boston"], columns=["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "BLACK", "LSTAT", "MEDV"])

In [29]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=1/3)

In [33]:
def calculate_mse(predicted, output):
    total = 0
    for y1, y2 in zip(predicted, output):
        total += ((y1 - y2) ** 2)
    return total / len(predicted)

def naive_regression(train_set, test_set):
    train_ones = np.ones((len(train_set), len(train_set)))
    test_ones = np.ones((len(test_set), len(test_set)))
    train_mse = calculate_mse(
        np.linalg.lstsq(train_ones, train_set["MEDV"])[0],
        list(train_set["MEDV"])
    )
    test_mse = calculate_mse(
        np.linalg.lstsq(test_ones, test_set["MEDV"])[0],
        list(test_set["MEDV"])
    )
    return train_mse, test_mse

In [34]:
train_mse_ave, test_mse_ave = 0, 0
for _ in range(20):
    train, test = train_test_split(df, test_size=1/3)
    train_mse, test_mse = naive_regression(train, test)
    train_mse_ave += train_mse
    test_mse_ave += test_mse
print(train_mse_ave / 20, test_mse_ave /20)



3.809748792384099e+26 3.5368090666860514e+27


In [35]:
# f = c is the average of all the output values so that mse is reduced

In [108]:
### single regression

In [96]:
from collections import defaultdict

def get_prediction(weights, xs):
    predicted = []
    for x in xs:
        predicted.append(x * weights[0] + weights[1])
    return predicted

def single_attr_regression():
    train_mses = defaultdict(int)
    test_mses = defaultdict(int)
    for attr in df.columns[:-1]:
        for _ in range(20):
            train, test = train_test_split(df, test_size=1/3)
            xs = train[attr]
            ys = train["MEDV"]
            feature_matrix = []
            for attr_val in xs:
                feature_matrix.append([attr_val, 1])
            weight = np.linalg.lstsq(feature_matrix, ys)[0]
            train_mses[attr] += calculate_mse(get_prediction(weight, xs), ys)
            test_mses[attr] += calculate_mse(get_prediction(weight, test[attr]), test["MEDV"])
    for key in train_mses:
        train_mses[key] /= 20
        test_mses[key] /= 20
    return train_mses, test_mses

In [97]:
single_attr_regression()



(defaultdict(int,
             {'AGE': 73.88134430479434,
              'BLACK': 73.68884487908194,
              'CHAS': 81.16463035498536,
              'CRIM': 71.13968241182597,
              'DIS': 78.85716721087809,
              'INDUS': 64.47064841447379,
              'LSTAT': 38.44191281643897,
              'NOX': 68.81917327442122,
              'PTRATIO': 62.73194016874686,
              'RAD': 72.34832512917372,
              'RM': 43.78222803786089,
              'TAX': 65.5366146679282,
              'ZN': 73.11604863223906}),
 defaultdict(int,
             {'AGE': 70.01600700106209,
              'BLACK': 78.03805991786986,
              'CHAS': 83.86330112802588,
              'CRIM': 73.85021859717854,
              'DIS': 80.20163857248215,
              'INDUS': 65.39468687518753,
              'LSTAT': 38.89767608243805,
              'NOX': 69.78392360937791,
              'PTRATIO': 62.89616046358227,
              'RAD': 72.08181955662278,
              'RM': 4

In [145]:
def get_predicted_for_mult(weight, xs):
    predicted = []
    for _, x in xs.iterrows():
        total = 0.0
        x = x.tolist()
        for w, x_ in zip(weight, x):
            total += x_ * w
        predicted.append(total)
    return predicted

def multi_param_linear_regression():
    train_mse, test_mse = 0, 0
    for _ in range(1):
        train, test = train_test_split(df, test_size=1/3)
        ys = train["MEDV"]
        xs = train.drop(["MEDV"], axis=1)
        xs["bias"] = np.ones(len(xs))
        weight = np.linalg.lstsq(xs, ys)[0]
        predicted = get_predicted_for_mult(weight, xs)
        train_mse += calculate_mse(predicted, ys)
        
        # test the model again test set
        ys = test["MEDV"]
        xs = test.drop(["MEDV"], axis=1)
        xs["bias"] = np.ones(len(xs))
        predicted = get_predicted_for_mult(weight, xs)
        test_mse += calculate_mse(predicted, ys)
    return train_mse / 20, test_mse / 20
multi_param_linear_regression()



(1.0731778701751797, 1.1711125486816631)