In [4]:
import pandas as pd
import numpy as np
from io import StringIO
import sys

sys.setrecursionlimit(10000)

In [5]:
column_names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS","RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]

#reads file and returns a pandas dataFrame
def read_data(file_path):
    data = []
    #read all data
    with open(file_path, 'r') as file:
        lines = file.readlines()
    #only start after line 22 --> where actual values start
    for line in lines[22:]:
        line = line.strip()
        line = line.split()
        array_of_floats = [float(val) for val in line]
        data.append(array_of_floats)
    #data is read with \n as delimiter, so merge such that we have a row of 14 columns
    merged_data = []
    for i in range(0, len(data), 2):
        merged_array = data[i] + data[i + 1] if i + 1 < len(data) else data[i]
        merged_data.append(merged_array)
    #once merged convert the 2D array into pandas dataframe
    df = pd.DataFrame(merged_data, columns=column_names)
    return df

In [9]:
def normalize(df, variables):
    length = len(variables) - 1
    for i in range(length):
        if variables[i] != 'bias':
            mean = df[variables[i]].mean()
            std = df[variables[i]].std()
            if mean != 0:
                df.loc[:, variables[i]] = (df[variables[i]] - mean)/std
    return df

def sum_squared_error(df, thetas):
    x = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    m = len(y)
    y_hat = hypothesis(df, thetas)
    squared_error = (y - y_hat) ** 2
    summed = np.sum(squared_error)
    return summed

def hypothesis(df, theta):
    x = df.iloc[:, :-1].values
    return np.dot(x, theta)

def cost_function(df, theta):
    x = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    m = len(y)
    
    y_hat = np.dot(x, theta)
    error = y - y_hat
    squared = np.square(error)
    summed = np.sum(squared)
    return summed / (2*m)

def differentiation(df, theta):
    x = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    m = len(y)
    
    y_hat = np.dot(x, theta)
    error = y_hat - y
    final = np.dot(x.T, error) / m
    print(f"shapes, x:{x.shape}, x.T:{x.T.shape}, error:{error.shape}")
    return final

def gradient_descent(df, theta, alpha, cost_difference):
    x = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    m = len(y)
    prev_cost = cost_function(df, theta)
    theta = theta - (alpha * differentiation(df, theta))
    current_cost = cost_function(df, theta)

    if(abs(prev_cost - current_cost) > cost_difference):
        return gradient_descent(df, theta, alpha, cost_difference)
    return theta 

def add_bias_to_df(df):
    df['bias'] = 1
    return df

In [10]:
file_path = "boston.txt"
alpha = 0.01
cost_difference = 0.00000001

In [11]:
variables = ['bias',"CRIM", "ZN", "INDUS", "CHAS", "RM", "AGE", "DIS","RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV", "NOX"]
thetas = np.zeros(len(variables) - 1)
#read data
df = read_data(file_path)
print(df.shape)
df = add_bias_to_df(df)
#split train/test
partial_data = df[variables]
train_partial_data = normalize(partial_data[0:456], variables)
test_partial_data = normalize(partial_data[456:], variables)
thetas = gradient_descent(train_partial_data, thetas, alpha, cost_difference)
squared_error = sum_squared_error(test_partial_data, thetas)
print(f"Squared_error:{squared_error}")

(506, 14)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:

shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14),

shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14), x.T:(14, 456), error:(456,)
shapes, x:(456, 14),

In [None]:
variables = ['bias',"DIS","RAD","NOX"]
thetas = np.zeros(len(variables) - 1)
#read data
df = read_data(file_path)
df = add_bias_to_df(df)
#split train/test
partial_data = df[variables]
train_partial_data = normalize(partial_data[0:456], variables)
test_partial_data = normalize(partial_data[456:], variables)
thetas = gradient_descent(train_partial_data, thetas, alpha, cost_difference)
squared_error = sum_squared_error(test_partial_data, thetas)
print(thetas)
print(f"Squared_error:{squared_error}")

In [573]:
variables = ['bias',"AGE","TAX","MEDV"]
thetas = np.zeros(len(variables) - 1)
#read data
df = read_data(file_path)
df = add_bias_to_df(df)
#split train/test
partial_data = df[variables]
train_partial_data = normalize(partial_data[0:456], variables)
test_partial_data = normalize(partial_data[456:], variables)
thetas = gradient_descent(train_partial_data, thetas, alpha, cost_difference)
squared_error = sum_squared_error(test_partial_data, thetas)
print(thetas)
print(f"Squared_error:{squared_error}")

[22.94098851 -1.52668354 -3.65470227]
Squared_error:1953.6771260168853


In [575]:
variables = ['bias',"CRIM", "ZN", "INDUS", "CHAS", "RM", "AGE", "DIS","RAD", "TAX", "PTRATIO", "B", "LSTAT","NOX" ,"MEDV"]
thetas = np.zeros(len(variables) - 1)
#read data
df = read_data(file_path)
df = add_bias_to_df(df)
#split train/test
partial_data = df[variables]
train_partial_data = normalize(partial_data[0:456], variables)
test_partial_data = normalize(partial_data[456:], variables)
thetas = gradient_descent(train_partial_data, thetas, alpha, cost_difference)
squared_error = sum_squared_error(test_partial_data, thetas)
print(f"Squared_error:{squared_error}")

Squared_error:2081.765178889304
