### Linear Regression

In [11]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the housing dataset from disk and preview its first rows
file_path = 'HousingData.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [5]:
# Drop every row that contains at least one missing value.
# Reassigning the result (rather than using inplace=True) avoids mutating the
# loaded frame in place and keeps the cell safe to re-run.
data = data.dropna()

In [6]:
# Split the frame into the feature matrix (every column except the target)
# and the target vector
target_column = 'MEDV'
X = data.drop(columns=target_column).values
y = data[target_column].values

In [7]:
# Standardize every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split is made, so test-set statistics influence the scaling
# (data leakage). Consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [9]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Shapes in order: training features, test features, training targets, test targets
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((315, 13), (79, 13), (315,), (79,))

The split gives us:
- 315 training samples
- 79 testing samples

Each sample has 13 features.

In [13]:
# compute the Mean Squared Error
def compute_mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    # Coerce to float arrays so plain Python lists are accepted and integer
    # inputs (notably unsigned dtypes) cannot wrap around during subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

In [14]:
# perform gradient descent
def gradient_descent(X, y, w, b, learning_rate, epochs):
    """Minimize the mean squared error of a linear model by batch gradient descent.

    Parameters
    ----------
    X : ndarray
        Feature matrix; ``np.dot(X, w)`` must yield one prediction per target.
    y : ndarray
        Target values.
    w : array-like
        Initial weights. The caller's array is left untouched.
    b : float
        Initial bias.
    learning_rate : float
        Step size applied to each gradient update.
    epochs : int
        Number of full passes over the data.

    Returns
    -------
    tuple
        ``(w, b)`` after ``epochs`` updates.
    """
    n = len(y)
    # Work on a float copy: the caller's initial weights are never mutated, and
    # integer-typed initial weights no longer fail on the float update.
    w = np.array(w, dtype=float)
    for epoch in range(epochs):
        y_pred = np.dot(X, w) + b
        residual = y - y_pred
        # Gradients of the MSE with respect to the weights and the bias
        dw = -(2 / n) * np.dot(X.T, residual)
        db = -(2 / n) * np.sum(residual)
        # Rebind instead of updating in place so no input object is modified
        w = w - learning_rate * dw
        b = b - learning_rate * db
        if epoch % 100 == 0:
            # The reported MSE uses the predictions made before this epoch's update
            mse = compute_mse(y, y_pred)
            print(f'Epoch {epoch}: MSE = {mse}')
    return w, b

In [15]:
# Main Linear Regression function
def linear_regression(X, y, learning_rate=0.01, epochs=1000):
    """Fit a linear model with gradient descent, starting from zero parameters.

    The weight vector gets one zero entry per column of ``X`` and the bias
    starts at 0; both are then handed to ``gradient_descent``.

    Returns
    -------
    tuple
        The ``(weights, bias)`` pair produced by ``gradient_descent``.
    """
    initial_weights = np.zeros(X.shape[1])
    initial_bias = 0
    weights, bias = gradient_descent(
        X, y, initial_weights, initial_bias, learning_rate, epochs
    )
    return weights, bias

In [None]:
# Fit the linear model on the training split with the hyperparameters below
learning_rate, epochs = 0.01, 1000
w, b = linear_regression(
    X_train,
    y_train,
    learning_rate=learning_rate,
    epochs=epochs,
)


In [None]:
# Prediction
def predict(X, w, b):
    """Return the linear model's output ``np.dot(X, w) + b`` for the inputs ``X``."""
    weighted_sum = np.dot(X, w)
    return weighted_sum + b

In [12]:
# Score the fitted parameters on the held-out test split
y_pred = predict(X_test, w, b)
mse_test = compute_mse(y_true=y_test, y_pred=y_pred)
print(f'Test MSE: {mse_test}')

Epoch 0: MSE = 566.1774920634922
Epoch 100: MSE = 27.145261340532247
Epoch 200: MSE = 17.68102392100222
Epoch 300: MSE = 17.1909172400336
Epoch 400: MSE = 17.00683016346267
Epoch 500: MSE = 16.901868050866195
Epoch 600: MSE = 16.83782009877636
Epoch 700: MSE = 16.79664967413214
Epoch 800: MSE = 16.76891739784327
Epoch 900: MSE = 16.74949525207982
Test MSE: 31.77943012544064


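The test Mean Squared Error is about 31.78, while the training MSE had dropped to about 16.75 by epoch 900. The model has learned to fit the training data, but the noticeably higher test error shows that it generalizes less well to unseen samples.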