General Linear Regression Models (Simple, Multiple, Polynomial Linear Regression)

Simple Linear Regression (California Housing Dataset) : 

Imports: 

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split


Loading the Dataset

In [2]:
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
print(df.head())
#Find the Exploratory Data Analysis of the California Housing Dataset in the Data_Analysis Folder


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  


In [3]:
print("Column Names: ")
print(df.columns)

X = df['AveRooms']
y = df['MedInc']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Column Names: 
Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')


In [4]:
#Funtions for simple linear regression:

# Helper Function for MSE: 
def mean_squared_error(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)


# Helper Function to fit model:
def fit_linear_regression(X, y):
    X = np.c_[np.ones(X.shape[0]), X]
    theta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
    return theta[0], theta[1]

# Helper Function to make predictions: 
def predict(X, theta_0, theta_1):
    return theta_0 + theta_1 * X


# Helper function to get R2 score for model evaluation: 
def r2_score(y_true, y_pred):
    numerator = np.sum((y_true - y_pred) ** 2)
    denominator = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (numerator / denominator)


In [5]:
#Training the model
intercept, slope = fit_linear_regression(X_train, y_train)
print(f"Intercept: {intercept:.2f}, Slope: {slope:.2f}")


Intercept: 2.42, Slope: 0.27


In [6]:
#Predictions on the test set
y_pred = predict(X_test, intercept, slope)
#Calculate mean squared error and R-squared score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")


Mean Squared Error: 3.26
R-squared: 0.08


Multiple Linear Regression (Boston Dataset) : 

Imports: 

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import kaggle
import os


Loading Dataset: 

In [8]:
dataset = 'altavish/boston-housing-dataset'
kaggle.api.dataset_download_files(dataset, path='./', unzip=True)
df = pd.read_csv('HousingData.csv')
os.remove('./HousingData.csv')
df = df.dropna()


Dataset URL: https://www.kaggle.com/datasets/altavish/boston-housing-dataset


Preprocessing dataset: 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df, df.iloc[:,-1].astype(int), test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))


Defining helper functions

In [10]:
def train_multiple_regression(X, y):
    X_transpose = np.transpose(X)
    X_transpose_X = np.dot(X_transpose, X)
    X_transpose_y = np.dot(X_transpose, y)
    weights = np.linalg.inv(X_transpose_X).dot(X_transpose_y)
    return weights

def predict(X, weights):
    return np.dot(X, weights)


Training the Model: 

In [11]:
weights = train_multiple_regression(X_train, y_train)
y_pred = predict(X_test, weights)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print("Weights:", weights)


Mean Squared Error: 3.6533832508866917
Weights: [ 2.15714286e+01  2.40771956e-02  3.21029624e-03  8.47338438e-02
  2.62464556e-03 -3.94713386e-02  2.88832855e-02  1.20811162e-02
  9.86655622e-03  2.81568738e-02 -4.76075354e-02 -4.00220286e-02
  7.46278580e-03 -6.08202083e-03  9.08792200e+00]


Polynomial Regression (Airfoil Dataset): 

Imports: 

In [13]:
pip install ucimlrepo

Collecting ucimlrepo
  Using cached ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Using cached ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [119]:
from sklearn.model_selection import train_test_split
import pandas as pd


Ploynomial Regression Model (AirFoil Dataset)

Loading the Airfoil Dataset

In [120]:
# Load the Airfoil dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat"
column_names = ["Frequency", "Angle of Attack", "Chord Length", "Free-stream Velocity", "Suction Side Displacement Thickness", "Sound Pressure Level"]
df = pd.read_csv(url, sep="\t", header=None, names=column_names)
X = df.iloc[:, :-1].values  # Convert to numpy array
y = df.iloc[:, -1].values   # Convert to numpy array
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Defining functions

In [121]:

# Function to create polynomial features
def create_polynomial_features(X, degree):
    n_features = X.shape
    X_polynomial = X.copy()
    
    for i in range(2, degree + 1):
        for j in range(n_features):
            X_polynomial = np.hstack((X_polynomial, (X[:, j] ** i).reshape(-1, 1)))
    
    return X_polynomial

# Function to fit polynomial regression
def fit_polynomial_regression(X, y, degree):
    X_polynomial = create_polynomial_features(X, degree)
    #Adding intercept form to the polynomial
    X_polynomial = np.hstack((np.ones((X_polynomial.shape[0], 1)), X_polynomial))
    '''Coefficients are important to get the best fit equation
    These coefficients are given by normal function :
    Theta (⊝) = ((X^T)*(y))/(X^T * X) ------ ($)
    '''
    coefficients = np.linalg.inv(X_polynomial.T @ X_polynomial) @ X_polynomial.T @ y
    '''The np.linalg.inv computes inverse of a matrix
    The @ is the multiplication operator in python after Python 3.5
    (I here used Python 3.12.3)
    X_polynomial.T @ X_polynomial give the dot product of Transpose of 
    X_polynomial and X_polynomial'''
    #For a better understanding read the equation in marked with '$' from left to right
    # and the line of code corresponding to this equation marked with '&' from right to left
    return coefficients

# Function to predict using polynomial regression
def predict_polynomial_regression(X, coefficients, degree):
    X_polynomial = create_polynomial_features(X, degree)
    #Adding intercept to the polynomial created in the above line of code
    X_polynomial = np.hstack((np.ones((X_polynomial.shape[0], 1)), X_polynomial))
    
    return X_polynomial @ coefficients

# Function to calculate mean squared error
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)


Fiting the Polynomial Regression Model: 

In [122]:
# Normalize the features
X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)

# Fit the polynomial regression model
degree = 2
coefficients = fit_polynomial_regression(X_train, y_train, degree)

# Predict using the polynomial regression model
predictions = predict_polynomial_regression(X_test, coefficients, degree)

# Calculate mean squared error
error = mse(y_test, predictions)
print(f"The mean squared error associated with fitting the polynomial regression model was found to be {error}")

The mean squared error associated with fitting the polynomial regression model was found to be 21.922857179735153
