In [2]:
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Explanation of the Linear Regression Implementation:
**Loading the Dataset:**
The mpg dataset is loaded, and we extract the horsepower as the input feature (X) and mpg as the target (y). Missing values are dropped.

**Train-Test Split:**
The data is split into training and testing sets, with 80% of the data used for training and 20% for testing.

**Linear Regression:**
The linear regression model is implemented using the equation $y_{\text{pred}} = \theta_0 + \theta_1 \times \text{horsepower}$. Coefficients are initialized arbitrarily, and the cost functions (MSE and R²) are computed.

**Model Evaluation:**
MSE and R² are calculated on both the training and test sets to assess the model's performance.

**Visualization:**
A scatter plot of the actual mpg values vs. horsepower is created, with the predicted values plotted as a line.

In [3]:
# Load the Auto MPG dataset through seaborn's dataset loader.
# NOTE(review): presumably a pandas DataFrame (later cells index it by column name) — confirm.
autompg = sns.load_dataset('mpg')

In [4]:
# Extract 'horsepower' as the feature (X) and 'mpg' as the target (y).
# Rows with a missing value in either column are dropped first: a single NaN in
# the inputs would propagate through the predictions and make MSE and R² NaN.
mpg_hp = autompg[['horsepower', 'mpg']].dropna()
X = mpg_hp['horsepower'].values
y = mpg_hp['mpg'].values

In [5]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear Regression Function
def linear_regression(X, y, cost_func='mse'):
    """Score a fixed-coefficient, single-feature linear model on (X, y).

    No fitting is performed: the intercept and slope are hard-coded, so the
    returned metric describes how well that preset line matches the data.

    Parameters
    ----------
    X : array-like
        Feature values; flattened into a single column of shape (n_samples, 1).
    y : array-like of shape (n_samples,)
        Observed target values.
    cost_func : {'mse', 'r_squared'}, default 'mse'
        Which metric to compute between ``y`` and the predictions.

    Returns
    -------
    cost : float
        Value of the selected metric.
    y_pred : numpy.ndarray of shape (n_samples,)
        Predicted targets, ``theta[0] + theta[1] * X``.

    Raises
    ------
    ValueError
        If ``cost_func`` is not one of the supported metric names.
    """
    # Convert X into a 2D column so it can be combined with the bias column.
    X = np.array(X).reshape(-1, 1)
    # Prepend a column of 1's so theta[0] acts as the intercept.
    X_b = np.c_[np.ones((len(X), 1)), X]

    # Preset coefficients: [intercept, slope].
    theta = np.array([1, 0.5])

    # Matrix-vector product: each row [1, x] dotted with theta gives one prediction.
    y_pred = X_b.dot(theta)

    if cost_func == 'mse':
        cost = mse(y, y_pred)
    elif cost_func == 'r_squared':
        cost = r_squared(y, y_pred)
    else:
        # Fail loudly instead of silently returning nothing for a typo'd metric name.
        raise ValueError(
            f"Unknown cost_func {cost_func!r}; expected 'mse' or 'r_squared'."
        )

    return cost, y_pred

# MSE Function: Mean Squared Error
def mse(y_true, y_pred):
    """Return the mean of the squared differences between y_true and y_pred.

    Both arguments are array-likes of equal length.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    squared_errors = (y_true - y_pred) ** 2
    return np.mean(squared_errors)

# R-squared Function
def r_squared(y_true, y_pred):
    """Return the coefficient of determination, R² = 1 - SS_res / SS_total.

    Both arguments are array-likes of equal length. The result is 1 for a
    perfect fit, 0 for a model no better than predicting the mean of y_true,
    and negative for a model that is worse than that. It is undefined when
    y_true is constant, because SS_total is then 0.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Total sum of squares: spread of the actual values around their mean.
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    # Residual sum of squares: spread of the prediction errors.
    ss_res = np.sum((y_true - y_pred) ** 2)
    return 1 - (ss_res / ss_total)






# Score the model on the training data.
# The coefficients inside linear_regression are preset, so nothing is fitted here;
# these numbers only describe how well that preset line matches the data.
mse_train, _ = linear_regression(X_train, y_train, cost_func='mse')
r2_train, _ = linear_regression(X_train, y_train, cost_func='r_squared')
print("Training Data: MSE:", mse_train, "R²:", r2_train)

# Score the same model on the held-out test data
mse_test, y_pred_test = linear_regression(X_test, y_test, cost_func='mse')
r2_test, _ = linear_regression(X_test, y_test, cost_func='r_squared')
print("Test Data: MSE:", mse_test, "R²:", r2_test)

# Visualization: actual mpg vs. horsepower as a scatter, predictions as a line.
# Sort by horsepower so the prediction line is drawn left to right rather than
# zig-zagging through the shuffled test samples.
order = np.argsort(X_test)
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, alpha=0.6, label="Actual mpg")
ax.plot(
    np.asarray(X_test)[order],
    np.asarray(y_pred_test)[order],
    color="red",
    label="Predicted mpg",
)
ax.set_xlabel("Horsepower")
ax.set_ylabel("MPG")
ax.set_title("Test data: actual vs. predicted mpg")
ax.legend()
plt.show()

Answer the following questions:


1.   Are the results surprising?
2.   What do they tell you about your implementation?