In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Load seaborn's 'mpg' example dataset into a DataFrame and preview
# the first five rows.
data = sns.load_dataset(name='mpg')
data.head(n=5)

In [None]:
# Scatter mpg against displacement and overlay an ordinary-least-squares
# trendline; the bare `fig` on the last line renders the figure.
fig = px.scatter(
    data_frame=data,
    x="displacement",
    y="mpg",
    trendline="ols",
)
fig

In [None]:
# `get_trendline_results` returns a DataFrame with one row per trendline; the
# fitted results object is stored in its `px_fit_results` column.
# Select that column by name rather than with `.iloc[0][0]`: an integer key on
# a label-indexed Series is a deprecated positional fallback in pandas.
model = px.get_trendline_results(fig).px_fit_results.iloc[0]
model.summary()

# Making the Design Matrix

In [None]:
# Select with a list of labels so both X and y stay two-dimensional
# DataFrames (not Series); the matrix products later in the notebook rely
# on that shape.
X = data.loc[:, ['displacement']]
y = data.loc[:, ['mpg']]

In [None]:
# Preview the single-feature design matrix (first five rows).
X.head(n=5)

Adding the constant term (a column of ones, so the intercept is learned as a regular weight):

In [None]:
# Append a column of ones to `data`, then rebuild X with two columns:
# the feature first, the constant second.
data["constant"] = 1
X = data.loc[:, ['displacement', "constant"]]

In [None]:
# Preview the design matrix now that it includes the constant column.
X.head(n=5)

In [None]:
# Same preview as the previous cell: first five rows of the design matrix.
X.head(n=5)

$$
\hat{\theta} = (X^T X)^{-1} X^T y
$$

In [None]:
# X^T X: a 2 x 2 matrix, since X has the columns displacement and constant.
X.T.dot(X)

In [None]:
# X^T y: the right-hand side of the normal equations.
X.T.dot(y)

## Solving the Linear System

In [None]:
from numpy.linalg import inv, solve

The exact math from lecture, computed by explicitly inverting $X^T X$:

$$
\hat{\theta} = (X^T X)^{-1} X^T y
$$

In [None]:
# Closed-form least-squares solution using an explicit matrix inverse.
inv(X.T.dot(X)) @ X.T.dot(y)

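Solving the linear system directly, instead of forming the inverse, is more numerically stable and computationally efficient. With $A = X^T X$ and $b = X^T y$: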

$$
A\theta = b
$$

$$
X^T X \theta = X^T y
$$

In [None]:
# Solve (X^T X) theta = X^T y for theta without forming the inverse.
solve(X.T.dot(X), X.T.dot(y))

## Using a software package:

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# X already carries an explicit constant column, so scikit-learn is told not
# to add its own intercept; `fit` returns the estimator, which allows chaining.
model = LinearRegression(fit_intercept=False).fit(X, y)
print(model.coef_)

## Making Predictions

In [None]:
# Fitted parameters from the normal equations (X^T X) theta = X^T y.
theta = solve(X.T.dot(X), X.T.dot(y))

In [None]:
# Display the fitted parameters; rows follow X's column order
# (displacement first, then the constant term).
theta

In [None]:
# Predictions from the hand-solved parameters: yhat = X theta.
data['yhat'] = X.dot(theta)
data.head(n=5)

In [None]:
# Overwrite yhat with the scikit-learn model's predictions on the same X.
data['yhat'] = model.predict(X=X)
data.head(n=5)

## Examining the Residuals

In [None]:
# Residual = observed mpg minus predicted mpg.
data["residual"] = data["mpg"].sub(data["yhat"])

In [None]:
# Residuals against displacement, with a horizontal reference line at zero
# drawn from x=50 to x=475. `add_trace` is the last expression, so the
# updated figure is rendered.
fig = px.scatter(data_frame=data, x="displacement", y="residual")
fig.add_trace(
    go.Scatter(
        x=[50, 475],
        y=[0, 0],
        name="Model",
    )
)

In [None]:
# Sanity check: because X contains a constant column, the least-squares
# residuals should sum to approximately zero (up to floating-point error).
data['residual'].sum()

## Root Mean Squared Error

$$
\sqrt{\frac{1}{n} \sum_{i=1}^n  \left(y_i - \hat{y}_i\right)^2 } 
$$

In [None]:
# RMSE: square the residuals, average them, then take the square root.
np.sqrt((data['residual'] ** 2).mean())

In [None]:
# Mean absolute error (MAE) of the same residuals, shown for comparison
# with the RMSE computed above.
np.mean(np.abs(data["residual"]))

## Improving the Model

In [None]:
# Review the available columns before choosing more features.
data.head(n=5)

In [None]:
# Multi-feature model: scikit-learn fits the intercept itself this time, so
# no constant column is passed in.
features = ["cylinders", "displacement", "weight", "model_year", "acceleration"]

model2 = LinearRegression(fit_intercept=True).fit(data[features], data[['mpg']])
print(model2.coef_)

In [None]:
# Predictions of the multi-feature model on the training data.
data['yhat2'] = model2.predict(data.loc[:, features])

What can we say about the magnitudes of the weights?

In [None]:
# Overlay the raw distributions of three features, with box-plot marginals,
# to compare their scales.
px.histogram(
    data_frame=data,
    x=["cylinders", "displacement", "weight"],
    marginal="box",
    barmode="overlay",
)

Rescaling the features

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the feature columns, then apply them to
# the same columns; Z holds the rescaled features in `features` order.
scaler = StandardScaler()
Z = scaler.fit(data[features]).transform(data[features])

In [None]:
# Display the rescaled feature array (columns follow the order of `features`).
Z

In [None]:
# Wrap the first three standardized columns in a DataFrame labelled with
# their feature names so the legend is readable; bare arrays would only get
# generic auto-generated trace names. features[:3] are the same three
# features shown in the raw histogram earlier in the notebook.
px.histogram(
    pd.DataFrame(Z[:, :3], columns=features[:3]),
    x=features[:3],
    barmode="overlay",
)

In [None]:
# Refit on the rescaled features so that coefficient magnitudes are on a
# comparable scale, then store the predictions.
model3 = LinearRegression(fit_intercept=True).fit(Z, data[['mpg']])
print(model3.coef_)
data['yhat3'] = model3.predict(Z)

In [None]:
# Show the feature names; their order matches the columns of Z and the
# coefficients printed for model2 and model3.
features

In [None]:
# Inspect the DataFrame with the columns added so far
# (constant, yhat, residual, yhat2, yhat3).
data

### Residual Analysis

In [None]:
# Residuals of the multi-feature model (model2).
data['residual2'] = data['mpg'].sub(data['yhat2'])

In [None]:
# RMSE of the multi-feature model, to compare with the single-feature RMSE.
np.sqrt((data['residual2'] ** 2).mean())

## Computing [$R^2$](https://scikit-learn.org/stable/modules/model_evaluation.html#r2-score-the-coefficient-of-determination)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 of the single-feature (displacement + constant) model.
r2_score(y_true=data["mpg"], y_pred=data["yhat"])

In [None]:
# R^2 of the multi-feature model (model2).
r2_score(y_true=data["mpg"], y_pred=data["yhat2"])