# Linear Regression Tutorial (Improved)

Author: Andrew Andrade (adapted and improved by ChatGPT)

This notebook is an upgraded version of your original TER notebook. It includes:
- Bug fixes and compatibility updates
    - Reshape fixes for sklearn
    - Deprecated parameter updates
- Additional explanations and comments
- Extra plots and metrics (MSE, R²)
- Horizontal residuals and orthogonal regression
- Statsmodels summary


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from math import log
import warnings
warnings.filterwarnings('ignore')

# Try to import sklearn and scipy; if not present, print helpful message
try:
    from sklearn import linear_model
except Exception as e:
    print('sklearn not available. If running locally, install with: pip install scikit-learn')

try:
    import statsmodels.api as sm
except Exception:
    print('statsmodels not available. Install with: pip install statsmodels')

try:
    from scipy.odr import Model, Data, ODR
    from scipy.stats import linregress
except Exception:
    print('scipy not available. Install with: pip install scipy')

%matplotlib inline


In [2]:
# Load the Anscombe I dataset from anscombe_i.csv in the working directory.
# When the file is absent it is first written from the classic values below,
# so the read that follows always has a file to open.
import os

csv_name = 'anscombe_i.csv'
if not os.path.exists(csv_name):
    df = pd.DataFrame({
        'x': [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5],
        'y': [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68],
    })
    df.to_csv(csv_name, index=False)

anscombe_i = pd.read_csv(csv_name)
anscombe_i.head()


In [3]:
# Scatter plot of the raw (x, y) observations
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(anscombe_i['x'], anscombe_i['y'], color='black')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Anscombe I - Scatter')
plt.show()


## Fit Linear Regression using sklearn

In [4]:
# Fit ordinary least squares y ~ x with scikit-learn, report the fitted
# parameters and fit quality, then overlay the fitted line on the data.
regr_i = linear_model.LinearRegression()
# scikit-learn expects a 2-D feature matrix; y is kept as a column as well.
X = anscombe_i['x'].values.reshape(-1, 1)
y = anscombe_i['y'].values.reshape(-1, 1)
regr_i.fit(X, y)

# With a column-shaped target, coef_ and intercept_ come back as one-element
# arrays. .item() extracts the scalar without relying on float() accepting
# arrays with ndim > 0, a conversion NumPy has deprecated.
print('Coefficient (m):', regr_i.coef_.item())
print('Intercept (b):', regr_i.intercept_.item())

# Predict once and reuse the result for both the error metric and the plot.
y_pred = regr_i.predict(X)
mse = np.mean((y_pred - y) ** 2)
# This is the mean of the squared residuals, not their sum.
print('Mean squared error (MSE): %.4f' % mse)
print('R^2 score: %.4f' % regr_i.score(X, y))

plt.figure(figsize=(6, 4))
plt.scatter(anscombe_i.x, anscombe_i.y, color='black')
plt.plot(X, y_pred, color='green', linewidth=2, label='OLS fit')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Anscombe I - OLS fit')
plt.legend()
plt.show()


## Residuals and Distribution

In [5]:
# Residual diagnostics for the least-squares line: residuals against x,
# then a histogram of the residuals with a normal density overlay.
from numpy import polyfit  # binds `polyfit` in the notebook namespace

# A degree-1 polyfit returns the coefficients highest power first:
# slope, then intercept.
k, d = polyfit(anscombe_i.x, anscombe_i.y, 1)
yfit = k * anscombe_i.x + d
residual = anscombe_i.y - yfit
print('Polyfit slope, intercept:', k, d)

plt.figure(figsize=(6, 4))
plt.scatter(anscombe_i.x, residual)
# Zero line: points above it are under-predicted, points below over-predicted.
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('X')
plt.ylabel('Residual (y - yhat)')
plt.title('Residuals vs X')
plt.show()

plt.figure(figsize=(6, 4))
# density=True normalises the histogram so it shares a scale with the pdf below.
plt.hist(residual, bins=10, density=True, alpha=0.7)
try:
    from scipy.stats import norm
    xs = np.linspace(residual.min(), residual.max(), 200)
    # Normal curve using the residuals' own mean and standard deviation.
    plt.plot(xs, norm.pdf(xs, residual.mean(), residual.std()), 'k--')
except Exception as exc:
    # The overlay is optional, but say why it is missing instead of hiding it.
    print('Normal density overlay skipped:', exc)
plt.title('Residual distribution')
plt.xlabel('Residual')
plt.ylabel('Density')
plt.show()


## Statsmodels OLS and Summary

In [6]:
# Ordinary least squares via statsmodels, printing the regression summary.
# Any failure (including statsmodels being unavailable) is reported, not raised.
try:
    # add_constant adds the constant column used for the intercept term
    X_sm = sm.add_constant(anscombe_i['x'])
    ols_spec = sm.OLS(anscombe_i['y'], X_sm)
    model = ols_spec.fit()
    summary_table = model.summary()
    print(summary_table)
except Exception as err:
    print('statsmodels unavailable or error:', err)


## Horizontal residuals (regress X on Y)

In [7]:
# Regress x on y (minimising horizontal residuals) and draw the resulting
# line over the data in the usual x-y orientation.
# np.polyfit is used directly so this cell does not depend on a name
# imported by a different cell.
k2, d2 = np.polyfit(anscombe_i.y, anscombe_i.x, 1)
xfit = k2 * anscombe_i.y + d2

plt.figure(figsize=(6, 4))
plt.scatter(anscombe_i.x, anscombe_i.y, color='black')
# x is the fitted quantity here, so it goes on the horizontal axis.
plt.plot(xfit, anscombe_i.y, color='blue', label='Horizontal fit (X~Y)')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Anscombe I - Horizontal fit')
plt.legend()
plt.show()


## Total Least Squares (Orthogonal Distance Regression)

In [8]:
def fit_function(p, x):
    """Evaluate the straight line with parameters ``p`` at ``x``.

    ``p[0]`` is the slope and ``p[1]`` the intercept.
    """
    slope, intercept = p[0], p[1]
    return slope * x + intercept


def orthoregress(x, y):
    """Fit a straight line to ``(x, y)`` with scipy's orthogonal distance regression.

    An ordinary ``linregress`` fit provides the starting slope and intercept
    handed to the ODR solver.

    Returns the fitted parameters (``beta``) from the ODR output, in the order
    ``fit_function`` reads them: slope first, then intercept.
    """
    ols = linregress(x, y)
    start = (ols.slope, ols.intercept)
    solver = ODR(Data(x, y), Model(fit_function), beta0=start)
    return solver.run().beta
# Run the orthogonal regression on the raw arrays and plot the fitted line.
# Any failure (including scipy.odr being unavailable) is reported, not raised.
try:
    m_ortho, b_ortho = orthoregress(anscombe_i.x.values, anscombe_i.y.values)
    y_ortho = m_ortho * anscombe_i.x + b_ortho
    plt.figure(figsize=(6, 4))
    plt.scatter(anscombe_i.x, anscombe_i.y, color='black')
    plt.plot(anscombe_i.x, y_ortho, 'r', label='Orthogonal fit')
    # Label the axes and title the figure like the other plots in the notebook.
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('Anscombe I - Orthogonal fit')
    plt.legend()
    plt.show()
except Exception as e:
    print('scipy.odr not available or error:', e)


## Compare all three lines

In [9]:
# Overlay the three fitted lines on one scatter plot for comparison.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(anscombe_i['x'], anscombe_i['y'], color='black')
ax.plot(anscombe_i['x'], yfit, 'g', label='Vertical residuals (OLS)')
ax.plot(xfit, anscombe_i['y'], 'b', label='Horizontal residuals')
try:
    # y_ortho is only defined when the earlier orthogonal fit ran without
    # error; if it is missing, the line is simply left out.
    ax.plot(anscombe_i['x'], y_ortho, 'r', label='Orthogonal')
except NameError:
    pass
ax.legend()
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Comparison of regression fits')
plt.show()


## Key Takeaways

1. Check assumptions before applying linear regression.
2. Understand when vertical, horizontal, or orthogonal residual minimization is appropriate.
3. Use diagnostics (residual plots, distributions, R², p-values) to evaluate fits.


## Extra: Predict a house price example (Homework hint)

Transforming variables (log/power) and fitting a simple linear model to predict a house price is left as an exercise, as it was in the original notebook.