The notebook to test different ways of training linear-regression.

In [310]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, SGDRegressor
import random
import matplotlib.pyplot as plt

%run custom_transformers.ipynb

In [98]:
df = pd.read_csv('data/possum.csv')

df["Pop"] = df["Pop"].astype('category')
df["sex"] = df["sex"].astype('category')
df["site"] = df["site"].astype('category')

df = df.rename(columns={"Pop":"pop"})

df = df.dropna()

del df["case"]

In [99]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

train_set_new = train_set.copy()
train_set_labels = train_set_new["age"].copy()
train_set_new = train_set_new.drop("age", axis=1)

In [100]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
    ])

num_attribs = list(set(df.columns).difference(set(["site", "pop", "sex", "age"])))
cat_attribs = ["pop", "sex"] 

pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs),
 ("cat", myLabelEncoder() , cat_attribs),
 ("site", OneHotEncoder() , ["site"])
 ])

train_set_new.head(10)

Unnamed: 0,site,pop,sex,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
92,7,other,m,89.2,54.0,82.0,38.0,63.8,44.9,12.8,24.0,31.0
26,1,Vic,f,90.5,54.5,85.0,35.0,70.3,50.8,14.2,23.0,28.0
44,2,Vic,m,90.7,55.9,81.0,34.0,71.5,54.0,14.6,27.0,31.5
73,6,other,f,88.7,52.0,83.0,38.0,61.5,45.9,14.7,26.0,34.0
15,1,Vic,m,91.6,56.0,86.0,34.5,73.0,51.4,14.4,28.0,32.0
41,2,Vic,m,85.3,54.1,77.0,32.0,62.7,51.2,13.8,25.5,33.0
75,6,other,m,92.4,56.8,89.0,41.0,64.5,46.4,17.8,26.0,33.0
9,1,Vic,f,91.8,58.0,89.5,37.5,70.9,53.4,14.4,27.5,32.0
99,7,other,m,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
11,1,Vic,f,94.9,55.6,92.0,35.5,71.7,51.0,15.3,28.0,33.0


In [101]:
train_set_ready = pipeline.fit_transform(train_set_new)
train_set_ready.shape

(80, 18)

In [105]:
X = train_set_ready.copy()
y = np.ravel(train_set_labels.copy())

test_set_new = test_set.copy()
test_set_new = test_set_new[test_set_new['age'].notna()]
test_set_labels = test_set_new["age"].copy()
test_set_new = test_set_new.drop("age", axis=1)

test_set_new_ready = pipeline.transform(test_set_new)
test_set_new_ready.shape

X_test, y_test = test_set_new_ready, test_set_labels

<h2> Normal Equation </h2>
m - number of training instances, n - number of features

- fast for large m
- no out-of-core support
- slow for large n
- 0 hyperparameters
- no scaling required

In [138]:
X_b = np.c_[np.ones((X.shape[0], 1)), X]
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

In [139]:
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
y_pred = X_test_b.dot(theta_best)

In [140]:
mean_squared_error(y_test, y_pred)

44.2613509991435

In [141]:
np.linalg.cond(X.T.dot(X))

1.3773188026630424e+17

The matrix is almost singular, so a computation of its inverse generates lots of numerical errors, that's why mse is that big.

<h2> SVD </h2>

- fast for large m
- no out-of-core support
- slow for large n
- 0 hyperparameters
- no scaling required
- from sklearn.linear_model import LinearRegression

- https://github.com/scikit-learn/scikit-learn/blob/36958fb24/sklearn/linear_model/_base.py#L529

In [173]:
lin_reg = LinearRegression()
lin_reg.fit(X, y)
y_pred = lin_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

4.397082988769

<h2> Batch Gradient Descent </h2>

- slow for large m
- no out-of-core support
- fast for large n
- 2 hyperparameters
- scaling required

In [318]:
n_iterations = 500
m = X.shape[0]
eta = 0.1

theta = [0] * (X.shape[1]+1)


In [319]:
for i in range(n_iterations):
    gradient = 2/m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradient

theta_best = theta

In [320]:
y_pred = X_test_b.dot(theta_best)
mean_squared_error(y_test, y_pred)

4.372434550056197

<h2> Stochastic Gradient Descent </h2>

- fast for large m
- out-of-core support
- fast for large n
- 2 or more hyperparameters
- scaling required
- from sklearn.linear_model import SGDRegressor
- https://github.com/scikit-learn/scikit-learn/blob/36958fb24/sklearn/linear_model/_stochastic_gradient.py#L1694

In [321]:
lin_reg = SGDRegressor()
lin_reg.fit(X, y)
y_pred = lin_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

4.465859605894363

In [322]:
n_epochs = 100
t0, t1 = 10, 100
theta = [0] * (X.shape[1]+1)
eta = 0.1

def learning_rate(t):
    return t0 / (t + t1)

for i in range(n_epochs):
    for j in range(m):
        random_sample = np.random.randint(0, m)
        x_i = X_b[random_sample]
        y_i = y[random_sample]
        gradient = 2 * x_i.T.dot(x_i.dot(theta) - y_i)
        eta = learning_rate(i * m +j)
        theta = theta - eta * gradient
        
theta_best = theta

In [323]:
y_pred = X_test_b.dot(theta_best)
mean_squared_error(y_test, y_pred)

4.508150517987433

<h2> Mini-batch Gradient Descent </h2>

- fast for large m
- out-of-core support
- fast for large n
- 2 or more hyperparameters
- scaling required

In [324]:
n_epochs = 100
t0, t1 = 10, 100
theta = [0] * (X.shape[1]+1)
batch_size = int(0.1 * m)
eta = 0.1

for i in range(n_epochs):
    for j in range(m):
        random_samples = random.sample(range(0, m), batch_size)
        x_i = X_b[random_samples]
        y_i = y[random_samples]
        gradients = 2 / batch_size * x_i.T.dot(x_i.dot(theta) - y_i)
        eta = learning_rate(i * m +j)
        theta = theta - eta * gradients
        
theta_best = theta

In [325]:
y_pred = X_test_b.dot(theta_best)
mean_squared_error(y_test, y_pred)

4.422054505501228