# Linear regression

In [3]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np 

# Load the advertising dataset; index_col=0 uses the first CSV column
# (the row labels) as the DataFrame index instead of a regular column.
df = pd.read_csv("../data/Advertising.csv", index_col=0)
# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
# Summarize column dtypes and non-null counts - a quick check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


In [6]:
# (rows, columns): 200 samples - each row is one sample (one data point),
# and each of the 4 columns is a measured variable
df.shape

(200, 4)

In [8]:
# y: label vector - the Sales column that we want to predict
y = df["Sales"]

# X: feature matrix - every remaining column is one feature
# TV -> x1, Radio -> x2, Newspaper -> x3
X = df.drop(columns="Sales")
X

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4
...,...,...,...
196,38.2,3.7,13.8
197,94.2,4.9,8.1
198,177.0,9.3,6.4
199,283.6,42.0,66.2


In [9]:
# y is the vector of labels (Sales) - the variable that we want to predict,
# i.e. the known answers, one value per sample
y

1      22.1
2      10.4
3       9.3
4      18.5
5      12.9
       ... 
196     7.6
197     9.7
198    12.8
199    25.5
200    13.4
Name: Sales, Length: 200, dtype: float64

## Multiple linear regression 

$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

- goal is to estimate $w_i$, $i\in\{0,1,2,3\}$
- we use scikit-learn to do this 

## Scikit-learn steps

Steps: 
1. train|test split - some cases train|validation|test - split
2. Scale the dataset 
    - many algorithms require scaling, some don't
    - which type of scaling to use?
    - fit the scaler on the training data only, then use it to transform both the training and the test data, to avoid data leakage
3. Fit the algorithm to the training data
4. Predict on the test data
5. Calculate evaluation metrics

### 1. Train|test split

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Verify (rather than eyeball) that the split lost no samples and that the
# features and labels have matching lengths within each partition.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# train and test sizes should add up to the original 200 samples
print(f"{X_train.shape = }")
print(f"{y_train.shape = }")
print(f"{X_test.shape = }")
print(f"{y_test.shape = }")

X_train.shape = (140, 3)
y_train.shape = (140,)
X_test.shape = (60, 3)
y_test.shape = (60,)


In [17]:
# Peek at the first 10 test samples; the index labels are out of order
# because train_test_split shuffles the rows before splitting
X_test.head(10)

Unnamed: 0,TV,Radio,Newspaper
96,163.3,31.6,52.9
16,195.4,47.7,52.9
31,292.9,28.3,43.2
159,11.7,36.9,45.2
129,220.3,49.0,3.2
116,75.1,35.0,52.7
70,216.8,43.9,27.2
171,50.0,11.6,18.4
175,222.4,3.4,13.1
46,175.1,22.5,31.5


### 2. Feature scaling

Scaling of data is required for many algorithms 
- normalization (min-max)

  - $X' = \frac{X-X_{min}}{X_{max}-X_{min}}$

- feature standardization (rescales each feature to zero mean and unit standard deviation)
  - $X' = \frac{X - \mu}{\sigma}$

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate a MinMaxScaler and fit it in one step: fit() finds the min and
# max of each feature in X_train, stores them, and returns the fitted scaler.
scaler = MinMaxScaler().fit(X_train)
scaler

In [23]:
# uses the min and max learned from X_train (stored by scaler.fit) to
# transform X_train -> scaled X_train
scaled_X_train = scaler.transform(X_train)
# transform X_test -> scaled X_test, using the same training-set min and max
scaled_X_test = scaler.transform(X_test)

# same shape as before; the scaled training data spans exactly [0, 1]
print(f"{scaled_X_train.shape = }")
print(f"{scaled_X_train.min() = }")
print(f"{scaled_X_train.max() = }")


scaled_X_train.shape = (140, 3)
scaled_X_train.min() = 0.0
scaled_X_train.max() = 1.0


In [25]:
print(f"{scaled_X_test.shape = }")
print(f"{scaled_X_test.min() = }")
print(f"{scaled_X_test.max() = }")

# note that for scaled_X_test -> min is not 0 and max is not 1
# this is because we use the min and max from the train data, not from the
# test data, so test values outside the training range fall outside [0, 1]
# we have not leaked data, as the scaler was fit on X_train only

scaled_X_test.shape = (60, 3)
scaled_X_test.min() = 0.005964214711729622
scaled_X_test.max() = 1.1302186878727631


TV           5.4
Radio        0.8
Newspaper    0.9
dtype: float64

### 3. Fit algorithm (linear regression) to training data

- train our model -> estimate $w_i$ from training data


In [31]:
from sklearn.linear_model import LinearRegression

# Instantiate a LinearRegression model and train it on the scaled training
# data in one step (fit() returns the fitted model).
# Training estimates the intercept (w_0) and the coefficients (w_1, w_2, w_3)
# of the plane that minimizes the sum of squared errors between the training
# labels and the plane.
model = LinearRegression().fit(scaled_X_train, y_train)
print(f"{model.intercept_ = }")
print(f"{model.coef_ = }")

model.intercept_ = 2.741855324852814
model.coef_ = array([13.02832938,  9.88465985,  0.69237469])


In [34]:
# scaled features (TV, Radio, Newspaper) of the first test sample
scaled_X_test[0]

array([0.54988164, 0.63709677, 0.52286282])

In [43]:
# Pick the first test sample: its scaled features and its true label
test_sample_features = scaled_X_test[0]
test_sample_label = y_test.to_numpy()[0]

print(f"{test_sample_features = }")
print(f"{test_sample_label = }")

# predict() expects a 2D array of shape (n_samples, n_features), so add a
# leading axis to turn the single 1D sample into a 1-row matrix
model.predict(test_sample_features[np.newaxis, :])

test_sample_features = array([0.54988164, 0.63709677, 0.52286282])
test_sample_label = 16.9


array([16.5653963])

### 4. Predict on test data

In [44]:
# Predict Sales for every sample in the scaled test set
y_pred = model.predict(scaled_X_test)
y_pred

array([16.5653963 , 21.18822792, 21.55107058, 10.88923816, 22.20231988,
       13.35556872, 21.19692502,  7.35028523, 13.27547079, 15.12449511,
        9.01443026,  6.52542825, 14.30205991,  8.97026042,  9.45679576,
       12.00454351,  8.91549403, 16.15619251, 10.29582883, 18.72473553,
       19.76821818, 13.77469028, 12.49638908, 21.53501762,  7.60860741,
        5.6119801 , 20.91759483, 11.80627665,  9.08076637,  8.51412012,
       12.17604891,  9.9691939 , 21.73008956, 12.77770578, 18.1011362 ,
       20.07590796, 14.26202556, 20.93826535, 10.83938827,  4.38190607,
        9.51332406, 12.40486324, 10.17045434,  8.09081363, 13.16388427,
        5.2243552 ,  9.28893833, 14.09330719,  8.69024497, 11.66119763,
       15.71848432, 11.63156862, 13.35360735, 11.1531472 ,  6.33636845,
        9.76157954,  9.4195714 , 24.25516546,  7.69519137, 12.15317572])

In [46]:
# True test labels as a NumPy array, in the same order as y_pred,
# for a side-by-side comparison with the predictions
y_test.to_numpy()

array([16.9, 22.4, 21.4,  7.3, 24.7, 12.6, 22.3,  8.4, 11.5, 14.9,  9.5,
        8.7, 11.9,  5.3, 10.3, 11.7,  5.5, 16.6, 11.3, 18.9, 19.7, 12.5,
       10.9, 22.2,  9.3,  8.1, 21.7, 13.4, 10.6,  5.7, 10.6, 11.3, 23.7,
        8.7, 16.1, 20.7, 11.6, 20.8, 11.9,  6.9, 11. , 12.8, 10.1,  9.7,
       11.6,  7.6, 10.5, 14.6, 10.4, 12. , 14.6, 11.7,  7.2,  6.6,  9.4,
       11. , 10.9, 25.4,  7.6, 11.7])

<div style="max-width:66ch;">

### 5. Evaluate performance

How well did we predict $\bf{y}$ (label) with $\hat{\bf{y}}$ (y_pred)?

To answer this question we use several **evaluation metrics** or **loss functions**: 

- Mean Absolute Error (MAE) - mean of the absolute errors between $\bf{y}$ and ${\hat{\bf{y}}}$. The unit is the same as that of the measured quantity.

$$MAE = \frac{1}{m}\sum_{i=1}^m |y_i - \hat{y}_i|$$

- Mean Squared Error (MSE) - mean of squared errors between $\bf{y}$ and ${\hat{\bf{y}}}$. It punishes large errors, and the units are in square units of the measured quantity

$$MSE = \frac{1}{m}\sum_{i=1}^m (y_i - \hat{y}_i)^2$$

- Root Mean Squared Error (RMSE) - square root of MSE between $\bf{y}$ and ${\hat{\bf{y}}}$. It punishes large errors, and the units are same as measured quantity, hence easier to interpret.

$$RMSE = \sqrt{\frac{1}{m}\sum_{i=1}^m (y_i - \hat{y}_i)^2}$$




</div>

In [47]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# MAE - average absolute difference between the true and predicted labels
mean_absolute_error(y_true=y_test, y_pred=y_pred)

1.511669222454909

In [48]:
# MSE - average squared difference between the true and predicted labels
mean_squared_error(y_true=y_test, y_pred=y_pred)

3.7967972367152223

In [49]:
# RMSE - square root of the MSE, back in the same units as the labels
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

1.9485372043446392