# Lecture 03 - Toy Problem, Linear Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## 1. Toy Data Problem

### Data From Your 100 Friends

In [None]:
# Simulate the purchases of N_FRIENDS friends.
# A fixed seed makes "Restart Kernel -> Run All" reproduce the same data set,
# and therefore the same fitted coefficients, on every run.
SEED = 42
N_FRIENDS = 100
np.random.seed(SEED)

socks = np.random.randint(1, 10, N_FRIENDS)   # integer counts drawn from 1..9
pants = np.random.randint(1, 10, N_FRIENDS)   # integer counts drawn from 1..9
other_impact = np.random.randn(N_FRIENDS)     # standard-normal noise term

# Ground-truth model: 23 per pair of pants, 5 per pair of socks, plus scaled noise.
cost = 23.0 * pants + 5.0 * socks + 2.0 * other_impact

# Column layout of X: 0 = socks, 1 = pants, 2 = other_impact, 3 = cost
X = np.column_stack((socks, pants, other_impact, cost))

In [None]:
# Preview the first rows only (columns: socks, pants, other_impact, cost);
# dumping all rows of the matrix buries the narrative in output.
X[:5]

### Visualization: Cost vs. Pants

In [None]:
# Cost (column 3) plotted against the number of pants (column 1),
# drawn on an explicit Axes object instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.scatter(X[:, 1], X[:, 3])
ax.set_title('Scatter plot')
ax.set_xlabel('pants')
ax.set_ylabel('cost')
plt.show()

### Visualization: Cost vs. Socks

In [None]:
# Cost (column 3) plotted against the number of socks (column 0),
# drawn on an explicit Axes object instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 3])
ax.set_title('Scatter plot')
ax.set_xlabel('socks')
ax.set_ylabel('cost')
plt.show()

## 2. Single Variable Linear Regression
#### Model:  $y = \beta_0 +\beta_1 x$

#### Model estimation:
\begin{equation}
\beta_1=\frac{\frac{1}{n}\sum\limits_{i=1}^{n}y^{i}x^{i}-\bar{y}\bar{x}}{{\frac{1}{n}\sum\limits_{i=1}^{n}x^{i}x^{i}}-\bar{x}\bar{x}}=\frac{Cov(X,Y)}{Var(X)}=\rho_{XY}\frac{\sigma_Y}{\sigma_X}
\end{equation}
\begin{equation}
{\beta_0}=(\bar{y}-\beta_1\bar{x})
\end{equation}


In [16]:
# Predictor: 2nd column of X (number of pants).
# Target:    4th column of X (cost).
Xpants, Ycost = X[:, 1], X[:, 3]

In [18]:
# 2x2 covariance matrix of (pants, cost): entry [0, 0] is Var(pants),
# entry [0, 1] is Cov(pants, cost).
np.cov(m=Xpants, y=Ycost, bias=True)

array([[   7.5971    ,  178.85101756],
       [ 178.85101756, 4398.46887898]])

In [19]:
# Slope = Cov(X, Y) / Var(X).
# bias=True is set to be consistent with the sklearn calculation.
cov_pants_cost = np.cov(Xpants, Ycost, bias=True)
print(cov_pants_cost[0, 1] / cov_pants_cost[0, 0])

23.5420117621162


In [20]:
# beta1 = Cov(pants, cost) / Var(pants), read off a single covariance matrix.
pants_cost_cov = np.cov(Xpants, Ycost, bias=True)
beta1 = pants_cost_cov[0, 1] / pants_cost_cov[0, 0]

In [21]:
# Intercept: beta0 = mean(y) - beta1 * mean(x)
print(Ycost.mean() - beta1 * Xpants.mean())

20.4556816167198


#### Calculate the correlation coefficient $\rho_{XY}$

#### Calculate the linear model coefficients 

$\beta_1=\rho_{XY}\frac{\sigma_Y}{\sigma_X}$

$\beta_0=\bar{y}-\beta_1 \bar{x}$

In [25]:
# 2x2 correlation matrix; the off-diagonal entry is rho_XY for (pants, cost).
np.corrcoef(x=Xpants, y=Ycost)

array([[1.        , 0.97840065],
       [0.97840065, 1.        ]])

In [26]:
# beta1 = rho_XY * sigma_Y / sigma_X
rho_xy = np.corrcoef(Xpants, Ycost)[0, 1]
beta1 = rho_xy * np.std(Ycost) / np.std(Xpants)
print("beta1", beta1)

beta1 23.542011762116207


In [27]:
# beta0 = mean(y) - beta1 * mean(x)
beta0 = Ycost.mean() - beta1 * Xpants.mean()
print('beta0', beta0)

beta0 20.455681616719758


#### Use sklearn package function 
```class sklearn.linear_model.LinearRegression(*, fit_intercept=True, copy_X=True, n_jobs=None, positive=False)```
##### Members: coef_, intercept_, ... etc.
##### Methods: fit(self, X, y[, sample_weight]) ... etc.



#### More Details https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [32]:
from sklearn.linear_model import LinearRegression

In [33]:
# Single-variable fit with scikit-learn: cost ~ pants.
# reshape(-1, 1) turns each 1-D array into a column matrix and lets NumPy infer
# the number of rows, so the cell keeps working if the sample size changes
# (the previous hard-coded 100 would raise on any other size).
reg = LinearRegression()  # instantiate an object of class LinearRegression
reg.fit(Xpants.reshape(-1, 1), Ycost.reshape(-1, 1))
print([reg.coef_, reg.intercept_])

[array([[23.54201176]]), array([20.45568162])]


## 3. Multivariate Linear Regression

#### Model:  $y = \beta_0 +\beta_1 x_1 + \beta_2 x_2 +\beta_3 x_3+ \dots +\beta_m x_m$
#### Model Estimation $\beta=({X^T}{X})^{-1}{X^{T}}{y}$

#### In our case $Cost=\beta_0 + \beta_1 socks+\beta_2 pants $


In [34]:
# Predictor matrix: columns 0 and 1 of X (socks, pants).
Xpantsocks = X[:, [0, 1]]
# Target as a column matrix; -1 lets NumPy infer the row count instead of
# hard-coding the sample size of 100.
Ycost_colmat = Ycost.reshape(-1, 1)

#### Fit a linear model with intercept

In [35]:
# Predictors: the first and second columns of X (socks, pants), i.e. Xpantsocks.
# Target: Ycost_colmat, the cost values reshaped into a single column.
# fit() returns the fitted estimator, so construction and fitting are chained.
multireg = LinearRegression().fit(Xpantsocks, Ycost_colmat)
print([multireg.coef_, multireg.intercept_])

[array([[ 4.91746834, 22.96949057]]), array([0.6066405])]


#### Fit a linear model without intercept

In [36]:
# Same predictors (socks, pants) and column-shaped cost target as before, but
# with fit_intercept=False, so no intercept term is estimated.
# fit() returns the fitted estimator, so construction and fitting are chained.
multireg = LinearRegression(fit_intercept=False).fit(Xpantsocks, Ycost_colmat)
print([multireg.coef_, multireg.intercept_])

[array([[ 4.96474856, 23.02593814]]), 0.0]


#### Multivariate Linear Regression from Scratch: without intercept

#### Model Estimation $\beta=({X^T}{X})^{-1}{X^{T}}{y}$

In [37]:
# Calculate beta following the estimation equation beta = (X^T X)^{-1} X^T y,
# built up one factor at a time.
xtx_inv = np.linalg.inv(Xpantsocks.T @ Xpantsocks)   # (X^T X)^{-1}
xtx_inv_xt = xtx_inv @ Xpantsocks.T                  # (X^T X)^{-1} X^T
beta = xtx_inv_xt @ Ycost_colmat

In [39]:
print(beta) # note: only the two slope coefficients are estimated here; the beta0 (intercept) term is missing and still needs to be added

[[ 4.96474856]
 [23.02593814]]


#### Multivariate Linear Regression from Scratch: with intercept
#### Trick: append a column of all ones to the matrix Xpantsocks, i.e. form the augmented matrix
\begin{equation}
[X,\mathbf{1}]_{n\times(m+1)}=\begin{bmatrix}x_{1,1} & x_{1,2} & \dots & x_{1,m} & 1 \\ x_{2,1} & x_{2,2} & \dots & x_{2,m} & 1 \\ \vdots & \vdots & \ddots & \vdots & \vdots \\ x_{n,1} & x_{n,2} & \dots & x_{n,m} & 1
\end{bmatrix}  
\end{equation}
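The model fitted on this augmented data set is

$y=\beta_0 x_0+\beta_1 x_1+\beta_2 x_2+ \dots +\beta_m x_m=\beta_0 +\beta_1 x_1+\beta_2 x_2+ \dots +\beta_m x_m$
because $x_0=1$. Since the column of ones is appended last, the intercept $\beta_0$ appears as the last entry of the estimated coefficient vector.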

In [40]:
# Create the [X, 1] matrix: append a column of ones to the predictors.
# The row count is taken from Xpantsocks itself rather than hard-coded as 100,
# so the cell keeps working if the sample size changes.
Xpantsocks_1 = np.column_stack([Xpantsocks, np.ones([Xpantsocks.shape[0], 1])])

In [None]:
# Preview the first rows only (columns: socks, pants, ones);
# dumping every row of the matrix buries the narrative in output.
Xpantsocks_1[:5]

In [42]:
# beta = (X^T X)^{-1} X^T y evaluated on the augmented [X, 1] matrix,
# built up one factor at a time.
gram_inv = np.linalg.inv(Xpantsocks_1.T @ Xpantsocks_1)   # (X^T X)^{-1}
gram_inv_xt = gram_inv @ Xpantsocks_1.T                   # (X^T X)^{-1} X^T
beta = gram_inv_xt @ Ycost_colmat
print(beta)

[[ 4.91746834]
 [22.96949057]
 [ 0.6066405 ]]
