In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
from LinearRegression import LinearRegression

In [10]:
# Run this cell once at the start,
# then re-run it after every change to LinearRegression.py
# so the notebook picks up the updated code.
%run LinearRegression.py

In [11]:
# Load the advertising dataset; the path is relative, so Advertising.csv must be in the working directory.
df = pd.read_csv('Advertising.csv')

In [12]:
# Preview the first ten rows to check the columns and the scale of the values.
df.head(10)

Unnamed: 0,TV,radio,newspaper,Total spent,sales
0,230.1,37.8,69.2,337.1,22.1
1,44.5,39.3,45.1,128.9,10.4
2,17.2,45.9,69.3,132.4,9.3
3,151.5,41.3,58.5,251.3,18.5
4,180.8,10.8,58.4,250.0,12.9
5,8.7,48.9,75.0,132.6,7.2
6,57.5,32.8,23.5,113.8,11.8
7,120.2,19.6,11.6,151.4,13.2
8,8.6,2.1,1.0,11.7,4.8
9,199.8,2.6,21.2,223.6,10.6


# 1. Compute Cost With Multiple Variables
The equation for the cost function with multiple variables $J(\mathbf{w},b)$ is:
$$J(\mathbf{w},b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})^2 \tag{1}$$ 
where:
$$ f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w} \cdot \mathbf{x}^{(i)} + b  \tag{2} $$

In [13]:
# J is the cost, a function of w and b
# w is bolded because it is a vector (one weight per feature)
# b is the intercept (bias), a scalar
# (i) is a superscript index, not a power: it selects the i-th row (training example)
# x^(i) is the feature vector of example i (bolded because it holds one value per feature)
# Equation (2) is the prediction equation

# 2. Gradient Descent With Multiple Variables
Gradient descent for multiple variables:

$$\begin{align*} \text{repeat}&\text{ until convergence:} \; \lbrace \newline\;
& w_j = w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \tag{3}  \; & \text{for j = 0..n-1}\newline
&b\ \ = b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b}  \newline \rbrace
\end{align*}$$

where $n$ is the number of features, the parameters $w_j$ and $b$ are updated simultaneously, and

$$
\begin{align}
\frac{\partial J(\mathbf{w},b)}{\partial w_j}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})x_{j}^{(i)} \tag{4}  \\
\frac{\partial J(\mathbf{w},b)}{\partial b}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}) \tag{5}
\end{align}
$$
* m is the number of training examples in the data set

    
*  $f_{\mathbf{w},b}(\mathbf{x}^{(i)})$ is the model's prediction, while $y^{(i)}$ is the target value

In [14]:
# Gradient descent takes the learning rate as an input
# The learning rate is a hyperparameter: try several values and keep the one that performs best (lowest cost)
# alpha is the learning rate
# The subscript j selects a feature (w_j is the weight of feature j)
# b is a single scalar applied to the whole row; it is not indexed by j

## 2.1 Compute Gradient with Multiple Variables
An implementation for calculating the equations (4) and (5) is below. There are many ways to implement this. In this version, there is an
- outer loop over all m examples. 
    - $\frac{\partial J(\mathbf{w},b)}{\partial b}$ for the example can be computed directly and accumulated
    - in a second loop over all n features:
        - $\frac{\partial J(\mathbf{w},b)}{\partial w_j}$ is computed for each $w_j$.

In [15]:
# Summary statistics (count, mean, std, quartiles) for every numeric column.
df.describe()

Unnamed: 0,TV,radio,newspaper,Total spent,sales
count,200.0,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,200.8605,14.0225
std,85.854236,14.846809,21.778621,92.985181,5.217457
min,0.7,0.0,0.3,11.7,1.6
25%,74.375,9.975,12.75,123.55,10.375
50%,149.75,22.9,25.75,207.35,12.9
75%,218.825,36.525,45.1,281.125,17.4
max,296.4,49.6,114.0,433.6,27.0


In [18]:
# Build the feature matrix from the single 'Total spent' column.
# Selecting with a *list* of column names keeps the result two-dimensional,
# i.e. shape (n_samples, 1), which is the layout np.dot(X, w) expects when
# w has shape (1,).
X = df[['Total spent']].to_numpy()

In [19]:
# Target vector: the 'sales' column copied into a 1-D NumPy array.
y = np.array(df['sales'])

In [20]:
# Fixed parameters for a single-feature linear model f(x) = w . x + b.
# NOTE(review): these values look like a previously fitted line for
# sales vs. total spend — confirm where they come from.
w = np.array([0.0487])  # weight vector: one entry per feature
b = 4.243               # intercept (bias)
w.shape

(1,)

In [21]:
# Predict sales for every row with the linear model f(x) = w . x + b.
y_pred = np.dot(X, w) + b
# Store the predictions next to the actual sales so they can be compared row by row.
df['y_pred'] = y_pred
df.head()

Unnamed: 0,TV,radio,newspaper,Total spent,sales,y_pred
0,230.1,37.8,69.2,337.1,22.1,20.65977
1,44.5,39.3,45.1,128.9,10.4,10.52043
2,17.2,45.9,69.3,132.4,9.3,10.69088
3,151.5,41.3,58.5,251.3,18.5,16.48131
4,180.8,10.8,58.4,250.0,12.9,16.418


In [22]:
# Confirm that X is two-dimensional: (n_samples, n_features).
X.shape

(200, 1)

In [23]:
# Cost from equation (1): J = 1/(2m) * sum of squared prediction errors,
# evaluated for the fixed w and b used to compute y_pred.
n_samples = len(X)
cost = (1 / (2 * n_samples)) * np.sum(np.square(y_pred - y))
cost

3.34611346762725

In [24]:
# Fit on the raw (unscaled) single-feature X.
# NOTE(review): the positional arguments appear to be (learning rate, number of iterations, verbose flag)
# — confirm against LinearRegression.py.
# In the recorded run the cost grows on every iteration, i.e. gradient descent diverges at 0.01.
lr = LinearRegression(0.01, 10, 1)
lr.fit(X, y)

Gradient Descent: cost 111.8581 num_iter 0 weights [32.354288] bias 14.012500000000001
Gradient Descent: cost 25605754.0113 num_iter 1 weights [-15800.19870146] bias -6484.6859648240015
Gradient Descent: cost 6130489440580.5791 num_iter 2 weights [7731131.17043836] bias 3173649.823775467
Gradient Descent: cost 1467752406275906048.0000 num_iter 3 weights [-3.7828739e+09] bias -1552878858.4473338
Gradient Descent: cost 351407036425628754640896.0000 num_iter 4 weights [1.85097559e+12] bias 759829943189.6824
Gradient Descent: cost 84133335241984669288906096640.0000 num_iter 5 weights [-9.05689884e+14] bias -371787881959638.0
Gradient Descent: cost 20143074455591457288897673478275072.0000 num_iter 6 weights [4.43157744e+17] bias 1.8191732296341277e+17
Gradient Descent: cost 4822624080651265213525780572020102135808.0000 num_iter 7 weights [-2.16838887e+20] bias -8.901288611059977e+19
Gradient Descent: cost 1154625281982285814829508914817924008207450112.0000 num_iter 8 weights [1.06100149e+23

In [23]:
# Retry with a first argument of 0.001 (presumably the learning rate — confirm against LinearRegression.py).
# In the recorded run the cost still grows on every iteration, so this value is also too large for unscaled X.
lr = LinearRegression(0.001, 10, 1)
lr.fit(X, y)

Gradient Descent: cost 111.8581 num_iter 0 weights [3.2354288] bias 14.021500000000001
Gradient Descent: cost 254851.2679 num_iter 1 weights [-154.7131249] bias -635.8483464824
Gradient Descent: cost 606284008.8012 num_iter 2 weights [7549.12989451] bias 31089.777123929827
Gradient Descent: cost 1442356851574.5645 num_iter 3 weights [-368206.79849959] bias -1516307.9836765246
Gradient Descent: cost 3431384092382286.0000 num_iter 4 weights [17959331.90233318] bias 73958215.67152676
Gradient Descent: cost 8163303537794547712.0000 num_iter 5 weights [-8.75968484e+08] bias -3607320371.547094
Gradient Descent: cost 19420596137325529923584.0000 num_iter 6 weights [4.27254639e+10] bias 175947467655.5166
Gradient Descent: cost 46201829024600482681716736.0000 num_iter 7 weights [-2.08393943e+12] bias -8581858043987.767
Gradient Descent: cost 109914700358543076691231637504.0000 num_iter 8 weights [1.01644386e+14] bias 418581116681620.44
Gradient Descent: cost 261488378489855196826018032648192.00

In [24]:
# Retry with a first argument of 0.0001 (presumably the learning rate — confirm against LinearRegression.py).
# In the recorded run the cost still increases each iteration, only more slowly: still diverging.
lr = LinearRegression(0.0001, 10, 1)
lr.fit(X, y)

Gradient Descent: cost 111.8581 num_iter 0 weights [0.32354288] bias 14.022400000000001
Gradient Descent: cost 2439.9522 num_iter 1 weights [-1.21824504] bias -50.964584648240006
Gradient Descent: cost 54859.9870 num_iter 2 weights [6.09203513] bias 258.7197073568133
Gradient Descent: cost 1235123.7748 num_iter 3 weights [-28.60034391] bias -1209.62682180916
Gradient Descent: cost 27809340.2970 num_iter 4 weights [136.01265289] bias 5758.701778341374
Gradient Descent: cost 626140780.6142 num_iter 5 weights [-645.08747089] bias -27305.54706664375
Gradient Descent: cost 14097864948.7583 num_iter 6 weights [3061.26777947] bias 129586.6143462091
Gradient Descent: cost 317420303846.5362 num_iter 7 weights [-14525.57104702] bias -614873.7544172539
Gradient Descent: cost 7146872927991.7627 num_iter 8 weights [68924.83443416] bias 2917627.485689399
Gradient Descent: cost 160915329067909.7188 num_iter 9 weights [-327051.40956812] bias -13844262.68446244
cost 160915329067909.7188 num_iter 9 weig

In [25]:
# Retry with a first argument of 0.00001 (presumably the learning rate — confirm against LinearRegression.py).
# In the recorded run the cost now falls on every iteration (111.86 -> 4.41 over 10 iterations),
# so this is the first value tried that is small enough for the unscaled feature.
lr = LinearRegression(0.00001, 10, 1)
lr.fit(X, y)

Gradient Descent: cost 111.8581 num_iter 0 weights [0.03235429] bias 14.022490000000001
Gradient Descent: cost 25.6102 num_iter 1 weights [0.02070617] bias 7.523791535176
Gradient Descent: cost 9.4511 num_iter 2 weights [0.02781289] bias 9.863438185319634
Gradient Descent: cost 6.2394 num_iter 3 weights [0.02674158] bias 8.435979665290638
Gradient Descent: cost 5.4410 num_iter 4 weights [0.02906186] bias 8.651161919434344
Gradient Descent: cost 5.1116 num_iter 5 weights [0.0298142] bias 8.185109574847393
Gradient Descent: cost 4.8898 num_iter 6 weights [0.03113439] bias 8.03399546539259
Gradient Descent: cost 4.7067 num_iter 7 weights [0.03211191] bias 7.768820078348443
Gradient Descent: cost 4.5473 num_iter 8 weights [0.03314358] bias 7.572475717768244
Gradient Descent: cost 4.4069 num_iter 9 weights [0.03406465] bias 7.365253050602965
cost 4.4069 num_iter 9 weights [0.03406465] bias 7.365253050602965


In [27]:
# Running more iterations (100 instead of 10) does not fix the divergence seen with 0.01 on unscaled X:
# in the recorded run the cost overflows to inf and the weights blow up.
lr = LinearRegression(0.01, 100, 0)
lr.fit(X, y)

cost inf num_iter 99 weights [-5.98948079e+267] bias -2.4586963119742486e+267


In [30]:
# Switch to three separate features, one column per advertising channel.
# y is left unchanged.
X = df.loc[:, ['TV', 'radio', 'newspaper']].to_numpy()

In [31]:
# Fit on the three raw (unscaled) features.
# In the recorded run the cost is astronomically large after 10 iterations, i.e. the fit diverged.
lr = LinearRegression(0.01, 10, 0)
lr.fit(X, y)

cost 42341498589290172711680801058506152805547376640.0000 num_iter 9 weights [-4.93259509e+23 -6.09118804e+22 -8.07395357e+22] bias -2.537418223078988e+23


In [32]:
# refer to Lab02_ standardization_scaling_normalization.ipynb
# refer to Lab02_ standardization_scaling_normalization.ipynb
# Z-score standardization: subtract the column mean, then divide by the column
# standard deviation (pandas' std() is the sample standard deviation, ddof=1).
df['scaled_TV'] = df['TV'].sub(df['TV'].mean()).div(df['TV'].std())
df.head()

Unnamed: 0,TV,radio,newspaper,Total spent,sales,y_pred,scaled_TV
0,230.1,37.8,69.2,337.1,22.1,20.65977,0.967425
1,44.5,39.3,45.1,128.9,10.4,10.52043,-1.194379
2,17.2,45.9,69.3,132.4,9.3,10.69088,-1.51236
3,151.5,41.3,58.5,251.3,18.5,16.48131,0.051919
4,180.8,10.8,58.4,250.0,12.9,16.418,0.393196


In [34]:
# Standardize the remaining two features with the same z-score formula:
# (value - column mean) / column standard deviation.
# The loop order fixes the order in which the new columns are appended.
for column in ('radio', 'newspaper'):
    df[f'scaled_{column}'] = (df[column] - df[column].mean()) / df[column].std()
df.head()

Unnamed: 0,TV,radio,newspaper,Total spent,sales,y_pred,scaled_TV,scaled_radio,scaled_newspaper
0,230.1,37.8,69.2,337.1,22.1,20.65977,0.967425,0.979066,1.774493
1,44.5,39.3,45.1,128.9,10.4,10.52043,-1.194379,1.080097,0.667903
2,17.2,45.9,69.3,132.4,9.3,10.69088,-1.51236,1.524637,1.779084
3,151.5,41.3,58.5,251.3,18.5,16.48131,0.051919,1.214806,1.283185
4,180.8,10.8,58.4,250.0,12.9,16.418,0.393196,-0.839507,1.278593


In [36]:
# Use the standardized features as the model input.
# Columns are selected by name rather than by position, so the result does not
# depend on the order in which columns were added to df.
X = df[['scaled_TV', 'scaled_radio', 'scaled_newspaper']].values
# Keep the same y

In [37]:
# Fit on the standardized features.
# In the recorded run the same first argument (0.01) that diverged on raw data now converges:
# final cost 1.3921 after 1000 iterations.
lr = LinearRegression(0.01, 1000, 0)
lr.fit(X, y)

cost 1.3921 num_iter 999 weights [ 3.92889594  2.79684426 -0.02031832] bias 14.012500000000001


In [43]:
# Predict sales for the first row of X (standardized features when X comes from the scaled columns).
# NOTE(review): predict() is given a single 1-D sample here, and the recorded output shows it also prints a
# "Gradient Descent: ..." status line — confirm both are intended in LinearRegression.py.
# The raw values quoted in the final message are hard-coded, not read from df.
print(f'X[0]: {X[0]}')
predicted_sales = lr.predict(X[0])
print(f'Predicted Sales: {predicted_sales} with scaled of TV 230.1, radio 37.8 and newspaper 69.2')

X[0]: [0.9674246  0.97906559 1.77449253]
Gradient Descent: cost 1.3921 num_iter 999 weights [ 3.92889594  2.79684426 -0.02031832] bias 14.012500000000001
Predicted Sales: 20.515649839549084 with scaled of TV 230.1, radio 37.8 and newspaper 69.2
