# Multiple Linear Regression

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')

In [5]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [10]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

In [20]:
# Encoding categorical data
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categories='auto')
X = onehotencoder.fit_transform(X).toarray()

In [24]:
X

array([[0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 1., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.],
       ...,
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [29]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [39]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

In [40]:
# Building the optimal model using Backword Elimination

In [44]:
import statsmodels.formula.api as sm

In [46]:
X = np.append(arr=np.ones((50, 1)).astype(int), values = X, axis = 1)

In [48]:
X_opt = X[:, [1, 2, 3, 4, 5]]

In [51]:
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [52]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.264
Model:,OLS,Adj. R-squared:,0.232
Method:,Least Squares,F-statistic:,8.417
Date:,"Wed, 30 Oct 2019",Prob (F-statistic):,0.000751
Time:,22:09:36,Log-Likelihood:,-593.0
No. Observations:,50,AIC:,1192.0
Df Residuals:,47,BIC:,1198.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,3.449e+04,6351.436,5.430,0.000,2.17e+04,4.73e+04
x2,3.449e+04,6351.436,5.430,0.000,2.17e+04,4.73e+04
x3,-1.959e+04,2.06e+04,-0.950,0.347,-6.1e+04,2.19e+04
x4,-3.33e+04,2.64e+04,-1.263,0.213,-8.64e+04,1.98e+04
x5,4.821e+04,1.26e+04,3.840,0.000,2.3e+04,7.35e+04

0,1,2,3
Omnibus:,1.845,Durbin-Watson:,0.139
Prob(Omnibus):,0.398,Jarque-Bera (JB):,1.762
Skew:,0.43,Prob(JB):,0.414
Kurtosis:,2.676,Cond. No.,7.73e+16


In [54]:
# Remove the predictorrs with P-value > SL (0.05)

X_opt = X[:, [1, 2, 5]]

regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.914
Model:,OLS,Adj. R-squared:,0.911
Method:,Least Squares,F-statistic:,256.7
Date:,"Wed, 30 Oct 2019",Prob (F-statistic):,2.33e-26
Time:,22:15:30,Log-Likelihood:,-593.77
No. Observations:,50,AIC:,1192.0
Df Residuals:,48,BIC:,1195.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,3.522e+04,1.04e+04,3.391,0.001,1.43e+04,5.61e+04
x2,3.522e+04,1.04e+04,3.391,0.001,1.43e+04,5.61e+04
x3,4.6e+04,2.06e+04,2.238,0.030,4665.183,8.73e+04

0,1,2,3
Omnibus:,2.469,Durbin-Watson:,0.101
Prob(Omnibus):,0.291,Jarque-Bera (JB):,2.36
Skew:,0.498,Prob(JB):,0.307
Kurtosis:,2.625,Cond. No.,1.36e+16


### 5 Methods of Building Models:

1. All-in
2. Backward Elimination
3. Forward Selection
4. Bidirectional Elimination
5. Score Comparison