In [66]:
#     AUTHOR:     ERYL KENN VICTORINO
#     PURPOSE:    MULTIPLE LINEAR REGRESSION TUTORIAL
#                 from 'Machine Learning A-Z™: Hands-On Python & R In Data Science' on Udemy
#                 by Kirill Eremenko, Hadelin de Ponteves, and the SuperDataScience Team
#     MOD DATE:   3/19/2019

In [67]:
#   IMPORT LIBRARIES
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import statsmodels.formula.api as sm
# Rebind `sm` to statsmodels.api: the array-based OLS class used in this notebook lives
# there, and current statsmodels releases no longer expose it from statsmodels.formula.api.
import statsmodels.api as sm

In [68]:
#   IMPORT DATASET
# Read the CSV from the notebook's working directory.
dataset = pd.read_csv('50_Startups.csv')
# Every column except the last is a predictor; the last column is the target.
# Both slices key off the last column so they cannot disagree about where the target is.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [69]:
#     ENCODE CATEGORICAL VARIABLES
# One-hot encode the categorical column at index 3. In both branches the dummy columns come
# first in the result, followed by the remaining columns in their original order, and X ends
# up as a dense float array.
try:
    # scikit-learn >= 0.20: pick the column with ColumnTransformer. OneHotEncoder's
    # `categorical_features` argument was removed in scikit-learn 0.22.
    from sklearn.compose import ColumnTransformer
except ImportError:
    # Older scikit-learn: label-encode the column, then one-hot encode it by index.
    labelencoder = LabelEncoder()
    X[:, 3] = labelencoder.fit_transform(X[:, 3])
    onehotencoder = OneHotEncoder(categorical_features = [3])
    X = onehotencoder.fit_transform(X).toarray()
else:
    columntransformer = ColumnTransformer(
        [('onehot', OneHotEncoder(), [3])],
        remainder = 'passthrough',   # keep the other columns, placed after the dummies
        sparse_threshold = 0,        # always return a dense array
    )
    # The passthrough columns come from an object array, so cast the result to float.
    X = np.asarray(columntransformer.fit_transform(X), dtype = float)
    # Keep `onehotencoder` bound to the fitted encoder, as in the legacy branch.
    onehotencoder = columntransformer.named_transformers_['onehot']

In [70]:
#   PREVENT DUMMY VARIABLE TRAP
# Remove the first dummy column so the remaining dummies are not perfectly collinear.
X = np.delete(X, 0, axis = 1)

In [71]:
#   TRAINING/TEST SET SPLIT
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [72]:
#   FEATURE SCALING
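# No feature scaling is needed here: ordinary least squares predictions do not change when
# the features are rescaled (the coefficients simply rescale to compensate). Note that the
# LinearRegression fitted below does NOT scale the inputs itself (its repr shows normalize=False).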

In [73]:
#   MULTIPLE LINEAR REGRESSION FIT ON TRAINING SET
# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, y_train)
# Bare last expression: display the fitted estimator's repr as the cell output.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [74]:
#   MULTIPLE LINEAR REGRESSION PREDICTION ON TRAINING SET AND TEST SET
# Predict on the training set first, then on the held-out test set.
y_train_pred, y_test_pred = map(regressor.predict, (X_train, X_test))

In [75]:
#   MANUAL BACKWARDS ELIMINATION TO FIND OPTIMAL MODEL
# statsmodels' OLS fits exactly the columns it is given, so prepend a column of ones to act
# as the intercept. The row count comes from X itself rather than a hard-coded 50.
# NOTE: this rebinds X, so running the cell twice prepends a second column of ones;
# re-run the notebook from the top instead of re-running this cell.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)
# Column subsets in the order the predictors were eliminated by hand. After the loop,
# X_opt1 and regressor_OLS hold the last (smallest) model.
for kept_columns in ([0, 1, 2, 3, 4, 5], [0, 1, 3, 4, 5], [0, 3, 4, 5], [0, 3, 5], [0, 3]):
    X_opt1 = X[:, kept_columns]
    regressor_OLS = sm.OLS(endog = y, exog = X_opt1).fit()
# Bare last expression: the final model's summary is the cell output.
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 24 Mar 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,14:03:18,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [76]:
#   AUTOMATIC BACKWARDS ELIMINATION TO FIND OPTIMAL MODEL USING ONLY P-VALUES
def backwardElimination1(x, sl):
    """Backward elimination driven only by p-values.

    Repeatedly fits an OLS model of the notebook-level target ``y`` on ``x`` and
    drops the single column with the largest p-value, until no p-value exceeds
    ``sl`` or no columns are left.

    Parameters
    ----------
    x : 2-D array-like
        Candidate predictors, one column per predictor. The columns are fitted
        as given, so any intercept column must already be part of ``x``.
    sl : float
        Significance level; a column is dropped only while its p-value is
        strictly greater than this value.

    Returns
    -------
    numpy.ndarray
        The surviving columns of ``x`` in their original relative order.

    Notes
    -----
    Reads the notebook-level names ``y`` (target vector) and ``sm`` (statsmodels).
    """
    x = np.asarray(x)
    while x.shape[1] > 0:
        pvalues = np.asarray(sm.OLS(y, x).fit().pvalues, dtype=float)
        # Exactly one column is dropped per fit; with tied p-values the first wins.
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > sl:
            # Every remaining predictor is significant: stop without refitting.
            break
        x = np.delete(x, worst, axis=1)
    return x
# Significance level used as the p-value cut-off.
sl = 0.05
# Start from all six columns of X (fancy indexing hands the function its own copy).
X_opt2 = backwardElimination1(X[:, [0, 1, 2, 3, 4, 5]], sl)

In [77]:
#   AUTOMATIC BACKWARDS ELIMINATION TO FIND OPTIMAL MODEL USING P-VALUES AND ADJUSTED R2
def backwardElimination2(x, SL):
    """Backward elimination using p-values, guarded by adjusted R-squared.

    Repeatedly fits an OLS model of the notebook-level target ``y`` on ``x``.
    While the largest p-value exceeds ``SL``, the corresponding column is
    tentatively removed and the model refitted. If the adjusted R-squared did
    not increase, the removal is rejected: the summary of the model before the
    removal is printed and the matrix before the removal is returned.
    Otherwise the removal is kept and the procedure continues.

    Parameters
    ----------
    x : 2-D array-like
        Candidate predictors, one column per predictor. The columns are fitted
        as given, so any intercept column must already be part of ``x``.
    SL : float
        Significance level; a column is a removal candidate only while its
        p-value is strictly greater than this value.

    Returns
    -------
    numpy.ndarray
        The selected columns of ``x`` in their original relative order, with
        their original values and dtype.

    Notes
    -----
    Reads the notebook-level names ``y`` (target vector) and ``sm`` (statsmodels).
    A summary is printed only when elimination stops on the adjusted R-squared
    check; nothing is printed when it stops on the p-value threshold.
    """
    x = np.asarray(x)
    while x.shape[1] > 0:
        regressor_OLS = sm.OLS(y, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        # Exactly one candidate per fit; with tied p-values the first wins.
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > SL:
            # Every remaining predictor is significant.
            break
        adjR_before = float(regressor_OLS.rsquared_adj)
        x_reduced = np.delete(x, worst, axis=1)
        adjR_after = float(sm.OLS(y, x_reduced).fit().rsquared_adj)
        if adjR_before >= adjR_after:
            # Removing the column did not help: keep the matrix from before the removal.
            # `x` itself is returned, so no column needs to be rebuilt from a scratch buffer.
            print(regressor_OLS.summary())
            return x
        x = x_reduced
    return x
# Significance level used as the p-value cut-off.
SL = 0.05
# Start from all six columns of X (fancy indexing hands the function its own copy).
X_opt3 = backwardElimination2(X[:, [0, 1, 2, 3, 4, 5]], SL)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     450.8
Date:                Sun, 24 Mar 2019   Prob (F-statistic):           2.16e-31
Time:                        14:03:26   Log-Likelihood:                -525.54
No. Observations:                  50   AIC:                             1057.
Df Residuals:                      47   BIC:                             1063.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.698e+04   2689.933     17.464      0.0