In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
dataset = pd.read_csv('50_Startups.csv')

In [3]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [5]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

Encode the categorical data and Encode the independent variable

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
# Index column is 3 for the State
X[:,3] = labelencoder_X.fit_transform(X[:,3])
# Index column is 3 for the State
one_hot_encoder = OneHotEncoder(categorical_features = [3])
X = one_hot_encoder.fit_transform(X).toarray()

In [9]:
Encoded_df = pd.DataFrame(X)

In [10]:
Encoded_df.head()

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,1.0,165349.2,136897.8,471784.1
1,1.0,0.0,0.0,162597.7,151377.59,443898.53
2,0.0,1.0,0.0,153441.51,101145.55,407934.54
3,0.0,0.0,1.0,144372.41,118671.85,383199.62
4,0.0,1.0,0.0,142107.34,91391.77,366168.42


In [11]:
Encoded_df.rename_axis(mapper={0:"California",1:"Florida",2:"New York",3:"R&D Spend",4:'Administration',5:"Marketing Spend"},axis=1,inplace=True)

In [12]:
Encoded_df.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,1.0,165349.2,136897.8,471784.1
1,1.0,0.0,0.0,162597.7,151377.59,443898.53
2,0.0,1.0,0.0,153441.51,101145.55,407934.54
3,0.0,0.0,1.0,144372.41,118671.85,383199.62
4,0.0,1.0,0.0,142107.34,91391.77,366168.42


Avoid Dummy Variable Trap

In [11]:
# Class will take care of this 
# X = X[:,1:]

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=0)

Fit Multiple Linear Regression to the Training Set

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
regressor = LinearRegression()

In [16]:
regressor = regressor.fit(X_train,y_train)

Prediction of Test Set Results

In [17]:
y_pred = regressor.predict(X_test)

In [18]:
#compare the prediction with the test
predictions = pd.DataFrame(y_pred)

In [19]:
test_data_profits = pd.DataFrame(y_test)

In [20]:
test_data_profits.rename_axis(mapper={0:"Acutal Profit"},axis=1,inplace=True)

In [21]:
test_data_profits['Predicted Profits'] = predictions
test_data_profits

Unnamed: 0,Acutal Profit,Predicted Profits
0,103282.38,103015.201598
1,144259.4,132582.277608
2,146121.95,132447.738452
3,77798.83,71976.098513
4,191050.39,178537.482211
5,105008.31,116161.242302
6,81229.06,67851.692097
7,97483.56,98791.733747
8,110352.25,113969.43533
9,166187.94,167921.065696


Backward Elimination

In [22]:
# Look for the more powerful predictors

In [23]:
# Use stats models to evaluate the statistical signifance and p values of the independent variables
import statsmodels.formula.api as sm

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
X = np.append(arr = np.ones((50,1)).astype(int), values = X , axis=1)

In [26]:
X_optimal_matrix_of_features = X[:,[0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog=y, exog = X_optimal_matrix_of_features).fit()

In [27]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,2.9e-28
Time:,14:06:21,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.122e+04,4607.941,8.945,0.000,3.19e+04,5.05e+04
x1,1.339e+04,2421.500,5.529,0.000,8511.111,1.83e+04
x2,1.448e+04,2518.987,5.748,0.000,9405.870,1.96e+04
x3,1.335e+04,2459.306,5.428,0.000,8395.623,1.83e+04
x4,0.8609,0.031,27.665,0.000,0.798,0.924
x5,-0.0527,0.050,-1.045,0.301,-0.154,0.049

0,1,2,3
Omnibus:,14.275,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.26
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,3.34e+17


In [28]:
X_optimal_matrix_of_features = X[:,[0,1,3,4,5]]
regressor_OLS = sm.OLS(endog=y, exog = X_optimal_matrix_of_features).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,2.9e-28
Time:,14:06:22,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.57e+04,6502.532,8.565,0.000,4.26e+04,6.88e+04
x1,-1091.1075,3377.087,-0.323,0.748,-7892.910,5710.695
x2,-1130.4509,3344.220,-0.338,0.737,-7866.055,5605.154
x3,0.8609,0.031,27.665,0.000,0.798,0.924
x4,-0.0527,0.050,-1.045,0.301,-0.154,0.049

0,1,2,3
Omnibus:,14.275,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.26
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,727000.0


In [29]:
X_optimal_matrix_of_features = X[:,[0,3,4,5]]
regressor_OLS = sm.OLS(endog=y, exog = X_optimal_matrix_of_features).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,278.7
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,1.68e-29
Time:,14:06:22,Log-Likelihood:,-526.81
No. Observations:,50,AIC:,1062.0
Df Residuals:,46,BIC:,1069.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.507e+04,6145.947,8.960,0.000,4.27e+04,6.74e+04
x1,-573.7029,2838.043,-0.202,0.841,-6286.386,5138.981
x2,0.8624,0.030,28.282,0.000,0.801,0.924
x3,-0.0530,0.050,-1.063,0.294,-0.154,0.047

0,1,2,3
Omnibus:,14.902,Durbin-Watson:,1.199
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.212
Skew:,-0.964,Prob(JB):,2.48e-05
Kurtosis:,5.543,Cond. No.,674000.0


In [30]:
X_optimal_matrix_of_features = X[:,[0,3,5]]
regressor_OLS = sm.OLS(endog=y, exog = X_optimal_matrix_of_features).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.01
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.372
Time:,14:06:23,Log-Likelihood:,-599.6
No. Observations:,50,AIC:,1205.0
Df Residuals:,47,BIC:,1211.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.613e+04,2.59e+04,2.942,0.005,2.41e+04,1.28e+05
x1,2555.2116,1.2e+04,0.212,0.833,-2.16e+04,2.68e+04
x2,0.2885,0.205,1.404,0.167,-0.125,0.702

0,1,2,3
Omnibus:,0.119,Durbin-Watson:,0.097
Prob(Omnibus):,0.942,Jarque-Bera (JB):,0.139
Skew:,0.099,Prob(JB):,0.933
Kurtosis:,2.835,Cond. No.,567000.0


In [31]:
X_optimal_matrix_of_features = X[:,[0,3]]
regressor_OLS = sm.OLS(endog=y, exog = X_optimal_matrix_of_features).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.02
Method:,Least Squares,F-statistic:,0.04727
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.829
Time:,14:06:24,Log-Likelihood:,-600.63
No. Observations:,50,AIC:,1205.0
Df Residuals:,48,BIC:,1209.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.111e+05,7085.628,15.682,0.000,9.69e+04,1.25e+05
x1,2642.1322,1.22e+04,0.217,0.829,-2.18e+04,2.71e+04

0,1,2,3
Omnibus:,0.011,Durbin-Watson:,0.021
Prob(Omnibus):,0.994,Jarque-Bera (JB):,0.082
Skew:,0.022,Prob(JB):,0.96
Kurtosis:,2.807,Cond. No.,2.41


In [32]:
# The optimal variable is the R&D spend

Backward Elimination with p-values only

In [53]:
import statsmodels.formula.api as sm
def backwardElimination(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
 
SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, SL)

In [35]:
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

In [37]:
X[0]

array([1.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00,
       1.653492e+05, 1.368978e+05, 4.717841e+05])

Backward Elimination with p-values and Adjusted R Squared

In [54]:
import statsmodels.formula.api as sm
def backwardElimination(x, SL):
    numVars = len(x[0])
    temp = np.zeros((50,6)).astype(int)
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    temp[:,j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = sm.OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        x_rollback = np.hstack((x, temp[:,[0,j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print (regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
    regressor_OLS.summary()
    return x
 
SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, SL)