In [1]:
import pandas as pd

import matplotlib.pyplot as plt

import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

# Backward eliminination we need statsmodel

import statsmodels.formula.api as sm


In [2]:
data = pd.read_csv("Startup_company.csv")

data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
real_x = data.loc[:, "R&D Spend":"State"].values
real_y = data.loc[:, "Profit"].values

In [4]:
Le = LabelEncoder()

In [5]:
real_x[:, 3] = Le.fit_transform(real_x[:, 3])

In [6]:
oneHe = OneHotEncoder(categorical_features=[3])

In [7]:
real_x = oneHe.fit_transform(real_x).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [8]:
# arrangement for backward elimination for handling dummy variables
real_x =real_x[:,1:]

In [9]:
# apply train test split
training_x, test_x, training_y, test_y = train_test_split(real_x, real_y, test_size= 0.2, random_state = 0)

In [10]:
MLR = LinearRegression()

In [11]:
MLR.fit(training_x, training_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
pred_y = MLR.predict(test_x)

In [13]:
# chcecking results
test_y

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

In [14]:
pred_y

array([103015.20159795, 132582.27760816, 132447.73845175,  71976.09851258,
       178537.48221056, 116161.24230165,  67851.69209676,  98791.73374687,
       113969.43533013, 167921.06569552])

In [15]:
# creating unit matrix for xo

real_x = np.append(arr=np.ones((50, 1)).astype(int), values= real_x, axis = 1)


In [16]:
# applying backward elimination model

x_opt = real_x[:,[0,1,2,3,4,5]]

In [17]:
# creating regression model of Ordinary Least Square

reg_OLS =sm.OLS(endog = real_y, exog= x_opt).fit()

In [18]:
# Calling summary of OLS model
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Mon, 09 Nov 2020",Prob (F-statistic):,1.34e-27
Time:,19:49:50,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [19]:
# Eliminating high p value i.e p value more than 0.05, indepedant variables and reiterating the process of OLS

# in present case x3 shows 0.991 p value, So we eliminate x3

# we eliminate 3 as in summary constant is also a variable starting with count 0

x_opt = real_x[:,[0,1,2,4,5]]

reg_OLS =sm.OLS(endog = real_y, exog= x_opt).fit()

reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.613
Model:,OLS,Adj. R-squared:,0.579
Method:,Least Squares,F-statistic:,17.83
Date:,"Mon, 09 Nov 2020",Prob (F-statistic):,7.78e-09
Time:,19:49:50,Log-Likelihood:,-576.91
No. Observations:,50,AIC:,1164.0
Df Residuals:,45,BIC:,1173.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.903e+04,1.84e+04,1.033,0.307,-1.81e+04,5.61e+04
x1,-1703.7028,9337.989,-0.182,0.856,-2.05e+04,1.71e+04
x2,3875.7625,9002.603,0.431,0.669,-1.43e+04,2.2e+04
x3,0.3239,0.133,2.426,0.019,0.055,0.593
x4,0.2507,0.031,7.997,0.000,0.188,0.314

0,1,2,3
Omnibus:,5.729,Durbin-Watson:,1.266
Prob(Omnibus):,0.057,Jarque-Bera (JB):,5.349
Skew:,-0.461,Prob(JB):,0.0689
Kurtosis:,4.311,Cond. No.,1340000.0


In [21]:
# next high p value is 0.856 for x1 i.e eliminate 1 from indepedant variable and reiterate

x_opt = real_x[:,[0,2,4,5]]

reg_OLS =sm.OLS(endog = real_y, exog= x_opt).fit()

reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.613
Model:,OLS,Adj. R-squared:,0.588
Method:,Least Squares,F-statistic:,24.27
Date:,"Mon, 09 Nov 2020",Prob (F-statistic):,1.45e-09
Time:,19:51:30,Log-Likelihood:,-576.93
No. Observations:,50,AIC:,1162.0
Df Residuals:,46,BIC:,1170.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.854e+04,1.8e+04,1.028,0.309,-1.78e+04,5.48e+04
x1,4691.2363,7732.145,0.607,0.547,-1.09e+04,2.03e+04
x2,0.3233,0.132,2.449,0.018,0.058,0.589
x3,0.2495,0.030,8.241,0.000,0.189,0.310

0,1,2,3
Omnibus:,5.975,Durbin-Watson:,1.262
Prob(Omnibus):,0.05,Jarque-Bera (JB):,5.75
Skew:,-0.467,Prob(JB):,0.0564
Kurtosis:,4.374,Cond. No.,1320000.0


In [22]:
# next is x1

x_opt = real_x[:,[0,4,5]]

reg_OLS =sm.OLS(endog = real_y, exog= x_opt).fit()

reg_OLS.summary()

# As can be seen from summary for independant variable 4 and 5 shows less p vale < 0.05. Hence 4 and 5 are most significant
#  variables for optimising the regression model

0,1,2,3
Dep. Variable:,y,R-squared:,0.61
Model:,OLS,Adj. R-squared:,0.593
Method:,Least Squares,F-statistic:,36.71
Date:,"Mon, 09 Nov 2020",Prob (F-statistic):,2.5e-10
Time:,19:52:54,Log-Likelihood:,-577.13
No. Observations:,50,AIC:,1160.0
Df Residuals:,47,BIC:,1166.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.022e+04,1.77e+04,1.143,0.259,-1.54e+04,5.58e+04
x1,0.3237,0.131,2.468,0.017,0.060,0.588
x2,0.2488,0.030,8.281,0.000,0.188,0.309

0,1,2,3
Omnibus:,6.584,Durbin-Watson:,1.279
Prob(Omnibus):,0.037,Jarque-Bera (JB):,6.524
Skew:,-0.512,Prob(JB):,0.0383
Kurtosis:,4.443,Cond. No.,1300000.0
