In [1]:
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib tk


# Importing the dataset
# The CSV is expected to hold the predictors in every column but the last,
# and the target in the last column.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
# Select the target relative to the end of the frame, mirroring the ':-1'
# used for X, so the features and the target can never overlap or drift apart
# if the file gains or loses a column (for the five-column file this is
# still column 4).
y = dataset.iloc[:, -1].values

#dataset

In [2]:
# Encoding categorical data
# OneHotEncoder(categorical_features=...) was removed from scikit-learn;
# ColumnTransformer is the supported way to one-hot encode a single column
# and leave the remaining columns untouched.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Column 3 holds the categorical variable. OneHotEncoder accepts string
# categories directly, so a separate LabelEncoder pass is not needed.
# - The encoded dummy columns come first in the output, followed by the
#   passthrough columns in their original order (same layout as before).
# - sparse_threshold = 0 forces a dense result, replacing the old .toarray().
column_transformer = ColumnTransformer(
    transformers = [('categorical', OneHotEncoder(), [3])],
    remainder = 'passthrough',
    sparse_threshold = 0,
)
# The passthrough columns arrive as objects because X has mixed types;
# cast everything to float so downstream estimators get a numeric matrix.
X = np.asarray(column_transformer.fit_transform(X), dtype = float)

# Showing the results as integers so that we can see the conversion to dummy variables for 'State'
#X= np.int_(X)
#X

In [3]:
# Dummy variable trap: keeping every dummy column makes them sum to one,
# which duplicates the intercept. Drop the first dummy column (column 0).
X = np.delete(X, 0, axis = 1)

# NOTE(review): the original note here says the Python libraries already
# account for this, so the step is optional and serves as a reminder.
# Confirm that for the estimator you use; dropping the column explicitly is
# harmless either way.


In [4]:
# Splitting the dataset into the Training set and Test set
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# 80% of the rows go to training, 20% to testing; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


# Feature scaling is skipped for this model. The library does NOT scale the
# features for us; it is simply not required here, because rescaling a
# predictor in an ordinary least squares fit only rescales its coefficient.
# Feature Scaling (disabled: kept as a string for reference; because it is the
# last expression in the cell, its text is echoed as the cell output)
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""





'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train = sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train = sc_y.fit_transform(y_train)'

In [5]:
# Train a multiple linear regression model on the training split.
from sklearn.linear_model import LinearRegression
# fit() returns the fitted estimator, so construction and training can be chained.
regressor = LinearRegression().fit(X_train, y_train)

# Use the trained model to predict the target for the held-out test rows.
y_pred = regressor.predict(X_test)

In [6]:
# Building the optimal model using Backward Elimination
# The array-based OLS class is exposed by statsmodels.api. Current releases of
# statsmodels.formula.api only provide the lowercase formula interface (ols),
# so sm.OLS would raise AttributeError with the old import.
import statsmodels.api as sm

# statsmodels must be told explicitly that the model has a constant b0.
# The linear regression equation is y = b0 + b1*x1 + b2*x2 ...
# Without a column of ones in the design matrix, OLS fits y = b1*x1 + b2*x2 ...
# sklearn's LinearRegression adds the intercept itself; statsmodels' OLS does not.

# The ones go in as the FIRST column, so they are passed as 'arr' and the
# existing features as 'values'. The row count is taken from X instead of being
# hard-coded to 50, so the cell keeps working if the dataset changes size.
# NOTE: this cell is not idempotent; re-running it prepends another column of ones.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)





In [7]:
# ----------------------------------------------- Backward Elimination ---------------------------------------------------
# Build a matrix of "optimal" features by starting with every column of X and
# repeatedly dropping the predictor with the highest p-value.
from IPython.display import display

# Step 1: fit with all columns of X (column 0 is the column of ones that acts
# as the intercept; OLS does not add one by default). OLS = ordinary least squares.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Only the last expression of a cell is rendered automatically, so this first
# summary has to be displayed explicitly; otherwise the p-values that justify
# the next step are never shown.
# The lower the p-value, the more significant the predictor.
display(regressor_OLS.summary())

# Step 2: x2 (column 2 of X) has a p-value far above 5%, so it is removed.
# Always map the summary's xN label back to the index in X before dropping.
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Fri, 17 Nov 2017",Prob (F-statistic):,8.49e-29
Time:,12:14:06,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [8]:
# In the previous summary x1 (column 1 of X) has p = 0.940, far above 5%,
# so that column is dropped and the model is refitted on columns 0, 3, 4, 5.
X_opt = np.take(X, [0, 3, 4, 5], axis = 1)
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Fri, 17 Nov 2017",Prob (F-statistic):,4.53e-30
Time:,12:14:06,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [9]:
# In the previous summary x2 (column 4 of X) has p = 0.602, far above 5%,
# so that column is dropped and the model is refitted on columns 0, 3, 5.
X_opt = np.take(X, [0, 3, 5], axis = 1)
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Fri, 17 Nov 2017",Prob (F-statistic):,2.1600000000000003e-31
Time:,12:14:06,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [10]:
# This means that R&D spend and Marketing spend both have a high impact on the profit. The last remaining p-value (0.060) is slightly above the 5% significance level, so we are not applying that threshold strictly here, but we could.