In [1]:
# Importing libraries
# -------------------
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
# ---------------------
# Read the raw CSV and rename the two headers containing a space or '&' so
# that every column name is a valid identifier for the statsmodels formula
# strings used further down in the notebook.
dataset = pd.read_csv('50_Startups.csv').rename(
    columns={'R&D Spend': 'R_D_Spend', 'Marketing Spend': 'Marketing_Spend'}
)
dataset.head()

Unnamed: 0,R_D_Spend,Administration,Marketing_Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Encoding categorical data
# -------------------------
# One-hot encode only the State column, then drop the New York indicator so
# that New York acts as the baseline level (avoids the dummy-variable trap).
# dtype=int pins the indicators to 0/1 integers regardless of pandas version;
# pandas >= 2.0 otherwise emits booleans, which the formula interface would
# treat as categorical terms rather than numeric columns.
dataset = pd.get_dummies(dataset, columns=["State"], dtype=int).drop(columns="State_New York")
dataset.head()

Unnamed: 0,R_D_Spend,Administration,Marketing_Spend,Profit,State_California,State_Florida
0,165349.2,136897.8,471784.1,192261.83,0,0
1,162597.7,151377.59,443898.53,191792.06,1,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,0,0
4,142107.34,91391.77,366168.42,166187.94,0,1


In [4]:
# Select predictors and target by column name instead of by position.
# After one-hot encoding, Profit is no longer the last column, so positional
# slicing would leak Profit into X and would not reliably pick Profit as y.
X = dataset.drop(columns="Profit").values
y = dataset["Profit"].values

In [5]:
# Train / test split
# ------------------
# Hold out 20% of the rows for testing; random_state=0 makes the shuffle
# reproducible from run to run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [6]:
# Backward elimination, step 1: fit an OLS model on every candidate predictor
# and inspect the per-term p-values in the summary table.
import statsmodels.formula.api as sm

full_model = sm.ols(
    formula='Profit ~ R_D_Spend + Administration + Marketing_Spend + State_California + State_Florida',
    data=dataset,
)
regressor_OLS = full_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Fri, 27 Nov 2020",Prob (F-statistic):,1.34e-27
Time:,21:45:06,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.008e+04,6952.587,7.204,0.000,3.61e+04,6.41e+04
R_D_Spend,0.8060,0.046,17.369,0.000,0.712,0.900
Administration,-0.0270,0.052,-0.517,0.608,-0.132,0.078
Marketing_Spend,0.0270,0.017,1.574,0.123,-0.008,0.062
State_California,41.8870,3256.039,0.013,0.990,-6520.229,6604.003
State_Florida,240.6758,3338.857,0.072,0.943,-6488.349,6969.701

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1470000.0


In [7]:
# Backward elimination, next step: refit without the two state indicators,
# which had the largest p-values in the previous summary.
reduced_model = sm.ols(
    formula='Profit ~ R_D_Spend + Administration + Marketing_Spend',
    data=dataset,
)
regressor_OLS = reduced_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Fri, 27 Nov 2020",Prob (F-statistic):,4.53e-30
Time:,21:45:06,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
R_D_Spend,0.8057,0.045,17.846,0.000,0.715,0.897
Administration,-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing_Spend,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [8]:
# Remove the state indicator columns that backward elimination discarded.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it a second time no longer raises KeyError
# once the columns are already gone.
dataset = dataset.drop(columns=["State_Florida", "State_California"], errors="ignore")
dataset.head()

Unnamed: 0,R_D_Spend,Administration,Marketing_Spend,Profit
0,165349.2,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [9]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Rebuild the design matrix from the predictors kept by backward elimination.
# The X_train / X_test arrays created earlier in the notebook were sliced
# before the state indicators were dropped, so fitting on them would ignore
# the feature selection above. Columns are chosen by name so the target
# (Profit) can never end up among the predictors.
feature_columns = ["R_D_Spend", "Administration", "Marketing_Spend"]
X = dataset[feature_columns].values
y = dataset["Profit"].values

# Same split parameters as the earlier split cell, so the same rows land in
# the training and test sets; only the columns differ.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)