# **Multiple Linear Regression**

## **Data Preprocessing**

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the startup data from the CSV file and preview the first rows
csv_path = '50_Startups.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Count missing (NaN) entries per column
dataset.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [None]:
# EDA: summary statistics, transposed so each feature is one row
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
R&D Spend,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
Administration,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
Marketing Spend,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
Profit,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


In [None]:
# Correlation between the numeric columns only: 'State' holds strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead of
# silently skipping them, so select the numeric columns explicitly.
# Observation: R&D Spend has the strongest correlation with Profit.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [None]:
# Treat a spend of exactly 0 as missing (the notebook assumes a zero spend is
# not a meaningful value). The replacement is restricted to the two spend
# columns so that zeros in any other column of the same rows are left alone.
zero_as_missing = ['R&D Spend', 'Marketing Spend']
dataset[zero_as_missing] = dataset[zero_as_missing].replace(0, np.nan)

In [None]:
# Count the NaN entries per column after the zero replacement
dataset.isna().sum()

R&D Spend          2
Administration     0
Marketing Spend    3
State              0
Profit             0
dtype: int64

In [None]:
# Collect the names of the float64 (numerical) columns
num_col = [name for name, dtype in dataset.dtypes.items() if dtype == 'float64']
num_col

['R&D Spend', 'Administration', 'Marketing Spend', 'Profit']

In [None]:
from sklearn.impute import SimpleImputer

# Fill the missing values of every numeric column with that column's mean.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split further down — confirm this is acceptable for the analysis.
imputer = SimpleImputer(strategy='mean')
imputer.fit(dataset[num_col])
values = imputer.transform(dataset[num_col])
dataset[num_col] = values

# Summary statistics after imputation, one feature per row
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
R&D Spend,50.0,76793.349583,43312.151465,542.05,46117.0325,75791.365,101602.8,165349.2
Administration,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
Marketing Spend,50.0,224494.784894,109792.846033,1903.93,142431.385,224494.784894,299469.085,471784.1
Profit,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


In [None]:
# Correlation matrix of the numeric columns after imputation.
# 'State' is still a string column at this point, and DataFrame.corr() raises
# on non-numeric columns in pandas >= 2.0, so select numeric columns explicitly.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.268979,0.666533,0.881354
Administration,0.268979,1.0,-0.07059,0.200717
Marketing Spend,0.666533,-0.07059,1.0,0.693088
Profit,0.881354,0.200717,0.693088,1.0


In [None]:
# Handling categorical variables: collect the names of the object-dtype columns
cat_col = [name for name, dtype in dataset.dtypes.items() if dtype == 'object']
cat_col

['State']

In [None]:
# One-hot encode the categorical column(s), dropping the first level of each
# so the indicator columns are not perfectly collinear with an intercept.
#
# pd.get_dummies replaces OneHotEncoder(sparse=False, drop='first') here:
#   * the `sparse` keyword was renamed to `sparse_output` and later removed
#     from scikit-learn, so the old call fails on current releases;
#   * the indicators get descriptive names (e.g. 'State_Florida') instead of
#     the integer labels 0, 1 mixed in with string column names.
# Levels are sorted, the indicators are appended after the existing columns
# and stored as floats, so the resulting values and column order are unchanged.
new_df = pd.get_dummies(dataset, columns=cat_col, drop_first=True, dtype=float)
new_df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,0,1
0,165349.2,136897.8,471784.1,192261.83,0.0,1.0
1,162597.7,151377.59,443898.53,191792.06,0.0,0.0
2,153441.51,101145.55,407934.54,191050.39,1.0,0.0
3,144372.41,118671.85,383199.62,182901.99,0.0,1.0
4,142107.34,91391.77,366168.42,166187.94,1.0,0.0


In [None]:
# Pearson correlation of all columns, including the encoded State indicators
new_df.corr(method='pearson')

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,0,1
R&D Spend,1.0,0.268979,0.666533,0.881354,0.062887,-0.010015
Administration,0.268979,1.0,-0.07059,0.200717,0.010493,0.005145
Marketing Spend,0.666533,-0.07059,1.0,0.693088,0.144084,0.047958
Profit,0.881354,0.200717,0.693088,1.0,0.116244,0.031368
0,0.062887,0.010493,0.144084,0.116244,1.0,-0.492366
1,-0.010015,0.005145,0.047958,0.031368,-0.492366,1.0


In [None]:
# Separate the dataset into the dependent variable (target) and the
# independent variables (features), both as NumPy arrays
dep = 'Profit'
feature_cols = [col for col in new_df.columns if col != dep]
X = new_df[feature_cols].values
y = new_df[dep].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the fixed random_state makes
# the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)

## **Fitting Model on train set**

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares linear model on the training split
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## **Prediction on test set**

In [None]:
# Predict the target for the held-out test rows and display the predictions
y_pred = regressor.predict(X=X_test)
y_pred

array([107344.0656611 , 121390.44661097, 132316.4586238 ,  74216.57489357,
       179645.41603457, 119782.42764804,  65399.93734631,  95491.72478471,
       119974.98813275, 167793.82421704,  98719.22638954,  86546.58757724,
       110744.8964275 ,  88484.11860765, 130576.11570289, 162392.99956582,
       151547.85388251])

## **Model Coefficients**

In [None]:
# Fitted coefficient array of the linear model trained on X_train
regressor.coef_

array([6.83504116e-01, 1.06648980e-01, 7.33706959e-02, 9.58256035e+03,
       6.47917928e+03])

## **Model Intercept**

In [None]:
# Fitted intercept term of the linear model trained on X_train
regressor.intercept_

24467.441289251627

## **Score**

In [None]:
# Score the fitted model on the held-out test split
# (for LinearRegression, score() returns the R^2 coefficient of determination)
regressor.score(X=X_test, y=y_test)

0.8711047173002344

## **Feature Elimination using statsmodels**

In [None]:
# Build the design matrix for statsmodels: every feature except the target,
# with each State level one-hot encoded (no level is dropped here).
#
# pd.get_dummies replaces OneHotEncoder(sparse=False): the `sparse` keyword
# was renamed to `sparse_output` and later removed from scikit-learn, and
# get_dummies yields named indicator columns rather than integer labels.
# The column order (numeric features, then the sorted State levels) is the
# same as before, so the positional column selections in later cells still apply.
new_df = pd.get_dummies(dataset.drop(columns='Profit'), columns=cat_col, dtype=float)

# statsmodels' OLS does not add an intercept by itself, so prepend a column of ones.
# NOTE(review): a constant plus a full set of State indicators is perfectly
# collinear (dummy-variable trap); the later elimination steps remove
# indicator columns one at a time.
new_X = np.append(np.ones((len(new_df), 1)), new_df.to_numpy(), axis=1)

In [None]:
import statsmodels.api as sm

# Fit ordinary least squares of y on the full design matrix (constant column
# included in new_X) and display the regression summary
ols_model = sm.OLS(endog=y, exog=new_X)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.802
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,35.69
Date:,"Tue, 31 Aug 2021",Prob (F-statistic):,2.02e-14
Time:,08:19:13,Log-Likelihood:,-560.14
No. Observations:,50,AIC:,1132.0
Df Residuals:,44,BIC:,1144.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.17e+04,1.07e+04,2.950,0.005,1e+04,5.34e+04
x1,0.7076,0.093,7.591,0.000,0.520,0.895
x2,0.0103,0.107,0.096,0.924,-0.206,0.226
x3,0.0633,0.036,1.750,0.087,-0.010,0.136
x4,6428.5599,4890.837,1.314,0.196,-3428.275,1.63e+04
x5,1.305e+04,5530.078,2.359,0.023,1902.950,2.42e+04
x6,1.222e+04,5313.734,2.300,0.026,1513.226,2.29e+04

0,1,2,3
Omnibus:,54.528,Durbin-Watson:,0.875
Prob(Omnibus):,0.0,Jarque-Bera (JB):,278.512
Skew:,-2.917,Prob(JB):,3.3299999999999996e-61
Kurtosis:,12.983,Cond. No.,3.08e+21


In [None]:
# Backward elimination, step 1: keep every column except index 2
# (Administration, reported as x2 with P>|t| = 0.924 in the previous summary).
# NOTE: new_X is overwritten, so re-running this cell alone slices the
# already reduced matrix again; re-create new_X first.
keep_cols = [0, 1, 3, 4, 5, 6]
new_X = new_X[:, keep_cols]

# Refit on the reduced design matrix and show the summary
regressor_OLS = sm.OLS(endog=y, exog=new_X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.802
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,45.61
Date:,"Tue, 31 Aug 2021",Prob (F-statistic):,2.8e-15
Time:,08:19:13,Log-Likelihood:,-560.15
No. Observations:,50,AIC:,1130.0
Df Residuals:,45,BIC:,1140.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.263e+04,4748.599,6.871,0.000,2.31e+04,4.22e+04
x1,0.7115,0.083,8.551,0.000,0.544,0.879
x2,0.0620,0.033,1.856,0.070,-0.005,0.129
x3,6707.5644,3901.061,1.719,0.092,-1149.575,1.46e+04
x4,1.337e+04,4326.234,3.091,0.003,4661.056,2.21e+04
x5,1.254e+04,4087.069,3.069,0.004,4312.882,2.08e+04

0,1,2,3
Omnibus:,54.721,Durbin-Watson:,0.873
Prob(Omnibus):,0.0,Jarque-Bera (JB):,281.572
Skew:,-2.926,Prob(JB):,7.2e-62
Kurtosis:,13.045,Cond. No.,4.91e+21


In [None]:
# Backward elimination, step 2: keep every column except index 3
# (reported as x3 with P>|t| = 0.092 in the previous summary).
# NOTE: new_X is overwritten, so re-running this cell alone slices the
# already reduced matrix again; re-create new_X first.
keep_cols = [0, 1, 2, 4, 5]
new_X = new_X[:, keep_cols]

# Refit on the reduced design matrix and show the summary
regressor_OLS = sm.OLS(endog=y, exog=new_X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.802
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,45.61
Date:,"Tue, 31 Aug 2021",Prob (F-statistic):,2.8e-15
Time:,08:19:13,Log-Likelihood:,-560.15
No. Observations:,50,AIC:,1130.0
Df Residuals:,45,BIC:,1140.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.933e+04,6877.914,5.719,0.000,2.55e+04,5.32e+04
x1,0.7115,0.083,8.551,0.000,0.544,0.879
x2,0.0620,0.033,1.856,0.070,-0.005,0.129
x3,6666.9737,6665.186,1.000,0.323,-6757.399,2.01e+04
x4,5837.0964,6506.197,0.897,0.374,-7267.056,1.89e+04

0,1,2,3
Omnibus:,54.721,Durbin-Watson:,0.873
Prob(Omnibus):,0.0,Jarque-Bera (JB):,281.572
Skew:,-2.926,Prob(JB):,7.2e-62
Kurtosis:,13.045,Cond. No.,894000.0


In [None]:
# Backward elimination, step 3: keep the first four columns, i.e. drop index 4
# (reported as x4 with P>|t| = 0.374 in the previous summary).
# NOTE: new_X is overwritten, so re-running this cell alone slices the
# already reduced matrix again; re-create new_X first.
keep_cols = [0, 1, 2, 3]
new_X = new_X[:, keep_cols]

# Refit on the reduced design matrix and show the summary
regressor_OLS = sm.OLS(endog=y, exog=new_X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.799
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,60.81
Date:,"Tue, 31 Aug 2021",Prob (F-statistic):,4.86e-16
Time:,08:19:13,Log-Likelihood:,-560.59
No. Observations:,50,AIC:,1129.0
Df Residuals:,46,BIC:,1137.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.172e+04,6330.504,6.590,0.000,2.9e+04,5.45e+04
x1,0.7046,0.083,8.522,0.000,0.538,0.871
x2,0.0669,0.033,2.034,0.048,0.001,0.133
x3,3623.4197,5725.075,0.633,0.530,-7900.558,1.51e+04

0,1,2,3
Omnibus:,58.427,Durbin-Watson:,0.905
Prob(Omnibus):,0.0,Jarque-Bera (JB):,341.892
Skew:,-3.123,Prob(JB):,5.74e-75
Kurtosis:,14.185,Cond. No.,658000.0


In [None]:
# Backward elimination, step 4: keep the first three columns, i.e. drop index 3
# (reported as x3 with P>|t| = 0.530 in the previous summary).
# NOTE: new_X is overwritten, so re-running this cell alone slices the
# already reduced matrix again; re-create new_X first.
keep_cols = [0, 1, 2]
new_X = new_X[:, keep_cols]

# Refit on the final design matrix and show the summary
regressor_OLS = sm.OLS(endog=y, exog=new_X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.797
Model:,OLS,Adj. R-squared:,0.788
Method:,Least Squares,F-statistic:,92.19
Date:,"Tue, 31 Aug 2021",Prob (F-statistic):,5.41e-17
Time:,08:19:56,Log-Likelihood:,-560.81
No. Observations:,50,AIC:,1128.0
Df Residuals:,47,BIC:,1133.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.242e+04,6193.750,6.848,0.000,3e+04,5.49e+04
x1,0.7023,0.082,8.557,0.000,0.537,0.867
x2,0.0698,0.032,2.155,0.036,0.005,0.135

0,1,2,3
Omnibus:,59.576,Durbin-Watson:,0.887
Prob(Omnibus):,0.0,Jarque-Bera (JB):,361.705
Skew:,-3.188,Prob(JB):,2.8600000000000003e-79
Kurtosis:,14.531,Cond. No.,620000.0
