# Multiple Linear Regression

In [224]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
pd.options.display.float_format = '{:,.2f}'.format

In [225]:
df = pd.read_csv('./50_Startups.csv')
# print(f'{df.head()}')
# print('---------------------------------------------------')
# print(f'{df.describe()}')
# print('---------------------------------------------------')
# print(f'{df.info()}')
# print('---------------------------------------------------')
# print(f'{df.columns}')

### Getting X & y

In [226]:
X = df.iloc[:,:-1].values
y = df['Profit'].values

In [227]:
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (50, 4)
y shape: (50,)


### Encoding categorical data

In [228]:
label_encoder_X = LabelEncoder()
X[:, 3] = label_encoder_X.fit_transform(X[:, 3])

### Encoding the independent variable

In [229]:
hot_encoder = OneHotEncoder(categorical_features = [3])
X = hot_encoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### Avoiding the dummy variable trap

In [230]:
X = X[:,1:]

### Splitting data between train and test

In [231]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=0)

### Encoding the dependent variable

In [232]:
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)

### Fitting multiple linear regression to the training set

In [234]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

### Predicting the test  set values

In [235]:
y_pred = lin_reg.predict(X_test)

In [236]:
y_pred

array([103015.20159796, 132582.27760815, 132447.73845175,  71976.09851258,
       178537.48221056, 116161.24230166,  67851.69209676,  98791.73374687,
       113969.43533013, 167921.06569551])

In [237]:
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

### Building the optimal model using Backward Elimination (BE)

In [253]:
uno = np.ones(shape=(50,1)).astype(int)
X = np.append(arr=uno,values=X,axis=1)
X.shape

(50, 7)

In [264]:
X_opt = X[:,[0,1,4]]
X_opt.shape

(50, 3)

In [265]:
lin_reg_OLS = sm.OLS(endog=y,exog=X_opt).fit()
lin_reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.959
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,1129.0
Date:,"Mon, 22 Oct 2018",Prob (F-statistic):,5.25e-35
Time:,19:30:29,Log-Likelihood:,-124.43
No. Observations:,50,AIC:,252.9
Df Residuals:,48,BIC:,256.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.7852,0.401,1.959,0.056,-0.021,1.591
x1,0.7852,0.401,1.959,0.056,-0.021,1.591
x2,0.0003,9.26e-06,33.599,0.000,0.000,0.000

0,1,2,3
Omnibus:,3.75,Durbin-Watson:,1.294
Prob(Omnibus):,0.153,Jarque-Bera (JB):,3.598
Skew:,0.633,Prob(JB):,0.165
Kurtosis:,2.648,Cond. No.,1.06e+17
