# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

## Importing the dataset

In [2]:
# Load the raw data. Every column except the last is treated as a feature,
# and the last column is the target to predict.
dataset = pd.read_csv('50_Startups.csv')

# .to_numpy() is the accessor pandas recommends over .values; it returns the
# same NumPy arrays here.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

# print(X)

## Encoding categorical data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 3 and pass every other column
# through unchanged.
# sparse_threshold=0 forces a dense result, so the np.array(...) wrapper below
# never receives a SciPy sparse matrix (which it would wrap as a useless 0-d
# object array instead of a 2-D feature matrix).
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)

# fit_transform learns the categories and encodes in a single step. The dummy
# columns produced by the encoder are placed at the beginning of the result,
# followed by the passthrough columns.
# Note: this overwrites X, so re-running this cell on its own would try to
# encode already-encoded data; re-run from the dataset-loading cell instead.
X = np.array(ct.fit_transform(X))

# print(X)


## Splitting the dataset into the Training set and Test set

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [5]:
from sklearn.linear_model import LinearRegression

# LinearRegression fits an ordinary-least-squares model on every column it is given.
# It does not drop a dummy column and does not perform backward elimination or any
# other feature selection; do those explicitly before fitting if they are needed.
# NOTE(review): the OneHotEncoder above is created without a `drop` option, so all
# dummy columns are kept and are collinear with the intercept. The fit still runs,
# but the individual dummy coefficients should not be interpreted on their own -
# confirm whether dropping one category (e.g. drop='first') is wanted.

regressor = LinearRegression()          # Create the (untrained) multiple linear regression model
regressor.fit(X_train, y_train)         # Train the model on the training data; as the last expression, its repr is the cell output

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [6]:
# Note on inspecting the test-set results:
# a multiple linear regression model has several features, so the fit cannot be
# drawn as a single 2-D line. Instead, the predicted and the actual test-set
# values are printed side by side as two columns:
#   left column  = predicted value, right column = actual value.

y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)

# column_stack turns each 1-D vector into one column of a two-column array.
print(np.column_stack((y_pred, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


## Multiple Linear Regression Equation obtained by Model

In [25]:
# Build a human-readable form of the fitted model:
#   y = intercept + coef_0 * x0 + coef_1 * x1 + ...
# where x<i> refers to column i of the feature matrix the model was trained on.
intercept = regressor.intercept_
coefficients = regressor.coef_

# One "<coefficient> * x<index>" term per feature, joined with " + ".
equation = "y = " + str(intercept) + " + " + " + ".join(
    str(weight) + " * x" + str(position)
    for position, weight in enumerate(coefficients)
)

In [26]:
print(equation)

y = 42467.52924853204 + 86.63836917588786 * x0 + -872.64579087744 * x1 + 786.0074217043534 * x2 + 0.7734671927327668 * x3 + 0.0328845975362384 * x4 + 0.03661002586397899 * x5
