# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the startup records from disk.
dataset = pd.read_csv('50_Startups.csv')

# Every column except the last is a feature; the last column is the target.
x = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Peek at the first three raw feature rows (the state is still a string here).
print(x[:3])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']]


## Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [5]:
# One-hot encode the categorical column at index 3; every other column is
# passed through unchanged and ends up to the right of the dummy columns.
state_encoder_step = ('encoder', OneHotEncoder(), [3])
ct = ColumnTransformer(transformers=[state_encoder_step], remainder='passthrough')

# NOTE: x is overwritten with its encoded form, so this cell is not idempotent.
# Re-run the dataset-loading cell before running this one again.
x = np.array(ct.fit_transform(x))

In [6]:
# Peek at the first three rows again: three dummy columns now lead each row.
print(x[:3])

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]]


## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [9]:
# LinearRegression fits ordinary least squares on every column it is given. It
# does NOT perform feature selection: no p-value-based elimination happens here
# (and such a procedure would drop the features with the HIGHEST p-values).
# NOTE(review): all three state dummies are kept, so they are collinear with the
# intercept (the "dummy variable trap"). The fit and the predictions still work,
# but the individual dummy coefficients are not uniquely determined; consider
# OneHotEncoder(drop='first') if those coefficients need to be interpreted.
from sklearn.linear_model import LinearRegression

In [10]:
# Create the estimator; at this point it has not seen any data.
regressor = LinearRegression()

# Learn the coefficients and the intercept from the training split.
regressor.fit(X=x_train, y=y_train)

LinearRegression()

## Predicting the Test set results

In [11]:
# Predicted target for every row of the test split.
y_pred = regressor.predict(x_test)

# Print NumPy arrays with two decimal places from here on.
np.set_printoptions(precision=2)

# Turn each 1-D vector into a single-column 2-D array (rows, columns) so the
# two can be placed next to each other.
y_pred_vert = y_pred.reshape(-1, 1)
y_test_vert = y_test.reshape(-1, 1)

# Join horizontally: predictions in the left column, actual values in the right.
comparation = np.hstack((y_pred_vert, y_test_vert))

print(comparation)

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


## Make a prediction for a single entry

For example, predict the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California'.

In [26]:
# Raw (un-encoded) description of the startup we want a profit estimate for.
rd_spending = 160000
administration_spending = 130000
marketing_spending = 300000
state = 'California'

# One row in the same column order as the raw feature matrix:
# R&D spend, administration spend, marketing spend, state.
raw_input = np.array(
    [[rd_spending, administration_spending, marketing_spending, state]],
    dtype=object,
)

# Let the fitted ColumnTransformer build the one-hot columns instead of
# hand-writing [1, 0, 0]: the encoding then always matches the one used for
# training, and an unknown state raises instead of silently mispredicting.
input_data = ct.transform(raw_input)

# Predict the profit for the encoded row.
predicted_result = regressor.predict(input_data)

print(predicted_result)


[181566.92]


In [None]:
## Get the final linear equation with the values for the coefficients

In [32]:
# Pull the fitted parameters out of the trained model.
regressor_coefficients = regressor.coef_
regressor_interceptor = regressor.intercept_

# Term labels, listed in the same column order as the encoded feature matrix.
term_labels = [
    'state_1',
    'state_2',
    'state_3',
    'R&D_spends',
    'administration_spends',
    'marketing_spends',
]

# One "<label> x <coefficient>" term per feature, coefficients at two decimals.
equation_terms = [
    f'{label} x {coefficient:.2f}'
    for label, coefficient in zip(term_labels, regressor_coefficients)
]

# The intercept is appended as the final term, printed at full precision.
print('Profit = ' + ' + '.join(equation_terms + [f'{regressor_interceptor}']))

Profit = state_1 x 86.64 + state_2 x -872.65 + state_3 x 786.01 + R&D_spends x 0.77 + administration_spends x 0.03 + marketing_spends x 0.04 + 42467.52924854249
