# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the startups dataset from a CSV file given by a relative path.
df = pd.read_csv("50_Startups.csv")
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [3]:
# Feature matrix: every column except the last one.
X = df.iloc[:, :-1].to_numpy()
# Target vector: the last column of the frame.
y = df.iloc[:, -1].to_numpy()

## Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at position 3 of the feature matrix and
# pass the remaining columns through unchanged. drop="first" omits the first
# category's dummy column inside the encoder (avoiding the dummy-variable trap),
# so the fitted `ct` emits exactly the columns stored in X and can be reused via
# ct.transform(...) on new raw rows. A manual X[:, 1:] slice after the fact would
# leave `ct` producing one more column than X holds.
ct = ColumnTransformer(
    [("encode", OneHotEncoder(drop="first"), [3])],
    remainder="passthrough",
)
# Encode from the raw frame rather than from X itself, so re-running this cell
# never tries to encode an already-encoded matrix.
X = ct.fit_transform(df.iloc[:, :-1].values)
# Show a few rows only; the dummy columns come first, then the passthrough columns.
X[:5]

array([[0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 1.0, 94657.16, 145077.58, 282574.31],
       [1.0, 0.0, 91749.16, 114175.79, 294919.57],
       [0.0, 1.0, 86419.7

## Splitting the dataset into the Training set and Test set

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that same
# transformation to both splits so no test-set statistics are used for fitting.
# NOTE: X_train is overwritten below, so re-running this cell on its own would
# refit `sc` on already-scaled data; re-run from the train/test split cell instead.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Training the Multiple Linear Regression model on the Training set

In [7]:
from sklearn.linear_model import LinearRegression

# Fit the linear model on the scaled training data. fit() returns the estimator
# itself, so the trailing expression still renders its repr as the cell output.
regressor = LinearRegression().fit(X_train, y_train)
regressor

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Predicting the Test set results

In [8]:
# Predict profit for the held-out rows.
y_pred = regressor.predict(X_test)

# Side-by-side comparison; a positive delta means the model over-predicted.
pd.DataFrame(
    dict(y_pred=y_pred, y_test=y_test, delta=y_pred - y_test)
)

Unnamed: 0,y_pred,y_test,delta
0,103015.201598,103282.38,-267.178402
1,132582.277608,144259.4,-11677.122392
2,132447.738452,146121.95,-13674.211548
3,71976.098513,77798.83,-5822.731487
4,178537.482211,191050.39,-12512.907789
5,116161.242302,105008.31,11152.932302
6,67851.692097,81229.06,-13377.367903
7,98791.733747,97483.56,1308.173747
8,113969.43533,110352.25,3617.18533
9,167921.065696,166187.94,1733.125696


## Exercise

Question 1: How do I use my multiple linear regression model to make a single prediction, for example, the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = California?

In [9]:
# Row layout matches X: two State dummies first, then R&D Spend, Administration,
# Marketing Spend. California is the dropped baseline category, so both dummies are 0.
new_startup = [[0, 0, 160000, 130000, 300000]]
# The model was trained on scaled features, so scale the new row with the same scaler.
regressor.predict(sc.transform(new_startup))

array([181566.92389385])

Question 2: How do I get the final regression equation $y = b_0 + b_1 x_1 + b_2 x_2 + \dots$ with the final values of the coefficients?

In [10]:
# The model was fit on standardized features z_j = (x_j - mean_j) / scale_j, i.e.
#   y = intercept + sum_j coef_j * (x_j - mean_j) / scale_j.
# Expanding that gives per-feature coefficients in the original units...
real_coef = regressor.coef_ / sc.scale_
# ...and an intercept shifted by the contribution of the feature means.
real_intercept = regressor.intercept_ - np.dot(sc.mean_, real_coef)

In [11]:
f"Profits = {real_coef[0]} * Dummy State 1 + {real_coef[1]} * Dummy State 2 + {real_coef[2]} * R&D Spend + {real_coef[3]} * Administrator + {real_coef[4]} * Marketing Spend + {real_intercept}"

'Profits = -959.2841600571103 * Dummy State 1 + 699.3690525192719 * Dummy State 2 + 0.7734671927327995 * R&D Spend + 0.03288459753625479 * Administrator + 0.036610025863856234 * Marketing Spend + 42554.16761773253'