## Multiple Linear Regression

### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [2]:
df = pd.read_csv('50_Startups.csv')
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [3]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

### Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

### Avoiding the Dummy Variable Trap

In [5]:
X = X[:, 1:]

### Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Building the optimal model using Backward Elimination

In [7]:
import statsmodels.api as sm

X_train = np.concat([np.ones((X_train.shape[0], 1)), X_train], 1).astype(np.float64)
X_train  # alpha = 0.05 (significance)

array([[1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 5.5493950e+04,
        1.0305749e+05, 2.1463481e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 4.6014020e+04,
        8.5047440e+04, 2.0551764e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 7.5328870e+04,
        1.4413598e+05, 1.3405007e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.6426070e+04,
        1.5769392e+05, 2.1079767e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 9.1749160e+04,
        1.1417579e+05, 2.9491957e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.1994324e+05,
        1.5654742e+05, 2.5651292e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.0002300e+03,
        1.2415304e+05, 1.9039300e+03],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 5.4205000e+02,
        5.1743150e+04, 0.0000000e+00],
       [1.0000000e+00, 0.0000000e+00,

In [8]:
X_opt = X_train[:, [0, 1, 2, 3, 4, 5]]  # p_max = 2
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,129.7
Date:,"Fri, 27 Jun 2025",Prob (F-statistic):,3.91e-21
Time:,00:01:19,Log-Likelihood:,-421.1
No. Observations:,40,AIC:,854.2
Df Residuals:,34,BIC:,864.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.255e+04,8358.538,5.091,0.000,2.56e+04,5.95e+04
x1,-959.2842,4038.108,-0.238,0.814,-9165.706,7247.138
x2,699.3691,3661.563,0.191,0.850,-6741.822,8140.560
x3,0.7735,0.055,14.025,0.000,0.661,0.886
x4,0.0329,0.066,0.495,0.624,-0.102,0.168
x5,0.0366,0.019,1.884,0.068,-0.003,0.076

0,1,2,3
Omnibus:,15.823,Durbin-Watson:,2.468
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.231
Skew:,-1.094,Prob(JB):,9.03e-06
Kurtosis:,6.025,Cond. No.,1490000.0


In [9]:
X_opt = X_train[:, [0, 1, 3, 4, 5]]  # p_max = 1
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,166.7
Date:,"Fri, 27 Jun 2025",Prob (F-statistic):,2.87e-22
Time:,00:01:19,Log-Likelihood:,-421.12
No. Observations:,40,AIC:,852.2
Df Residuals:,35,BIC:,860.7
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.292e+04,8020.397,5.352,0.000,2.66e+04,5.92e+04
x1,-1272.1608,3639.780,-0.350,0.729,-8661.308,6116.986
x2,0.7754,0.053,14.498,0.000,0.667,0.884
x3,0.0319,0.065,0.488,0.629,-0.101,0.165
x4,0.0363,0.019,1.902,0.065,-0.002,0.075

0,1,2,3
Omnibus:,16.074,Durbin-Watson:,2.467
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.553
Skew:,-1.086,Prob(JB):,4.66e-06
Kurtosis:,6.164,Cond. No.,1430000.0


In [10]:
X_opt = X_train[:, [0, 3, 4, 5]]  # p_max = 2
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,227.8
Date:,"Fri, 27 Jun 2025",Prob (F-statistic):,1.8499999999999998e-23
Time:,00:01:19,Log-Likelihood:,-421.19
No. Observations:,40,AIC:,850.4
Df Residuals:,36,BIC:,857.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.299e+04,7919.773,5.428,0.000,2.69e+04,5.91e+04
x1,0.7788,0.052,15.003,0.000,0.674,0.884
x2,0.0294,0.064,0.458,0.650,-0.101,0.160
x3,0.0347,0.018,1.896,0.066,-0.002,0.072

0,1,2,3
Omnibus:,15.557,Durbin-Watson:,2.481
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.539
Skew:,-1.081,Prob(JB):,1.28e-05
Kurtosis:,5.974,Cond. No.,1430000.0


In [11]:
X_opt = X_train[:, [0, 3, 5]]  # p_max = 2
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.947
Method:,Least Squares,F-statistic:,349.0
Date:,"Fri, 27 Jun 2025",Prob (F-statistic):,9.65e-25
Time:,00:01:19,Log-Likelihood:,-421.3
No. Observations:,40,AIC:,848.6
Df Residuals:,37,BIC:,853.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.635e+04,2971.236,15.598,0.000,4.03e+04,5.24e+04
x1,0.7886,0.047,16.846,0.000,0.694,0.883
x2,0.0326,0.018,1.860,0.071,-0.003,0.068

0,1,2,3
Omnibus:,14.666,Durbin-Watson:,2.518
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.582
Skew:,-1.03,Prob(JB):,3.39e-05
Kurtosis:,5.847,Cond. No.,497000.0


In [12]:
X_train = X_train[:, [3]]
X_test = X_test[:, [3]]
X_train

array([[ 55493.95],
       [ 46014.02],
       [ 75328.87],
       [ 46426.07],
       [ 91749.16],
       [130298.13],
       [119943.24],
       [  1000.23],
       [   542.05],
       [ 65605.48],
       [114523.61],
       [ 61994.48],
       [ 63408.86],
       [ 78013.11],
       [ 23640.93],
       [ 76253.86],
       [ 15505.73],
       [120542.52],
       [ 91992.39],
       [ 64664.71],
       [131876.9 ],
       [ 94657.16],
       [ 28754.33],
       [     0.  ],
       [162597.7 ],
       [ 93863.75],
       [ 44069.95],
       [ 77044.01],
       [134615.46],
       [ 67532.53],
       [ 28663.76],
       [ 78389.47],
       [ 86419.7 ],
       [123334.88],
       [ 38558.51],
       [  1315.46],
       [144372.41],
       [165349.2 ],
       [     0.  ],
       [ 22177.74]])

## Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training the Multiple Linear Regression model on the Training set

In [14]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Predicting the Test set results

In [15]:
y_pred = regressor.predict(X_test)

pd.DataFrame({
    "y_pred": y_pred,
    "y_test": y_test,
    "delta": y_pred - y_test
})

Unnamed: 0,y_pred,y_test,delta
0,203961.420854,103282.38,100679.040854
1,126587.273952,144259.4,-17672.126048
2,142600.76327,146121.95,-3521.18673
3,120557.920789,77798.83,42759.090789
4,134554.154148,191050.39,-56496.235852
5,157308.663738,105008.31,52300.353738
6,104579.058454,81229.06,23349.998454
7,178460.734319,97483.56,80977.174319
8,152980.886993,110352.25,42628.636993
9,126247.612715,166187.94,-39940.327285
