## Multiple Linear Regression

### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [2]:
df = pd.read_csv('50_Startups.csv')
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [3]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

### Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

### Avoiding the Dummy Variable Trap

In [5]:
X = X[:, 1:]

### Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Building the optimal model using Forward Selection

In [7]:
import statsmodels.api as sm

X_train = np.concat([np.ones((X_train.shape[0], 1)), X_train], 1).astype(np.float64)
X_train

array([[1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 5.5493950e+04,
        1.0305749e+05, 2.1463481e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 4.6014020e+04,
        8.5047440e+04, 2.0551764e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 7.5328870e+04,
        1.4413598e+05, 1.3405007e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.6426070e+04,
        1.5769392e+05, 2.1079767e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 9.1749160e+04,
        1.1417579e+05, 2.9491957e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.1994324e+05,
        1.5654742e+05, 2.5651292e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.0002300e+03,
        1.2415304e+05, 1.9039300e+03],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 5.4205000e+02,
        5.1743150e+04, 0.0000000e+00],
       [1.0000000e+00, 0.0000000e+00,

In [8]:
X_opt = X_train[:, [0, 1]]  # .421
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

X_opt = X_train[:, [0, 2]]  # 0.829
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

X_opt = X_train[:, [0, 3]]  # 0
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

X_opt = X_train[:, [0, 4]] # 0.162
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

X_opt = X_train[:, [0, 5]] # 0
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

[3, 5]  # best

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.026
Method:                 Least Squares   F-statistic:                  0.009578
Date:                Fri, 27 Jun 2025   Prob (F-statistic):              0.923
Time:                        00:14:37   Log-Likelihood:                -481.08
No. Observations:                  40   AIC:                             966.2
Df Residuals:                      38   BIC:                             969.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.091e+05   7578.960     14.392      0.0

[3, 5]

In [9]:
X_opt = X_train[:, [0, 3, 1]]  # .691
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

X_opt = X_train[:, [0, 3, 2]]  # 0.845
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

X_opt = X_train[:, [0, 3, 4]] # 0.289
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

X_opt = X_train[:, [0, 3, 5]] # 0.06
regressor_OLS = sm.OLS(y_train, X_opt).fit()
print(regressor_OLS.summary())

[3, 5]  # not optimal
[3]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.945
Model:                            OLS   Adj. R-squared:                  0.942
Method:                 Least Squares   F-statistic:                     317.7
Date:                Fri, 27 Jun 2025   Prob (F-statistic):           5.01e-24
Time:                        00:14:38   Log-Likelihood:                -423.09
No. Observations:                  40   AIC:                             852.2
Df Residuals:                      37   BIC:                             857.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.832e+04   3009.528     16.056      0.0

[3]

In [10]:
X_train = X_train[:, [3]]
X_test = X_test[:, [3]]

## Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training the Multiple Linear Regression model on the Training set

In [12]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Predicting the Test set results

In [13]:
y_pred = regressor.predict(X_test)

pd.DataFrame({
    "y_pred": y_pred,
    "y_test": y_test,
    "delta": y_pred - y_test
})

Unnamed: 0,y_pred,y_test,delta
0,203961.420854,103282.38,100679.040854
1,126587.273952,144259.4,-17672.126048
2,142600.76327,146121.95,-3521.18673
3,120557.920789,77798.83,42759.090789
4,134554.154148,191050.39,-56496.235852
5,157308.663738,105008.31,52300.353738
6,104579.058454,81229.06,23349.998454
7,178460.734319,97483.56,80977.174319
8,152980.886993,110352.25,42628.636993
9,126247.612715,166187.94,-39940.327285
