# Multiple Linear Regression

## Importing the libraries

In [35]:
import pandas as pd

## Importing the dataset

In [36]:
# reads the CSV file into a pandas DataFrame; the relative path is resolved
# against the notebook's current working directory
df = pd.read_csv('50_Startups.csv')

In [37]:
# by default, shows first five entries in dataframe
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [38]:
# shows a tuple of (number of rows, number of columns)
df.shape

(50, 5)

In [39]:
# shows statistical summary of dataframe; shows only numerical columns by default
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [40]:
# shows information about dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [41]:
# look for any missing data
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

## Splitting the dataframe into the independent feature matrix and the dependent variable vector

In [42]:
# feature matrix: every column except the last one, as a NumPy array
X = df.iloc[:, :-1].values
# dependent variable vector: the last column (Profit), as a NumPy array
y = df.iloc[:, -1].values

## Encoding categorical data

In [49]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

def encode_categorical_features(X, categorical_columns=(3,)):
    """One-hot encode the categorical columns of a feature matrix.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix that contains the categorical column(s).
    categorical_columns : sequence of int, default=(3,)
        Positional indices of the columns to one-hot encode. The default
        keeps the previous behaviour of encoding column 3 only.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array with the one-hot columns first, followed by the
        untouched (passthrough) columns.

    Notes
    -----
    The transformer is created and fitted on ``X`` on every call, so the
    fitted encoder is not available afterwards for transforming new data.
    """
    ct = ColumnTransformer(
        transformers=[
            ("encoder", OneHotEncoder(), list(categorical_columns))
        ],
        remainder="passthrough",
        # Always return a dense result. With the default threshold (0.3) a
        # low-density output is returned as a SciPy sparse matrix, which
        # np.array() would wrap in a useless 0-d object array.
        sparse_threshold=0,
    )
    return np.array(ct.fit_transform(X))


In [50]:
# NOTE: this cell rebinds X to its own encoded version, so it is not idempotent.
# After one run the one-hot columns are placed first and column 3 no longer
# holds State; re-run the cell that builds X from df before re-running this one.
X = encode_categorical_features(X)
# display the encoded feature matrix
X

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

In [76]:
# inspect the dependent variable vector (Profit values)
y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

## Splitting the dataset into the Training set and Test set

In [54]:
from sklearn.model_selection import train_test_split
def split_dataset(X, y, test_size=1/5, random_state=0):
    """Split features and targets into training and test subsets.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values aligned with the rows of ``X``.
    test_size : float, default=1/5
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default=0
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        random_state=random_state,
        test_size=test_size,
    )
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test

In [55]:
# relies on split_dataset's defaults: test_size=1/5 and random_state=0
X_train, X_test, y_train, y_test = split_dataset(X, y)

In [56]:
# sanity check: the train and test parts should add up to the full dataset
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (40, 6)
X_test shape: (10, 6)
y_train shape: (40,)
y_test shape: (10,)


## Training the Multiple Linear Regression model on the Training set

In [61]:
from sklearn.linear_model import LinearRegression
def train_model(X_train, y_train):
    """Fit a fresh ``LinearRegression`` on the given training data.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training target values.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    # fit() returns the estimator itself, so the fitted model can be
    # returned straight from the call.
    return LinearRegression().fit(X_train, y_train)

In [62]:
# fit the regressor on the training split only
regressor_model = train_model(X_train, y_train)


## Predicting the Test set results

In [63]:
# predict the dependent variable for the held-out test rows
y_pred = regressor_model.predict(X_test)

In [74]:
def print_predictions(y_pred, y_test):
    """Print each predicted value next to the corresponding actual value.

    Parameters
    ----------
    y_pred : sequence of float
        Predicted values; each is printed with two decimals.
    y_test : sequence
        Actual values, printed as-is, in the same order as ``y_pred``.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y_test`` have different lengths.
    """
    # Fail up front on a length mismatch instead of silently dropping
    # trailing actual values or raising IndexError halfway through the output.
    if len(y_pred) != len(y_test):
        raise ValueError(
            "y_pred and y_test must have the same length, "
            f"got {len(y_pred)} and {len(y_test)}"
        )
    # zip pairs values by position, so this also works when y_test is a
    # pandas Series whose index is not 0..n-1.
    for predicted, actual in zip(y_pred, y_test):
        print(f"Predicted: {predicted:.2f}, Actual: {actual}")


In [75]:
# compare each test-set prediction with the actual value
print_predictions(y_pred, y_test)

Predicted: 103015.20, Actual: 103282.38
Predicted: 132582.28, Actual: 144259.4
Predicted: 132447.74, Actual: 146121.95
Predicted: 71976.10, Actual: 77798.83
Predicted: 178537.48, Actual: 191050.39
Predicted: 116161.24, Actual: 105008.31
Predicted: 67851.69, Actual: 81229.06
Predicted: 98791.73, Actual: 97483.56
Predicted: 113969.44, Actual: 110352.25
Predicted: 167921.07, Actual: 166187.94
