<a href="https://colab.research.google.com/github/85jahnavi/MachineLearning_Assignments/blob/main/Multiple_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [39]:
#Loading the dataset
data = pd.read_csv("/content/50_Startups.csv")
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [40]:
data.isnull().sum()

Unnamed: 0,0
R&D Spend,0
Administration,0
Marketing Spend,0
State,0
Profit,0


In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [42]:
# Converting state column to numeric datatype
data = pd.get_dummies(data, columns=['State'], drop_first=True)
print(data.head())

   R&D Spend  Administration  Marketing Spend     Profit  State_Florida  \
0  165349.20       136897.80        471784.10  192261.83          False   
1  162597.70       151377.59        443898.53  191792.06          False   
2  153441.51       101145.55        407934.54  191050.39           True   
3  144372.41       118671.85        383199.62  182901.99          False   
4  142107.34        91391.77        366168.42  166187.94           True   

   State_New York  
0            True  
1           False  
2           False  
3            True  
4           False  


In [44]:
data.duplicated().sum()

0

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   Profit           50 non-null     float64
 4   State_Florida    50 non-null     bool   
 5   State_New York   50 non-null     bool   
dtypes: bool(2), float64(4)
memory usage: 1.8 KB


In [46]:
#Create feature and target array from such given data
X = data.drop('Profit', axis=1)
y = data['Profit']

In [47]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## No Regularization

In [48]:
#Fitting a regression model
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [49]:
#Evaluating the model on training data only
y_pred1 = reg.predict(X_train)
print(f'Performance of the model on training data :\n')
print(f'MAE = {mean_absolute_error(y_train, y_pred1)}')
print(f'MSE = {mean_squared_error(y_train, y_pred1)}')
print(f'RMSE = {np.sqrt(mean_squared_error(y_train, y_pred1))}')
print(f'R_2 = {r2_score(y_train, y_pred1)}')

Performance of the model on training data :

MAE = 6662.656240898313
MSE = 79700060.08259317
RMSE = 8927.489013300054
R_2 = 0.9537019995248526


In [50]:
y_pred2 = reg.predict(X_test)
print(f'Performance of the model on test data :\n')
print(f'MAE = {mean_absolute_error(y_test, y_pred2)}')
print(f'MSE = {mean_squared_error(y_test, y_pred2)}')
print(f'RMSE = {np.sqrt(mean_squared_error(y_test, y_pred2))}')
print(f'R_2 = {r2_score(y_test, y_pred2)}')

Performance of the model on test data :

MAE = 6961.477813252376
MSE = 82010363.04430099
RMSE = 9055.957323458464
R_2 = 0.8987266414328637


# Lasso Regression(L1 regularization)

In [51]:
from sklearn.linear_model import Lasso


In [52]:
#Fitting the Model
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

In [53]:
#Evaluating the model on training data only
y_pred1 = lasso_model.predict(X_train)
print(f'Performance of the model on training data :\n')
print(f'MAE = {mean_absolute_error(y_train, y_pred1)}')
print(f'MSE = {mean_squared_error(y_train, y_pred1)}')
print(f'RMSE = {np.sqrt(mean_squared_error(y_train, y_pred1))}')
print(f'R_2 = {r2_score(y_train, y_pred1)}')

Performance of the model on training data :

MAE = 6662.622932690499
MSE = 79700060.27993599
RMSE = 8927.489024352592
R_2 = 0.9537019994102155


In [54]:
#Evaluating the model on testing data only
y_pred2 = lasso_model.predict(X_test)
print(f'Performance of the model on test data :\n')
print(f'MAE = {mean_absolute_error(y_test, y_pred2)}')
print(f'MSE = {mean_squared_error(y_test, y_pred2)}')
print(f'RMSE = {np.sqrt(mean_squared_error(y_test, y_pred2))}')
print(f'R_2 ={r2_score(y_test, y_pred2)}')

Performance of the model on test data :

MAE = 6961.487870576682
MSE = 82009745.3745589
RMSE = 9055.92322044301
R_2 =0.8987274041838984


# Rigid Regression(L2 Regularization)

In [55]:
from sklearn.linear_model import Ridge


In [56]:
#Rigid model fitting
ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)

In [57]:
y_pred1 = ridge_model.predict(X_train)
print(f'Performance of the model on training data :\n')
print(f'MAE = {mean_absolute_error(y_train, y_pred1)}')
print(f'MSE = {mean_squared_error(y_train, y_pred1)}')
print(f'RMSE = {np.sqrt(mean_squared_error(y_train, y_pred1))}')
print(f'R_2 = {r2_score(y_train, y_pred1)}')

Performance of the model on training data :

MAE = 6661.840583104003
MSE = 79700094.5458275
RMSE = 8927.490943474964
R_2 = 0.9537019795050579


In [58]:
y_pred2 = ridge_model.predict(X_test)
print(f'Performance of the model on test data :\n')
print(f'MAE = {mean_absolute_error(y_test, y_pred2)}')
print(f'MSE = {mean_squared_error(y_test, y_pred2)}')
print(f'RMSE = {np.sqrt(mean_squared_error(y_test, y_pred2))}')
print(f'R_2 = {r2_score(y_test, y_pred2)}')

Performance of the model on test data :

MAE = 6961.6909186619505
MSE = 81996139.21167997
RMSE = 9055.171959255107
R_2 = 0.8987442062289183


In [63]:
#Creating results of train and test data in a table
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2

In [64]:
train_results = {
    "No Regularization (Train)": evaluate_model(y_train, y_pred1),
    "L1-Regularization (Train)": evaluate_model(y_train, y_pred1),
    "L2-Regularization (Train)": evaluate_model(y_train, y_pred1)
}

In [65]:
test_results = {
    "No Regularization (Test)": evaluate_model(y_test, y_pred2),
    "L1-Regularization (Test)": evaluate_model(y_test, y_pred2),
    "L2-Regularization (Test)": evaluate_model(y_test, y_pred2)
}

In [66]:
train_results_df = pd.DataFrame(train_results, index=["MAE", "MSE", "RMSE", "R2 Score"])
test_results_df = pd.DataFrame(test_results, index=["MAE", "MSE", "RMSE", "R2 Score"])

In [67]:
print("Training Results:")
train_results_df

Training Results:


Unnamed: 0,No Regularization (Train),L1-Regularization (Train),L2-Regularization (Train)
MAE,6661.841,6661.841,6661.841
MSE,79700090.0,79700090.0,79700090.0
RMSE,8927.491,8927.491,8927.491
R2 Score,0.953702,0.953702,0.953702


In [68]:
print("\nTesting Results:")
test_results_df


Testing Results:


Unnamed: 0,No Regularization (Test),L1-Regularization (Test),L2-Regularization (Test)
MAE,6961.691,6961.691,6961.691
MSE,81996140.0,81996140.0,81996140.0
RMSE,9055.172,9055.172,9055.172
R2 Score,0.8987442,0.8987442,0.8987442


# Observation

1.R_2 score for **No Regularization** for train data is very high
indicating model fits training data well,The R² score for the testing dataset is often lower than that of the training dataset,hence it performs overfitting.

2.In **L1 regularization,L2 regularization** there is a slight difference in r2_score value which indicates model performance.