<a href="https://colab.research.google.com/github/Aditya12D/LR-from-scratch/blob/main/multiplelinearregression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiple Linear Regression Hyperplane Visualisation

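In this notebook, we demonstrate multiple linear regression on two real-world datasets:

- **A dataset with 2 features** (age and experience predicting income), where the fitted hyperplane can be visualised in 3D
- **A dataset with more than 2 features** (student performance), as a more practical application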

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


In [None]:
class multipleLR:
  """Ordinary least-squares multiple linear regression.

  The model is y = intercept + X . coef. After ``train`` has been called,
  ``intercept`` holds the bias term and ``coef`` holds one weight per
  feature column; both are ``None`` before that.
  """
  def __init__(self):
    # Filled in by train(); None means "not fitted yet".
    self.coef=None
    self.intercept=None
  def train(self,X_train,y_train):
    """Fit the model and return ``(intercept, coef)``.

    X_train: 2-D array-like of shape (n_samples, n_features).
    y_train: array-like with n_samples entries.
    """
    X_train=np.asarray(X_train,dtype=float)
    y_train=np.asarray(y_train,dtype=float)
    # Prepend a column of ones so the first fitted parameter is the intercept.
    design=np.insert(X_train,0,1.0,axis=1)
    # Solve the least-squares problem directly instead of forming and
    # inverting X^T X: explicitly inverting breaks down (LinAlgError or
    # meaningless values) when features are collinear, whereas lstsq
    # returns the minimum-norm solution in that case.
    beta=np.linalg.lstsq(design,y_train,rcond=None)[0]
    self.intercept=beta[0]
    self.coef=beta[1:]
    return self.intercept,self.coef
  def predict(self,X_test):
    """Return predictions for X_test (n_samples, n_features).

    Raises RuntimeError if called before ``train``.
    """
    if self.coef is None or self.intercept is None:
      raise RuntimeError("multipleLR.predict() called before train()")
    X_test=np.asarray(X_test,dtype=float)
    y_pred=X_test.dot(self.coef)+self.intercept
    return y_pred

# Using real data with 2 features

In [None]:
# Load the 2-feature dataset and print a summary of its columns.
# NOTE(review): the absolute /content/ path presumably assumes the CSV was
# uploaded to a Google Colab session; adjust it when running elsewhere.
df1=pd.read_csv('/content/multiple_linear_regression_dataset.csv')
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   age         20 non-null     int64
 1   experience  20 non-null     int64
 2   income      20 non-null     int64
dtypes: int64(3)
memory usage: 612.0 bytes


In [None]:
# The target is the 'income' column; every other column is a predictor.
y = df1['income']
X = df1.drop(columns=['income'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

Using our own multiple linear regression implementation

In [None]:
# Fit the from-scratch multipleLR model on the training split,
# then predict the held-out test split.
model1=multipleLR()
model1.train(X_train,y_train)
y_pred=model1.predict(X_test)

Using scikit-learn's built-in `LinearRegression` class

In [None]:
# Fit scikit-learn's LinearRegression on the same training split and
# predict the same test split, as a reference for the custom model.
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pred1=lr.predict(X_test)

comparing both

In [None]:
# Print the two prediction arrays one above the other so they can be compared by eye.
print("locally made",y_pred)
print("inbuilt",y_pred1)

locally made [36843.22672444 48934.22956284 46639.48736579 31059.36312867]
inbuilt [36843.22672444 48934.22956284 46639.48736579 31059.36312867]


In [None]:
# Interactive 3-D scatter of the raw data: age on x, experience on y, income on z.
fig=px.scatter_3d(df1,x='age',y='experience',z='income')
fig.show()

Plotting the fitted regression plane purely for visualisation; the details of the Plotly code are yet to be learned.


In [None]:
# Pull the response and the two predictors out of the frame as NumPy arrays.
y_actual = df1['income'].values
x1, x2 = df1['age'].values, df1['experience'].values

# Build a 10x10 mesh covering the observed range of each predictor;
# the regression plane is evaluated on this mesh.
x1_range = np.linspace(x1.min(), x1.max(), 10)
x2_range = np.linspace(x2.min(), x2.max(), 10)
x1_grid, x2_grid = np.meshgrid(x1_range, x2_range)

# Plane parameters come from model1, i.e. our own multipleLR fit.
intercept = model1.intercept
coef1, coef2 = model1.coef[0], model1.coef[1]

# income = intercept + coef1 * age + coef2 * experience, evaluated on the mesh
y_pred_grid = intercept + coef1 * x1_grid + coef2 * x2_grid

# Observed data points
scatter = go.Scatter3d(
    x=x1,
    y=x2,
    z=y_actual,
    mode='markers',
    marker={'size': 5, 'color': 'blue'},
    name='Actual Data',
)

# Semi-transparent fitted plane
surface = go.Surface(
    x=x1_grid,
    y=x2_grid,
    z=y_pred_grid,
    colorscale='Viridis',
    opacity=0.6,
    name='Regression Hyperplane',
)

# Overlay both traces in one figure and label the axes.
fig = go.Figure(data=[scatter, surface])
fig.update_layout(
    title='Multiple Linear Regression Hyperplane (Plotly)',
    scene={
        'xaxis_title': 'Age',
        'yaxis_title': 'Experience',
        'zaxis_title': 'Income',
    },
)

fig.show()


# Using data with more than 2 features


In [None]:
# Load the student-performance dataset.
# NOTE(review): the absolute /content/ path presumably assumes the CSV was
# uploaded to a Google Colab session; adjust it when running elsewhere.
df2=pd.read_csv('/content/Student_Performance.csv')

In [None]:
# One-hot encode the 'Extracurricular Activities' column, dropping its first
# category; all remaining columns are passed through unchanged.
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(drop='first'),['Extracurricular Activities'])],remainder='passthrough')

In [None]:
# Separate the predictors from the target column 'Performance Index'.
X2=df2.drop(columns=['Performance Index'])
y2=df2['Performance Index']

In [None]:
# Fit the column transformer on the predictors and overwrite X2 with the
# transformed result.
# NOTE(review): because X2 is overwritten, this cell presumably cannot be
# re-run on its own without first re-creating X2 from df2 - confirm.
X2=ct.fit_transform(X2)

In [None]:
# Names of the columns produced by the fitted transformer.
feature_names=ct.get_feature_names_out()

In [None]:
# Wrap the transformed features in a DataFrame labelled with the transformer's column names.
X2=pd.DataFrame(X2,columns=feature_names)

In [None]:
# Display the encoded feature table as the cell output.
X2

Unnamed: 0,encoder__Extracurricular Activities_Yes,remainder__Hours Studied,remainder__Previous Scores,remainder__Sleep Hours,remainder__Sample Question Papers Practiced
0,1.0,7.0,99.0,9.0,1.0
1,0.0,4.0,82.0,4.0,2.0
2,1.0,8.0,51.0,7.0,2.0
3,1.0,5.0,52.0,5.0,2.0
4,0.0,7.0,75.0,8.0,5.0
...,...,...,...,...,...
9995,1.0,1.0,49.0,4.0,2.0
9996,1.0,7.0,64.0,8.0,5.0
9997,1.0,6.0,83.0,8.0,5.0
9998,1.0,9.0,97.0,7.0,0.0


In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X2_train,X2_test,y2_train,y2_test=train_test_split(X2,y2,test_size=0.2,random_state=2)

Using our own implementation

In [None]:
# Fit the from-scratch multipleLR model on the student-performance training
# split, then predict the held-out test split with that same fitted model.
# (The prediction must come from model2: no variable called `model` is
# defined in this notebook, so calling it fails on a fresh kernel.)
model2=multipleLR()
model2.train(X2_train,y2_train)
y_pred3=model2.predict(X2_test)

Using scikit-learn's built-in implementation

In [None]:
# Fit scikit-learn's LinearRegression on the same training split and
# predict the same test split, as a reference for the custom model.
lr2=LinearRegression()
lr2.fit(X2_train,y2_train)
y_pred4=lr2.predict(X2_test)

comparing

In [None]:
# Print the two prediction arrays one above the other so they can be compared by eye.
print("locally made",y_pred3)
print("inbuilt",y_pred4)

locally made [28.09141683 35.39557867 80.43795479 ... 45.01315301 30.31243483
 68.64738321]
inbuilt [28.09141683 35.39557867 80.43795479 ... 45.01315301 30.31243483
 68.64738321]


In [None]:
# Print the fitted intercept and coefficients of both models for comparison.
print("locally made intercept and coeff",model2.intercept,model2.coef)
print("inbuilt intercept and coeff",lr2.intercept_,lr2.coef_)

locally made intercept and coeff -34.177527651217595 [0.59483017 2.85352109 1.01959723 0.48314352 0.1977199 ]
inbuilt intercept and coeff -34.17752765121665 [0.59483017 2.85352109 1.01959723 0.48314352 0.1977199 ]


Evaluation metrics


In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
# R^2 of the from-scratch model on the student-performance test split.
r2_score(y2_test,y_pred3)

0.9890954655668952

In [None]:
# R^2 of the from-scratch model on the income test split.
r2_score(y_test,y_pred)

0.9579002513351086

In [None]:
# Mean absolute error of the from-scratch model on the student-performance test split.
mean_absolute_error(y2_test,y_pred3)

1.5926720612039327

In [None]:
# Root mean squared error: the square root of the MSE on the student-performance test split.
np.sqrt(mean_squared_error(y2_test,y_pred3))

np.float64(2.017493145379337)