# Linear regression with statsmodels: simple and multiple

In [31]:
#import libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

#model libraries
import statsmodels.api as sm
from statsmodels.tools import add_constant

In [32]:
# List the names of the example datasets that seaborn can load
available_datasets = sns.get_dataset_names()
print(available_datasets, end=" ")

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic'] 

In [54]:
# Load seaborn's "mpg" dataset and keep only its numeric columns
df = sns.load_dataset("mpg").select_dtypes("number")

# Preview the first 4 rows
df.head(4)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70


In [50]:
# Count the missing values in each column
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
dtype: int64

In [53]:
# Drop every row that contains at least one missing value.
# Reassign instead of using `inplace=True`: in-place mutation gives no
# performance benefit and makes the notebook's state harder to follow when
# cells are re-run or executed out of order.
df = df.dropna()

# Confirm that no missing values remain in any column
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
dtype: int64

In [35]:
# Count rows that exactly repeat an earlier row
duplicate_rows = df.duplicated()
duplicate_rows.sum()

0

## Simple Linear Regression

We model the target variable (`mpg`) against a single feature (`horsepower`).

In [37]:
# Separate the single predictor (horsepower) from the target (mpg)
X, y = df["horsepower"], df["mpg"]

In [38]:
# Add an intercept column ahead of the predictor so OLS estimates a constant term
x_con = add_constant(X, prepend=True)

In [39]:
# Specify the OLS regression of mpg on (const, horsepower), then fit it
simple_ols = sm.OLS(endog=y, exog=x_con)
model1 = simple_ols.fit()

In [40]:
# Display the fitted results table for the single-feature model:
# coefficient estimates, standard errors, and overall fit statistics.
model1.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.605
Method:,Least Squares,F-statistic:,599.7
Date:,"Mon, 13 Oct 2025",Prob (F-statistic):,7.03e-81
Time:,21:14:34,Log-Likelihood:,-1178.7
No. Observations:,392,AIC:,2361.0
Df Residuals:,390,BIC:,2369.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,39.9359,0.717,55.660,0.000,38.525,41.347
horsepower,-0.1578,0.006,-24.489,0.000,-0.171,-0.145

0,1,2,3
Omnibus:,16.432,Durbin-Watson:,0.92
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.305
Skew:,0.492,Prob(JB):,0.000175
Kurtosis:,3.299,Cond. No.,322.0


## Multiple Linear Regression

In [46]:
# Separate the predictors (every column except mpg) from the target
y = df["mpg"]  # label
X = df.drop(columns="mpg")  # features

# Add an intercept column so OLS estimates a constant term
X_con1 = add_constant(X)

In [48]:
# Specify and fit the multiple-regression OLS model in one step, so that
# `model2` only ever refers to the fitted results.
model2 = sm.OLS(y, X_con1).fit()

In [49]:
# Display the fitted results table for the multiple-regression model:
# coefficient estimates, standard errors, and overall fit statistics.
# NOTE(review): the recorded output reports a large condition number, which
# may indicate strong multicollinearity among the predictors — worth checking.
model2.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.809
Model:,OLS,Adj. R-squared:,0.806
Method:,Least Squares,F-statistic:,272.2
Date:,"Mon, 13 Oct 2025",Prob (F-statistic):,3.79e-135
Time:,21:30:21,Log-Likelihood:,-1036.5
No. Observations:,392,AIC:,2087.0
Df Residuals:,385,BIC:,2115.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-14.5353,4.764,-3.051,0.002,-23.902,-5.169
cylinders,-0.3299,0.332,-0.993,0.321,-0.983,0.323
displacement,0.0077,0.007,1.044,0.297,-0.007,0.022
horsepower,-0.0004,0.014,-0.028,0.977,-0.028,0.027
weight,-0.0068,0.001,-10.141,0.000,-0.008,-0.005
acceleration,0.0853,0.102,0.836,0.404,-0.115,0.286
model_year,0.7534,0.053,14.318,0.000,0.650,0.857

0,1,2,3
Omnibus:,37.865,Durbin-Watson:,1.232
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60.248
Skew:,0.63,Prob(JB):,8.26e-14
Kurtosis:,4.449,Cond. No.,85300.0


In [55]:
# NOTE(review): bare attribute reference with no call — this only evaluates to
# the `add_constant` function object and does not modify any data. Looks like
# leftover scratch work; consider removing this cell.
sm.add_constant