In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

In [3]:
# Load the advertising data; the first CSV column holds the row labels, so it is used as the index.
df_adv = pd.read_csv('/content/Advertising.csv', index_col=0)

# Predictors are the TV, radio and newspaper columns; the response is sales.
X = df_adv[['TV', 'radio', 'newspaper']]
y = df_adv['sales']

# Preview the first rows to confirm the columns loaded as expected.
df_adv.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [6]:
# statsmodels' OLS does not add an intercept on its own, so prepend a column of
# ones named "const"; the coefficient fitted for that column is the intercept.
# has_constant='skip' is the default, spelled out here: if X already contains a
# constant column, it is left as is instead of being duplicated.
X = sm.add_constant(X, has_constant='skip')
X

Unnamed: 0,const,TV,radio,newspaper
1,1.0,230.1,37.8,69.2
2,1.0,44.5,39.3,45.1
3,1.0,17.2,45.9,69.3
4,1.0,151.5,41.3,58.5
5,1.0,180.8,10.8,58.4
...,...,...,...,...
196,1.0,38.2,3.7,13.8
197,1.0,94.2,4.9,8.1
198,1.0,177.0,9.3,6.4
199,1.0,283.6,42.0,66.2


In [4]:
# Fit an OLS model of sales on TV, radio and newspaper, with an intercept.
# X may already carry the "const" column; with has_constant='skip' (the default)
# the call below then returns X without adding a second constant.
X = sm.add_constant(X, has_constant='skip')
model = sm.OLS(endog=y, exog=X).fit()

In [5]:
# `const` is the fitted intercept; TV, radio and newspaper are the features.
# R-squared (0.897) and adjusted R-squared (0.896) are both high, so the model explains most of the variance in sales.
# (R-squared lies in [0, 1] for OLS with an intercept; adjusted R-squared is at most 1 but can drop below 0 for a poor fit.)
# The standard errors are small relative to the coefficients, which is consistent with no serious multicollinearity among TV, radio and newspaper.
# All p-values are below 0.05 except newspaper (p = 0.860).
# The newspaper coefficient (-0.0010) is not statistically distinguishable from zero (its 95% interval spans 0),
# so its negative sign should not be interpreted: newspaper shows no measurable effect on sales once TV and radio are in the model.
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Wed, 16 Aug 2023",Prob (F-statistic):,1.58e-96
Time:,10:05:54,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [8]:
# Newspaper has a high p-value in the fitted model, so it is a candidate for dropping.
# Pairwise correlations between the features (the leading "const" column is skipped):
# TV is almost uncorrelated with radio and newspaper (about 0.05), while radio and
# newspaper are moderately correlated (about 0.35) -- no strong collinearity among the features.
X.iloc[:, 1:].corr()

Unnamed: 0,TV,radio,newspaper
TV,1.0,0.054809,0.056648
radio,0.054809,1.0,0.354104
newspaper,0.056648,0.354104,1.0


In [9]:
# Second example: the salary data, read with the default integer row index.
df_salary = pd.read_csv('/content/Salary_Data.csv')
df_salary.head(n=5)

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891


In [11]:
# Response is Salary; predictors are Age and YearsExperience.
# NOTE: X and y are rebound here and no longer refer to the advertising data.
y = df_salary['Salary']
X = df_salary[['Age', 'YearsExperience']]

In [12]:
# Prepend the intercept column ("const") to the salary predictors; the model itself is fitted in a later cell.
# has_constant='skip' is the default, written out: an existing constant column is not duplicated.
X = sm.add_constant(X, has_constant='skip')
X

Unnamed: 0,const,Age,YearsExperience
0,1.0,21.0,1.1
1,1.0,21.5,1.3
2,1.0,21.7,1.5
3,1.0,22.0,2.0
4,1.0,22.2,2.2
5,1.0,23.0,2.9
6,1.0,23.0,3.0
7,1.0,23.3,3.2
8,1.0,23.3,3.2
9,1.0,23.6,3.7


In [13]:
# Fit an OLS model of Salary on Age and YearsExperience, with an intercept.
# Calling add_constant again is harmless: with has_constant='skip' (the default)
# no second constant column is added when X already has one.
# NOTE: `model` is rebound here and no longer refers to the advertising fit.
X = sm.add_constant(X, has_constant='skip')
model = sm.OLS(endog=y, exog=X).fit()

In [14]:
# The coefficients are large in magnitude because Salary itself is in the tens of thousands.
# R-squared (0.960) and adjusted R-squared (0.957) are high.
# The standard errors are large relative to the coefficients (e.g. Age: 1836 with std err 1285) and the
# confidence intervals are wide, which is a typical symptom of multicollinearity.
# Age is not individually significant (p = 0.165) even though the overall fit is strong, which suggests
# that Age and YearsExperience carry overlapping information.
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,323.9
Date:,"Wed, 16 Aug 2023",Prob (F-statistic):,1.35e-19
Time:,10:44:00,Log-Likelihood:,-300.35
No. Observations:,30,AIC:,606.7
Df Residuals:,27,BIC:,610.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6661.9872,2.28e+04,-0.292,0.773,-5.35e+04,4.02e+04
Age,1836.0136,1285.034,1.429,0.165,-800.659,4472.686
YearsExperience,6153.3533,2337.092,2.633,0.014,1358.037,1.09e+04

0,1,2,3
Omnibus:,2.695,Durbin-Watson:,1.711
Prob(Omnibus):,0.26,Jarque-Bera (JB):,1.975
Skew:,0.456,Prob(JB):,0.372
Kurtosis:,2.135,Cond. No.,626.0


In [15]:
# Follow up on the p-values by checking how strongly the two predictors are correlated
# (every column after the first, i.e. everything except "const").
# Age and YearsExperience have a correlation of about 0.987, so they are almost collinear.
X[X.columns[1:]].corr()

Unnamed: 0,Age,YearsExperience
Age,1.0,0.987258
YearsExperience,0.987258,1.0


In [None]:
# Remedies for multicollinearity
# 1. Do nothing (reasonable when the model is used only for prediction and the individual coefficients are not interpreted).
# 2. Check the p-values and drop the feature (column) that has a high p-value.