In [1]:
#Import all necessary packages
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

# Read the auto_mpg CSV file into a dataframe and show it as the cell output
df = pd.read_csv(filepath_or_buffer='/blue/zoo6927/m.rifat/auto_mpg.csv')
df

Unnamed: 0,mpg,cyl,disp,hp,weight,accel,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
388,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
389,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
390,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [2]:
# Full model: regress mpg on the six candidate predictors plus an intercept term.

x = df.loc[:, ['cyl', 'disp', 'hp', 'weight', 'accel', 'year']]  # predictor columns
y = df.loc[:, ['mpg']]                                           # response column
x_full = sm.add_constant(x)  # adds the 'const' (intercept) column to the predictors
est = sm.OLS(endog=y, exog=x_full)
est_full = est.fit()
print(est_full.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.809
Model:                            OLS   Adj. R-squared:                  0.806
Method:                 Least Squares   F-statistic:                     272.2
Date:                Fri, 19 Feb 2021   Prob (F-statistic):          3.79e-135
Time:                        22:55:03   Log-Likelihood:                -1036.5
No. Observations:                 392   AIC:                             2087.
Df Residuals:                     385   BIC:                             2115.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -14.5353      4.764     -3.051      0.0

In [3]:
#Based on the information above, the model becomes,
#mpg = -14.5353 - 0.3299*cyl + 0.0077*disp - 0.0004*hp - 0.0068*weight + 0.0853*accel + 0.7534*year
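#Here, we see that the p-values of the variables cyl, disp, hp and accel are higher than alpha = 0.05,
# so these predictors are not individually significant when the other variables are in the model.
# (The model as a whole is still significant, as its F-statistic shows.)
# So, we eliminate the variables whose p-values are higher than 0.05.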
# Reduced model: refit the regression with only weight and year as predictors.
x2 = df.loc[:, ['weight', 'year']]
x2_reduced = sm.add_constant(x2)  # adds the 'const' (intercept) column
est2 = sm.OLS(endog=y, exog=x2_reduced)
est2_reduced = est2.fit()
print(est2_reduced.summary())
#Here the improved model becomes: mpg = -14.3473 - 0.0066*weight + 0.7573*year.
# Weight = -0.0066 means that, holding year fixed, each additional lb of weight decreases the fuel efficiency by 0.0066 mpg.
#Year = 0.7573 means that, holding weight fixed, fuel efficiency increased significantly over time, by 0.7573 mpg per year.
# The R-squared value is 0.808.
#That means the model explains 80.8% of the variability of the dependent variable using the independent variables.
#The remaining variability is caused by factors that are not in the model.
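#Unfortunately, there is a potential problem with this model: the condition number is large, 7.17e+04.
# This means the model might have a multicollinearity problem, i.e. there might be a correlation between year and weight.
# (A large condition number can also arise simply because the predictors are on very different scales -- weight is in the
# thousands while year is in the tens -- so this should be checked, e.g. with variance inflation factors.)
#This also means that other factors might be correlated with these variables.
#If the multicollinearity is real, linear regression may NOT be a good fit for this dataset.
#It could instead be modeled by non-parametric, non-linear regression, which is extremely messy.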

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.808
Model:                            OLS   Adj. R-squared:                  0.807
Method:                 Least Squares   F-statistic:                     819.5
Date:                Fri, 19 Feb 2021   Prob (F-statistic):          3.33e-140
Time:                        22:55:35   Log-Likelihood:                -1037.6
No. Observations:                 392   AIC:                             2081.
Df Residuals:                     389   BIC:                             2093.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -14.3473      4.007     -3.581      0.0