In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy

In [None]:
from scipy import stats
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import influence_plot


In [None]:
# Load the raw data set from the CSV file next to the notebook
corolla = pd.read_csv(filepath_or_buffer='toyoto_corrola.csv')

In [None]:
# Peek at the first five rows of the loaded frame
corolla.head(n=5)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) for the loaded data
corolla.info()

In [None]:
# Remove the 'Model' and 'Id' columns; neither is used in the regression formula below
corolla = corolla.drop(columns=['Model', 'Id'])

In [None]:
# Confirm the two columns are gone
corolla.head(n=5)

In [None]:
# Count missing values per column
corolla.isnull().sum()

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# numeric_only=True keeps this working on pandas >= 2.0, where corr() raises
# if the frame still contains non-numeric (e.g. string) columns.
# NOTE(review): requires pandas >= 1.5 for the numeric_only keyword.
corolla.corr(numeric_only=True)

In [None]:
# Print the frame summary again, now that 'Model' and 'Id' have been dropped
corolla.info()

In [None]:
# Scatter-matrix overview of the frame on a dark grid background
sns.set_style('darkgrid')
sns.pairplot(data=corolla)

In [None]:
# Look at the last five rows as well
corolla.tail(n=5)

In [None]:
# Shorten 'Age_08_04' to 'Age' so it can be used directly in model formulas
corolla = corolla.rename(columns={'Age_08_04': 'Age'})


In [None]:
# Verify the renamed column
corolla.head(n=5)

In [None]:
# Show rows that are exact repeats of an earlier row
corolla.loc[corolla.duplicated(keep='first')]

In [None]:
# Remove repeated rows and renumber the index 0..n-1
corolla = corolla.drop_duplicates(ignore_index=True)

In [None]:
# Should now be empty: no duplicated rows remain
corolla.loc[corolla.duplicated(keep='first')]

In [None]:
# Fit the full multiple linear regression: Price explained by seven predictors

import statsmodels.formula.api as smf
model = smf.ols(
    formula='Price~Age+KM+HP+Doors+Cylinders+Gears+Weight',
    data=corolla,
).fit()

In [None]:
# Estimated coefficients of the fitted price model
model.params

In [None]:
# t-statistics, then p-values rounded to five decimals
print(model.tvalues, '\n', model.pvalues.round(5))

In [None]:
# R-squared and adjusted R-squared of the full model.
# The original note reported 86.15% here; that figure is explained variance
# (R-squared), not a classification accuracy.
model.rsquared, model.rsquared_adj

In [None]:
# Build SLR and MLR models for the insignificant variable 'Doors' (its p-value is greater than 0.05)
# Also find their t-values and p-values

# Regression models


In [None]:
# Simple linear regression of Price on Doors alone, to see whether Doors is
# significant when it is the only predictor
model_d = smf.ols(formula='Price~Doors', data=corolla).fit()
print(model_d.tvalues, model_d.pvalues)

# Model Validation Techniques
Two Techniques: 1. Collinearity Check & 2. Residual Analysis

In [None]:
# Collinearity check: variance inflation factor (VIF) for every predictor.
# VIF_j = 1 / (1 - R^2_j), where R^2_j is the R-squared obtained by regressing
# predictor j on all of the other predictors.
#
# The VIFs are stored keyed by predictor name so that each value is always
# reported next to the variable it belongs to (the previous version listed the
# labels and the values in different orders, mislabelling the table).
predictors = ['Age', 'KM', 'HP', 'Doors', 'Cylinders', 'Gears', 'Weight']

vif_by_variable = {}
for target in predictors:
    # Auxiliary regression: this predictor against the remaining ones
    others = '+'.join(name for name in predictors if name != target)
    rsq = smf.ols(target + '~' + others, data=corolla).fit().rsquared
    vif_by_variable[target] = 1/(1-rsq)

d1 = {'Variables': list(vif_by_variable.keys()), 'VIF': list(vif_by_variable.values())}
Vif_frame = pd.DataFrame(d1)
Vif_frame


In [None]:
# No variable has VIF > 20, so there is no collinearity problem; keep all variables in the regression equation

In [None]:
# Residual analysis
# Normality check: Q-Q plot of the fitted model's residuals (model.resid)

import statsmodels.api as sm
qqplot = sm.qqplot(data=model.resid, line='q')
plt.title(label='Normal Q-Q plot of residuals')
plt.show()

In [None]:
# Positions of residuals above +6000 (outlier threshold chosen from the Q-Q plot)
[*np.where(model.resid > 6000)]

In [None]:
# Positions of residuals below -6000
[*np.where(model.resid < -6000)]

In [None]:
# Homoscedasticity / heteroscedasticity check: plot the model's standardized
# fitted values against its standardized residuals

def get_standardized_values(vals):
    """Return `vals` centred on its mean and scaled by its standard deviation.

    Uses the `.mean()` and `.std()` methods of `vals` itself, so the degrees of
    freedom used for the standard deviation are whatever that type defaults to.
    """
    centre = vals.mean()
    spread = vals.std()
    return (vals - centre)/spread

In [None]:
# Standardize both axes first, then draw the residual-vs-fitted scatter
std_fitted = get_standardized_values(model.fittedvalues)
std_resid = get_standardized_values(model.resid)
plt.scatter(std_fitted, std_resid)

plt.title(label='Residual plot')
plt.xlabel(xlabel='Standardized Fitted Values')
plt.ylabel(ylabel='Standardized residual values')
plt.show()

In [None]:
# Plot the residuals (errors) against each regressor (independent 'x' variable / predictor)
# using the regression diagnostic plots: sm.graphics.plot_regress_exog(model, 'x', fig=fig)    # exog = x-variable & endog = y

In [None]:
# Regression diagnostic plots for every predictor in the model.
# One figure per regressor, drawn in the same order as before; a loop replaces
# seven copy-pasted cells that differed only in the regressor name.
for regressor in ['Age', 'KM', 'HP', 'Doors', 'Cylinders', 'Gears', 'Weight']:
    fig = plt.figure(figsize=(15,8))
    fig = sm.graphics.plot_regress_exog(model, regressor, fig=fig)
    plt.show()

# Model Deletion Diagnostics (checking Outliers or Influencers)
Two Techniques : 1. Cook's Distance & 2. Leverage value

In [None]:
# Influence measures for the fitted model; cooks_distance yields a pair, of
# which only the first element (the distances) is kept in `c`
model_influence = model.get_influence()
c, _ = model_influence.cooks_distance


In [None]:
# Stem plot of Cook's distance for every observation used in the fit.
# plt.subplots returns a (figure, axes) pair, so unpack it instead of binding
# the whole tuple to `fig`; the x positions are sized from `c` itself so the
# two arrays passed to stem always have matching lengths.
fig, ax = plt.subplots(figsize=(20,7))
ax.stem(np.arange(len(c)), np.round(c,3))
ax.set_xlabel('Row index')
ax.set_ylabel('Cooks Distance')
plt.show()

In [None]:
# Position and value of the single largest Cook's distance.
# Note: no threshold is applied here; compare the value against 0.5 by eye.
np.argmax(c), np.max(c)

In [None]:
# Influence plot for the fitted model
from statsmodels.graphics.regressionplots import influence_plot
influence_plot(results=model)
plt.show()

In [None]:
# Leverage cutoff 3*(k+1)/n, where k is the number of predictors in the fitted
# model and n the number of observations it was fitted on.
# Both are read from the model itself: the frame's column count includes the
# response and any columns that are not in the formula, which inflated k.
k  = int(model.df_model)   # regressors, not counting the intercept
n  = int(model.nobs)       # rows actually used in the fit
leverage_cutoff = 3*((k+1)/n)


In [None]:
# Display the leverage threshold computed above
leverage_cutoff

In [None]:
# Inspect the row with index label 220.
# NOTE(review): 220 is hard-coded - presumably the position reported by
# np.argmax(c) in the author's run; confirm against the current data.
corolla.loc[corolla.index.isin([220])]

# Improving the model

In [None]:
# Work on an independent copy so `corolla` itself is left untouched
corolla_new = corolla.copy(deep=True)
corolla_new

In [None]:
# Drop the row at position 220 (hard-coded influencer), then renumber.
# reset_index() without drop=True keeps the old labels in an 'index' column.
df = corolla_new.drop(index=corolla_new.index[[220]]).reset_index()

In [None]:
# First five rows after removing the influencer
df.head(n=5)

In [None]:
# Discard the leftover 'index' column holding the old row labels
df = df.drop(columns=['index'])

In [None]:
# Confirm the 'index' column is gone
df.head(n=5)

# Model Deletion Diagnostics and Final Model

In [None]:
# Iteratively remove the most influential observation (largest Cook's
# distance) from `df` and refit, until R-squared reaches the target.
#
# Differences from the previous version of this cell:
#   * `final_model` is always defined, even when no row needs to be dropped;
#   * the stop condition is evaluated on the model fitted to the current `df`,
#     so no extra row is removed after the target has been reached;
#   * `model` and `c` from the earlier cells are no longer overwritten;
#   * the loop is guaranteed to terminate.
# NOTE(review): deleting rows until a chosen R-squared is hit tunes the data to
# the metric; report how many rows were removed alongside the final figure.
formula = 'Price~Age+KM+HP+Doors+Cylinders+Gears+Weight'
target_rsquared = 0.90

final_model = smf.ols(formula, data=df).fit()
n_dropped = 0
# Keep more rows than estimated parameters so a fit is always possible and the
# loop cannot run forever if the target is unreachable.
while final_model.rsquared < target_rsquared and len(df) > len(final_model.params) + 1:
    cooks_d = final_model.get_influence().cooks_distance[0]
    worst_row = int(np.argmax(cooks_d))
    df = df.drop(df.index[worst_row]).reset_index(drop=True)
    final_model = smf.ols(formula, data=df).fit()
    n_dropped += 1

if final_model.rsquared < target_rsquared:
    print("Warning: target R-squared", target_rsquared, "was not reached")
print("Rows dropped:", n_dropped)
print("Thus model accuracy is improved to",final_model.rsquared)

In [None]:
# One hypothetical car to score, built as a single record with index label 1
new_data = pd.DataFrame(
    [{'Age':71,'KM':17016,'HP':86,'Doors':3,'Cylinders':4,'Gears':5,'Weight':1015}],
    index=[1],
)

In [None]:
# Show the single-row frame that will be passed to the model
new_data

In [None]:
# Predicted price for the hypothetical car
final_model.predict(exog=new_data)

In [None]:
# In-sample predictions for every row kept in `df`
pred_x = final_model.predict(exog=df)

In [None]:
# Display the in-sample predictions
pred_x

In [None]:
# First five rows of the data the final model was fitted on
df.head(n=5)

In [None]:
# Last five rows of the data the final model was fitted on
df.tail(n=5)