# **Import libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.graphics.regressionplots import influence_plot

# **Import the ToyotaCorolla file**

In [None]:
# Colab-specific upload step. The next cell reads "ToyotaCorolla.csv" from the
# working directory, so presumably that is the file to upload here (outside
# Colab this cell will fail on the import and can be skipped).
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the raw dataset; the file is decoded as latin1 rather than the default encoding.
Cars= pd.read_csv("ToyotaCorolla.csv",encoding='latin1')

In [None]:
# Display the raw frame as loaded (before any column selection or renaming).
Cars

# **EDA and Visualization**

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
Cars.info()

In [None]:
# Count missing values per column of the raw frame.
Cars.isnull().sum()

In [None]:
# Keep only the columns used for modelling, selected by position in the raw
# frame and concatenated side by side in this order.
column_slices=[slice(2,4),slice(6,7),slice(8,9),slice(12,14),slice(15,18)]
Cars1=pd.concat([Cars.iloc[:,cols] for cols in column_slices],axis=1)
Cars1

In [None]:
# Shorten three column names so they are easier to use in model formulas.
short_names={'Age_08_04':'Age','cc':'CC','Quarterly_Tax':'QT'}
Cars2=Cars1.rename(columns=short_names)
Cars2

In [None]:
# Show rows that are exact repeats of an earlier row.
duplicate_mask=Cars2.duplicated()
Cars2.loc[duplicate_mask]

In [None]:
# Remove the repeated rows, then renumber the index from 0 so that row labels
# and row positions coincide in the cells below.
Cars3=Cars2.drop_duplicates()
Cars3=Cars3.reset_index(drop=True)
Cars3

In [None]:
# Summary statistics of the de-duplicated modelling frame.
Cars3.describe()

# **Correlation Matrix**

In [None]:
# Pairwise correlation between all columns of the modelling frame.
Cars3.corr()

# **Scatter Plot between Variables along with histogram**

In [None]:
# Pairwise scatter plots of every column against every other, drawn on the
# seaborn "darkgrid" theme.
sns.set_style('darkgrid')
sns.pairplot(data=Cars3)

# **Preparing the Model**

In [None]:
# Fit the full multiple linear regression of Price on all eight predictors.
price_formula='Price~Age+KM+HP+CC+Doors+Gears+QT+Weight'
model=smf.ols(formula=price_formula,data=Cars3).fit()

In [None]:
# Fitted coefficients of the full model (intercept and one slope per predictor).
model.params

In [None]:
# R-squared and adjusted R-squared of the full model, shown as a pair.
(model.rsquared , model.rsquared_adj)

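**Hence, the model explains approximately 86.17% of the variance in Price (R² ≈ 0.8617). Note that R² measures goodness of fit, not prediction accuracy.**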


# **Simple Regression & MultiRegression Model**

In [None]:
# Simple linear regression of Price on CC alone. The original note says CC and
# Doors were insignificant (presumably in the full model above; confirm with
# model.summary()). The cell displays the t-values and p-values of this fit.
slr_c=smf.ols('Price~CC',data=Cars3).fit()
slr_c.tvalues , slr_c.pvalues 

CC has a significant p-value when it is the only predictor of Price.

In [None]:
# Simple linear regression of Price on Doors alone; displays its t-values and p-values.
slr_d=smf.ols('Price~Doors',data=Cars3).fit()
slr_d.tvalues , slr_d.pvalues

Doors has a significant p-value when it is the only predictor of Price.

In [None]:
# Regression of Price on CC and Doors together; displays the t-values and p-values.
mlr_cd=smf.ols('Price~CC+Doors',data=Cars3).fit()
mlr_cd.tvalues , mlr_cd.pvalues

Now, CC and Doors both have significant p-values when used together.

# **Model Validation**

In [None]:
# collinearity check

# **Calculate VIF = 1/(1-Rsquare)**

In [None]:
# Auxiliary regressions: regress each predictor on all the other predictors and
# convert the resulting R-squared into a variance inflation factor,
# VIF = 1/(1 - R^2).
_predictors=['Age','KM','HP','CC','Doors','Gears','QT','Weight']

def _aux_rsquared(target):
    """Return the R-squared of `target` regressed on the remaining predictors of Cars3."""
    # Predictors keep their list order, so each formula reads e.g. 'KM~Age+HP+...'.
    others='+'.join(name for name in _predictors if name!=target)
    return smf.ols(target+'~'+others,data=Cars3).fit().rsquared

(rsq_age,rsq_KM,rsq_HP,rsq_CC,
 rsq_DR,rsq_GR,rsq_QT,rsq_WT)=[_aux_rsquared(name) for name in _predictors]

(vif_age,vif_KM,vif_HP,vif_CC,
 vif_DR,vif_GR,vif_QT,vif_WT)=[1/(1-rsq) for rsq in (rsq_age,rsq_KM,rsq_HP,rsq_CC,
                                                      rsq_DR,rsq_GR,rsq_QT,rsq_WT)]

In [None]:
# Collect the VIF of every predictor into one table for side-by-side comparison.
variable_names=['Age','KM','HP','CC','Doors','Gears','QT','Weight']
vif_values=[vif_age,vif_KM,vif_HP,vif_CC,vif_DR,vif_GR,vif_QT,vif_WT]
d1={'Variables':variable_names,'Vif':vif_values}
Vif_fram=pd.DataFrame(d1)
Vif_fram

Hence, no variable has a VIF greater than 20, so there is no serious collinearity; this means we can keep all the variables in the regression equation.



# **Residual Analysis**

In [None]:
# Normal Q-Q plot of the full model's residuals, used to judge whether the
# residuals look normally distributed.
sm.qqplot(model.resid,line='q') # line='q' draws the reference line through the quartiles; line='45' (a string) would draw the 45-degree diagonal instead
plt.title("Normal Q-Q plot of residuals")
plt.show()

In [None]:
# Outlier detection suggested by the Q-Q plot of residuals: positions of
# observations whose residual is above +6000.
large_positive_resid=model.resid>6000
list(np.where(large_positive_resid))

In [None]:
# Positions of observations whose residual is below -6000.
large_negative_resid=model.resid<-6000
list(np.where(large_negative_resid))

# **Residual plot for Homoscedasticity**

In [None]:
def get_standardized_values(vals):
    """Return `vals` centred on its mean and scaled by its standard deviation.

    `vals` must provide `.mean()` and `.std()` and support element-wise
    subtraction and division. The standard-deviation convention is whatever
    `vals.std()` uses by default for that type.
    """
    centre = vals.mean()
    spread = vals.std()
    return (vals - centre) / spread

In [None]:
# Standardized residuals against standardized fitted values of the full model.
std_fitted=get_standardized_values(model.fittedvalues)
std_resid=get_standardized_values(model.resid)
plt.scatter(std_fitted,std_resid)
plt.xlabel('standardized fitted values')
plt.ylabel('standardized residual values')
plt.title('Residual Plot')
plt.show()

# **Residual Vs Regressor**

In [None]:
# One regression-diagnostic figure per regressor of the full model
# (exog = x-variable, endog = y-variable). A single loop replaces eight
# copy-pasted cells that differed only in the regressor name.
for regressor in ['Age','KM','HP','CC','Doors','Gears','QT','Weight']:
    fig=plt.figure(figsize=(15,8))
    sm.graphics.plot_regress_exog(model,regressor,fig=fig)
    plt.show()

# **Model Deletion Diagnostics**

# **Detecting influencers/outliers**

# **Cook's Distance**

In [None]:
# Influence diagnostics for the full model. `cooks_distance` is unpacked as a
# pair: the first element is kept as `c` (one value per observation) and the
# second element is discarded.
model_influence = model.get_influence()
(c,_)=model_influence.cooks_distance
c

In [None]:
# Stem plot of Cook's distance per row, to spot influential observations.
# plt.subplots returns a (Figure, Axes) pair, so unpack it instead of binding
# the whole tuple to `fig`; the x positions are derived from `c` itself so the
# two stem arguments always have matching lengths.
fig,ax=plt.subplots(figsize=(20,7))
ax.stem(np.arange(len(c)),np.round(c, 3))
ax.set_xlabel('Row Index')
ax.set_ylabel('Cooks Distance')
plt.show()

In [None]:
# Position and value of the largest Cook's distance. No filtering happens here;
# the value still has to be compared against the chosen cutoff (0.5 is used in
# the deletion loop further down).
np.argmax(c) , np.max(c)

# **High influence point**

In [None]:
# Influence plot of the full model drawn on a large axes so the point labels
# stay readable; `fig` is rebound to the figure returned by influence_plot.
fig,ax=plt.subplots(figsize=(20,20))
fig=influence_plot(model,ax = ax)
plt.show()

## **From the above plot it is evident that data point 80 is an influencer.**

In [None]:
# Show the row labelled 80, the observation flagged as an influencer.
Cars3.loc[Cars3.index.isin([80])]

# **Improving the model**

In [None]:
# Work on a copy so that Cars3 (the data behind `model`) is left unchanged.
Cars_new=Cars3.copy()

In [None]:
# Display the working copy before any influential rows are removed.
Cars_new

In [None]:
# Remove the influential observation at position 80 and renumber the rows
# from 0 (reset_index(drop=True) discards the old index).
influencer_positions=[80]
Cars4=Cars_new.drop(index=Cars_new.index[influencer_positions]).reset_index(drop=True)
Cars4

# **Model Deletion Diagnostics and Final Model**

In [None]:
# Iterative deletion diagnostics: fit on the current Cars4, and while the
# largest Cook's distance is above the cutoff, drop that single observation
# and refit.
#
# The check is made on the model fitted to the *current* data before anything
# is dropped. The earlier version dropped the arg-max row after every fit,
# even when the fit was already below the cutoff, and tested a stale `c`
# computed in another cell.
#
# `model` (the fit on Cars3) is deliberately left untouched so the diagnostic
# cells above stay reproducible; the refitted model lives in `final_model`.
cooks_cutoff=0.5

final_model=smf.ols('Price~Age+KM+HP+CC+Doors+Gears+QT+Weight',data=Cars4).fit()
(c,_)=final_model.get_influence().cooks_distance

while np.max(c)>cooks_cutoff :
    # Remove only the most influential row, then renumber so positions == labels.
    Cars4=Cars4.drop(Cars4.index[[np.argmax(c)]],axis=0).reset_index(drop=True)
    final_model=smf.ols('Price~Age+KM+HP+CC+Doors+Gears+QT+Weight',data=Cars4).fit()
    (c,_)=final_model.get_influence().cooks_distance

# On exit, `c` and `final_model` both describe the final Cars4.
print("Thus model accuracy is improved to",final_model.rsquared)

In [None]:
# R-squared of the model refitted after the deletion diagnostics.
final_model.rsquared

In [None]:
# Recompute Cook's distance on the final model; `model_influence` and `c` are
# rebound here, so from this point on they refer to final_model, not model.
model_influence= final_model.get_influence()
(c,_) = model_influence.cooks_distance
c

In [None]:
# Position and value of the largest Cook's distance in the final model.
(np.argmax(c),np.max(c))

In [None]:
# Stem plot of Cook's distance per row for the final model.
# plt.subplots returns a (Figure, Axes) pair, so unpack it instead of binding
# the whole tuple to `fig`; the x positions are derived from `c` itself so the
# two stem arguments always have matching lengths.
fig,ax=plt.subplots(figsize=(20,7))
ax.stem(np.arange(len(c)),np.round(c, 3))
ax.set_xlabel('Row Index')
ax.set_ylabel('Cooks Distance')
plt.show()

# **Since the largest Cook's distance (0.16) is well below 1 (and below the 0.5 cutoff used in the loop above), we can stop the diagnostic process and finalize the model**

# **Final model predictions**

In [None]:
# A single hypothetical car to score with the final model; the keys match the
# predictor names used in the model formula.
new_car={
    'Age':15,
    "KM":42000,
    "HP":82,
    "CC":1200,
    "Doors":4,
    "Gears":5,
    "QT":70,
    "Weight":1014,
}
new_data=pd.DataFrame(new_car,index=[0])
new_data

In [None]:
# Predicted Price for the single new observation.
final_model.predict(new_data)

In [None]:
# In-sample predictions: predicted Price for every row the final model was fitted on.
pred_y=final_model.predict(Cars4)
pred_y

# **So, this is our final model.**