<a href="https://colab.research.google.com/github/Bady9898/R_Python_libraries/blob/main/Multi_Linear_Regression_(toyoto).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.regressionplots import influence_plot
import statsmodels.formula.api as smf
import numpy as np

In [None]:
# Load the dataset from a CSV file in the working directory and preview the first rows.
toyoto = pd.read_csv("Toyoto_Corrola.csv")
toyoto.head()

In [None]:
# Structural summary of the frame (columns, non-null counts, dtypes).
toyoto.info()

In [None]:
# Number of missing values in each column.
toyoto.isna().sum()

# Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns.
# Selecting the numeric columns explicitly keeps this cell working on
# pandas >= 2.0, where corr() raises instead of silently skipping
# non-numeric columns.
toyoto.select_dtypes(include='number').corr()

# Scatterplot between variables along with histograms

In [None]:
# Pair-wise scatter plots (with per-variable distributions) on a dark grid.
# `toyoto1` is only created further down the notebook, so the same frame
# (everything except 'Cylinders') is derived from `toyoto` here; this keeps
# the cell runnable top-to-bottom on a fresh kernel.
sns.set_style(style='darkgrid')
sns.pairplot(toyoto.drop(['Cylinders'], axis=1));

# Preparing a model

In [None]:
# Working copy of the data without the 'Cylinders' column.
toyoto1 = toyoto.drop(columns=['Cylinders'])
toyoto1

In [None]:
# Fit the multiple linear regression of Price on all six predictors.
# (`smf` is already imported in the first cell of the notebook.)
full_formula = 'Price ~  Age_08_04+KM+HP+Doors+Gears+Weight'
model = smf.ols(formula=full_formula, data=toyoto1).fit()

In [None]:
# Estimated coefficients of the fitted model.
model.params

In [None]:
# t-statistics, then the matching p-values, for every coefficient.
print(model.tvalues, '\n', model.pvalues)

In [None]:
# (R-squared, adjusted R-squared) of the full model.
(model.rsquared,model.rsquared_adj)

# Simple Linear Regression Models

In [None]:
# Simple regression of Price on Age_08_04 alone.
ml_v=smf.ols('Price ~ Age_08_04',data = toyoto1).fit()  
# t-statistics and p-values of this single-predictor model
print(ml_v.tvalues, '\n', ml_v.pvalues)  

In [None]:
# Simple regression of Price on KM alone, with its t-statistics and p-values.
ml_w=smf.ols('Price ~ KM',data = toyoto1).fit()  
print(ml_w.tvalues, '\n', ml_w.pvalues)  

In [None]:
# Regression of Price on Age_08_04 and KM together, with its t-statistics and p-values.
ml_wv=smf.ols('Price ~ Age_08_04+KM',data = toyoto1).fit()  
print(ml_wv.tvalues, '\n', ml_wv.pvalues)  

# Calculating VIF

In [None]:
# Variance inflation factor for each predictor of the Price model.
# VIF_j = 1 / (1 - R2_j), where R2_j is the R-squared obtained by regressing
# predictor j on the *other predictors*. The response (Price) is left out on
# purpose: it is not an explanatory variable, and using it as a regressor in
# the auxiliary fits would inflate every VIF.
vif_predictors = ['KM', 'Age_08_04', 'HP', 'Doors', 'Gears', 'Weight']


def _vif(target, others, data):
    """Return the VIF of `target` from an OLS fit of `target` on `others`."""
    formula = target + ' ~ ' + ' + '.join(others)
    rsq = smf.ols(formula, data=data).fit().rsquared
    return 1 / (1 - rsq)


vif_values = [
    _vif(name, [other for other in vif_predictors if other != name], toyoto1)
    for name in vif_predictors
]

# Individual names kept for any cell that refers to them.
vif_km, vif_age, vif_hp, vif_doors, vif_gears, vif_weight = vif_values

# Storing vif values in a data frame
Vif_frame = pd.DataFrame({'Variables': vif_predictors, 'VIF': vif_values})
Vif_frame

# Residual Analysis

## Test for Normality of Residuals (Q-Q Plot)

In [None]:
import statsmodels.api as sm
# Q-Q plot of the model residuals to judge how close they are to normal.
qqplot=sm.qqplot(model.resid,line='q') # 'q' fits the reference line through the quartiles; '45' would draw the 45-degree diagonal
plt.title("Normal Q-Q plot of residuals")
plt.show()

In [None]:
# Positions of the observations whose residual is greater than 10
# (np.where returns a tuple of index arrays, converted to a list here).
# NOTE(review): the cut-off of 10 is in the units of Price — confirm it is the intended threshold.
list(np.where(model.resid>10))

## Residual Plot for Homoscedasticity

In [None]:
def get_standardized_values( vals ):
    """Return `vals` as z-scores: centred on its mean and divided by its standard deviation."""
    centred = vals - vals.mean()
    spread = vals.std()
    return centred / spread

In [None]:
# Standardized residuals against standardized fitted values.
std_fitted = get_standardized_values(model.fittedvalues)
std_resid = get_standardized_values(model.resid)
plt.scatter(std_fitted, std_resid)

plt.title('Residual Plot')
plt.xlabel('Standardized Fitted values')
plt.ylabel('Standardized residual values')
plt.show()

## Residual Vs Regressors

In [None]:
# Regression diagnostic plots for the "KM" regressor of `model`, drawn on a 15x8 figure.
fig = plt.figure(figsize=(15,8))
fig = sm.graphics.plot_regress_exog(model, "KM", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots for the "HP" regressor of `model`, drawn on a 15x8 figure.
fig = plt.figure(figsize=(15,8))
fig = sm.graphics.plot_regress_exog(model, "HP", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots for "Gears" and "Weight".
# "HP" is already covered by the preceding cell, whereas these two regressors
# are part of the fitted model but have no diagnostic plot anywhere else.
for regressor in ("Gears", "Weight"):
    fig = plt.figure(figsize=(15,8))
    fig = sm.graphics.plot_regress_exog(model, regressor, fig=fig)
    plt.show()

In [None]:
# Regression diagnostic plots for the "Age_08_04" regressor of `model`, drawn on a 15x8 figure.
fig = plt.figure(figsize=(15,8))
fig = sm.graphics.plot_regress_exog(model, "Age_08_04", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots for the "Doors" regressor of `model`, drawn on a 15x8 figure.
fig = plt.figure(figsize=(15,8))
fig = sm.graphics.plot_regress_exog(model, "Doors", fig=fig)
plt.show()

# Model Deletion Diagnostics

## Detecting Influencers/Outliers

## Cook’s Distance

In [None]:
# Influence measures for the full model; cooks_distance yields a pair, of
# which only the first element (the distances) is kept as `c`.
model_influence = model.get_influence()
(c, _) = model_influence.cooks_distance

In [None]:
#Plot the influencers values using stem plot
fig = plt.subplots(figsize=(20, 7))
plt.stem(np.arange(len(toyoto1)), np.round(c, 3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')
plt.show()

In [None]:
# Position and value of the largest Cook's distance (the single most influential row).
(np.argmax(c),np.max(c))

## High Influence points

In [None]:
# Influence plot for the full model.
# (`influence_plot` is already imported in the first cell of the notebook.)
influence_plot(model)
plt.show()

In [None]:
# Leverage cut-off 3*(k + 1)/n, with k the number of predictors in the fitted
# model and n the number of observations it was fitted on.
# Both are read from `model`: the column count of toyoto1 also includes Price
# and any column that is not in the formula, which would overstate k.
k = int(model.df_model)
n = int(model.nobs)
leverage_cutoff = 3*((k + 1)/n)

In [None]:
# Display the leverage cut-off value.
leverage_cutoff

#### From the above plot, it is evident that data point 221, 601, 956, 960,991 are the influencers

In [None]:
# Show the rows whose index label is one of the suspected influencers.
influencer_rows = [219, 597, 952, 956, 986]
toyoto1.loc[toyoto1.index.isin(influencer_rows)]

In [None]:
# First rows of the data, to compare typical HP and other values with the rows shown above.
toyoto1.head()

# Improving the model

In [None]:
# Read the CSV again into a separate frame that the clean-up below works on.
toyoto1_new = pd.read_csv("Toyoto_Corrola.csv")

In [None]:
# Remove the five influential rows, then renumber the remaining rows.
# reset_index() keeps the previous labels in a new 'index' column.
influencer_positions = [219, 597, 952, 956, 986]
toyoto2 = (
    toyoto1_new
    .drop(index=toyoto1_new.index[influencer_positions])
    .reset_index()
)

In [None]:
# Remove the 'index' column (the old row labels) from toyoto2.
# errors='ignore' makes the cell safe to re-run: without it, a second run
# raises KeyError because the column has already been removed.
toyoto2 = toyoto2.drop(columns=['index'], errors='ignore')

In [None]:
# Display the cleaned frame.
toyoto2

# Build Model

In [None]:
# Candidate model without "Gears" (Doors is kept), fitted on toyoto2.
final_ml_V= smf.ols('Price~KM+HP+Age_08_04+Weight+Doors',data = toyoto2).fit()

In [None]:
# (R-squared, AIC) of the model without "Gears".
(final_ml_V.rsquared,final_ml_V.aic)

In [None]:
# Candidate model without "Doors" (Gears is kept), fitted on toyoto2.
final_ml_W= smf.ols('Price~KM+HP+Age_08_04+Weight+Gears',data = toyoto2).fit()

In [None]:
# (R-squared, AIC) of the model without "Doors".
(final_ml_W.rsquared,final_ml_W.aic)

##### Comparing the R-squared and AIC values above, model 'final_ml_W' has the higher R-squared and the lower AIC value; hence include the variable 'Doors' so that the multicollinearity problem is resolved.

# Cook’s Distance

In [None]:
# Influence measures for final_ml_V; only the first element of the
# cooks_distance pair (the distances) is kept as `c_V`.
model_influence_V = final_ml_V.get_influence()
(c_V, _) = model_influence_V.cooks_distance

In [None]:
# Stem plot of Cook's distance (rounded to 3 decimals) for the rows of toyoto2.
# plt.subplots() returns (Figure, Axes); unpack it so `fig` really is the
# figure, and draw on the axes explicitly instead of the implicit current axes.
fig, ax = plt.subplots(figsize=(20,7))
ax.stem(np.arange(len(toyoto2)), np.round(c_V,3))
ax.set_xlabel('Row index')
ax.set_ylabel('Cooks Distance');

In [None]:
# Position and value of the largest Cook's distance for final_ml_V.
(np.argmax(c_V),np.max(c_V))

In [None]:
# Drop the single observation at row position 220 (one row, not 220 rows).
# NOTE(review): 220 is hard-coded; presumably it is the argmax reported by the previous cell — confirm.
toyoto3=toyoto2.drop(toyoto2.index[[220]],axis=0)

In [None]:
# Display the frame after the drop.
toyoto3

In [None]:
# Renumber the rows of toyoto3; reset_index() keeps the previous labels in a new 'index' column.
toyoto4=toyoto3.reset_index()

In [None]:
# Remove the 'index' column from toyoto4, leaving only the data columns.
toyoto5=toyoto4.drop(['index'],axis=1)

In [None]:
# Display the final cleaned frame.
toyoto5

In [None]:
# Refit the model without "Gears" on the cleaned data (toyoto5).
final_ml_V= smf.ols('Price~KM+HP+Age_08_04+Doors+Weight',data = toyoto5).fit()

In [None]:
# Recompute the influence measures for the refitted model; `c_V` now holds
# the Cook's distances for the rows of toyoto5.
model_influence_V = final_ml_V.get_influence()
(c_V, _) = model_influence_V.cooks_distance

In [None]:
# Stem plot of Cook's distance (rounded to 3 decimals) for the rows of toyoto5.
# plt.subplots() returns (Figure, Axes); unpack it so `fig` really is the
# figure, and draw on the axes explicitly instead of the implicit current axes.
fig, ax = plt.subplots(figsize=(20,7))
ax.stem(np.arange(len(toyoto5)), np.round(c_V,3))
ax.set_xlabel('Row index')
ax.set_ylabel('Cooks Distance');

In [None]:
# Position and value of the largest Cook's distance for the refitted model.
(np.argmax(c_V),np.max(c_V))

#### Since the value is <1 , we can stop the diagnostic process and finalize the model

In [None]:
# Check the accuracy of the model: fit Price on KM, HP, Age_08_04, Doors and Weight using toyoto5.
final_ml_V= smf.ols('Price~KM+HP+Age_08_04+Doors+Weight',data = toyoto5).fit()

In [None]:
# (R-squared, AIC) of the final model.
(final_ml_V.rsquared,final_ml_V.aic)

## Predicting for new data

In [None]:
# New observation to predict: a single row (index label 1) holding exactly the
# predictors used by the final formula (KM, HP, Age_08_04, Doors, Weight).
# The response 'Price' is what gets predicted, so it is not supplied as input.
new_data = pd.DataFrame(
    {"KM": 42000, "HP": 100, "Age_08_04": 45, "Doors": 3, "Weight": 1100},
    index=[1],
)

In [None]:
# Display the row that will be passed to the model.
new_data

In [None]:
# Predicted Price for the new observation using the final model.
final_ml_V.predict(new_data)

Price(Y)
Offer price in Euros
Age
Age in months as on August 2004
Kilometers
Accumulated kilometers on odometer
HP
Horsepower
Gears
Number of gears
CC
Cylinder volume in cubic centimeters
Doors
Number of doors
QuartTax
Quarterly Road Tax
Weight
Weight in kilograms


Price(Y)
Offer price in Euros
Age
Age in months as on August 2004
Kilometers
Accumulated kilometers on odometer
HP
Horsepower
Gears
Number of gears
CC
Cylinder volume in cubic centimeters
Doors
Number of doors
QuartTax
Quarterly Road Tax
Weight
Weight in kilograms
