# **Import libraries**

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import influence_plot

# **Import the 50_Startups dataset**

In [None]:
# Make "50_Startups.csv" available in the working directory.
# The interactive Colab upload is only triggered when the file is missing, so
# "Restart & Run All" does not block on a prompt once the data is in place, and
# the notebook also runs outside Colab when the CSV sits next to it.
uploaded = {}  # stays empty when no upload was needed
if not Path("50_Startups.csv").exists():
    try:
        # Colab-only helper; deliberately imported here because it does not
        # exist in a regular Jupyter environment.
        from google.colab import files
    except ImportError as exc:
        raise FileNotFoundError(
            "50_Startups.csv was not found in the working directory and "
            "google.colab is not available to upload it."
        ) from exc
    uploaded = files.upload()

In [None]:
# Load the raw dataset from the CSV in the working directory
startups = pd.read_csv(filepath_or_buffer="50_Startups.csv")

In [None]:
# Display the loaded frame to eyeball the columns and values
startups

# **EDA and Visualization**

In [None]:
# Column names, dtypes and non-null counts of the raw frame
startups.info()

In [None]:
# Number of missing entries in each column
startups.isna().sum()

In [None]:
# Give the predictors short names without spaces or '&' so they can be used
# directly inside statsmodels formulas.
startups1 = startups.rename(
    columns={
        'R&D Spend': 'RDS',
        'Administration': 'ADMS',
        'Marketing Spend': 'MKTS',
    }
)
startups1

In [None]:
# Rows that exactly repeat an earlier row; an empty result means no duplicates
startups1.loc[startups1.duplicated()]

In [None]:
# Summary statistics of the renamed frame
startups1.describe()

# **Correlation Matrix**

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently and raises instead. When every column is numeric the result is
# the same as calling .corr() on the whole frame.
# NOTE(review): this dataset is assumed to carry a text column (e.g. a state
# name) next to the spend/profit columns — confirm against the CSV.
startups1.select_dtypes(include='number').corr()

# **Scatter Plot between Variables along with histogram**

In [None]:
# Pairwise scatter plots with per-variable histograms on the diagonal
sns.set_style('darkgrid')
sns.pairplot(data=startups1)

# **Preparing the Model**

In [None]:
# Full model: Profit regressed on all three spend variables (OLS)
model = smf.ols(formula='Profit~RDS+ADMS+MKTS', data=startups1).fit()

In [None]:
# Coefficients of the fitted full model
model.params

In [None]:
# t-statistics and p-values of each coefficient in the full model
print(model.tvalues,'\n',model.pvalues)

In [None]:
# R-squared and adjusted R-squared of the full model
(model.rsquared , model.rsquared_adj)

Hence, the model explains approximately 94.75% of the variance in Profit. (R-squared measures goodness of fit; it is not a prediction accuracy.)

# **Simple Regression & MultiRegression Model**

In [None]:
# ADMS and MKTS were insignificant in the full model, so test each of them
# alone (simple regression) and then together (multiple regression).
# Simple regression: Profit on ADMS only.
slr_c = smf.ols(formula='Profit~ADMS', data=startups1).fit()
(slr_c.tvalues, slr_c.pvalues)

On its own, ADMS still has an insignificant p-value.

In [None]:
# Simple regression: Profit on MKTS only
slr_d = smf.ols(formula='Profit~MKTS', data=startups1).fit()
(slr_d.tvalues, slr_d.pvalues)

**On its own, MKTS has a significant p-value.**

In [None]:
# Multiple regression: Profit on ADMS and MKTS together
mlr_cd = smf.ols(formula='Profit~ADMS+MKTS', data=startups1).fit()
(mlr_cd.tvalues, mlr_cd.pvalues)

In the multiple regression with both predictors, ADMS and MKTS each have significant p-values.

# **Model Validation**

In [None]:
# collinearity check

# **Calculate VIF = 1/(1-Rsquare)**

In [None]:
# Collinearity check: regress each predictor on the other two and turn the
# resulting R-squared into a variance inflation factor, VIF = 1 / (1 - R^2).

def _aux_rsquared(formula):
    """Return the R-squared of an OLS fit of `formula` on startups1."""
    return smf.ols(formula, data=startups1).fit().rsquared


rsq_r = _aux_rsquared("RDS~ADMS+MKTS")
rsq_a = _aux_rsquared("ADMS~RDS+MKTS")
rsq_m = _aux_rsquared("MKTS~RDS+ADMS")

vif_r = 1/(1-rsq_r)
vif_a = 1/(1-rsq_a)
vif_m = 1/(1-rsq_m)

In [None]:
# Collect the three VIF values in one table, one row per predictor
d1 = dict(
    Variables=['RDS', 'ADMS', 'MKTS'],
    Vif=[vif_r, vif_a, vif_m],
)
Vif_df = pd.DataFrame(data=d1)
Vif_df

Hence, no variable has VIF > 20, so there is no serious collinearity, which means we will keep all variables in the regression equation.

# **Residual Analysis**

In [None]:
# Normality check of the residuals: Q-Q plot of the full model's residuals
# (model.resid). line='q' asks statsmodels for a reference line fitted
# through the quartiles.

sm.qqplot(model.resid,line='q')
plt.title("Normal Q-Q plot of residuals")
plt.show()

In [None]:
# Q-Q plot of the residuals (same call as the previous cell).
# NOTE(review): this cell repeats the plot above — confirm whether both are needed.
sm.qqplot(model.resid,line='q') # reference line through the quartiles; line='45' would draw a 45-degree diagonal instead
plt.title("Normal Q-Q plot of residuals")
plt.show()

In [None]:
## Outlier detection from the Q-Q plot of residuals above


In [None]:
# Positions of the observations whose residual is below -30000.
# NOTE(review): the -30000 threshold looks like it was read off the Q-Q plot
# by eye — confirm.
list(np.where(model.resid<-30000))

# **Residual plot for Homoscedasticity**

In [None]:
def get_standardized_values(vals):
    """Return `vals` centred on its mean and scaled by its standard deviation.

    `vals` must support arithmetic and expose `.mean()` and `.std()`; the
    standard deviation used is whatever `vals.std()` returns for that type.
    """
    centred = vals - vals.mean()
    return centred / vals.std()

In [None]:
# Standardized residuals against standardized fitted values: a patternless
# cloud around zero supports constant error variance.
std_fitted = get_standardized_values(model.fittedvalues)
std_resid = get_standardized_values(model.resid)

plt.scatter(std_fitted, std_resid)
plt.title('Residual Plot')
plt.xlabel('standardized fitted values')
plt.ylabel('standardized residual values')
plt.show()

# **Residual Vs Regressor**

In [None]:
# Regression diagnostic panel for RDS
# (exog = the x-variable, endog = the y-variable)
fig = plt.figure(figsize=(15, 8))
sm.graphics.plot_regress_exog(results=model, exog_idx='RDS', fig=fig)
plt.show()

In [None]:
# Regression diagnostic panel for ADMS
fig = plt.figure(figsize=(15, 8))
sm.graphics.plot_regress_exog(results=model, exog_idx='ADMS', fig=fig)
plt.show()

In [None]:
# Regression diagnostic panel for MKTS
fig = plt.figure(figsize=(15, 8))
sm.graphics.plot_regress_exog(results=model, exog_idx='MKTS', fig=fig)
plt.show()

# **Model Deletion Diagnostics**

# **Detecting influencers/outliers**

# **Cook's Distance**

In [None]:
# Cook's distance of every observation under the full model.
# cooks_distance is unpacked into two parts; only the first (the distances) is kept.
model_influence = model.get_influence()
c, _ = model_influence.cooks_distance
c

In [None]:
# Stem plot of Cook's distance per observation.
# plt.subplots returns a (Figure, Axes) pair, so both are unpacked; the x
# positions are derived from `c` itself so they always match its length.
fig, ax = plt.subplots(figsize=(20, 7))
ax.stem(np.arange(len(c)), np.round(c, 3))
ax.set_title("Cook's distance per observation")
ax.set_xlabel('Row Index')
ax.set_ylabel('Cooks Distance')
plt.show()

In [None]:
# Position and value of the largest Cook's distance; the observation is
# treated as an influencer when that value exceeds 0.5
np.argmax(c) , np.max(c)

# **High influence point**

In [None]:
# statsmodels influence plot for the full model
influence_plot(model)
plt.show()

In [None]:
# Leverage cutoff = 3*(k+1)/n, where k is the number of predictors in the
# model and n is the number of observations it was fitted on.
# Both are taken from the fitted model rather than from the frame's shape:
# the frame's column count also includes the response (and any column not used
# in the formula), which would inflate the cutoff.
k = int(model.df_model)  # regressors, excluding the intercept
n = int(model.nobs)      # observations used in the fit
leverage_cutoff = (3*(k+1))/n
leverage_cutoff

In [None]:
# Inspect the row labelled 49.
# NOTE(review): 49 is hard-coded, presumably the influencer found by the
# Cook's distance / influence plot above — confirm.
startups1.loc[startups1.index.isin([49])]

# **Improving the model**

In [None]:
# Exclude the data points which are influencers and reset the row number (reset_index(drop=True))
startups2=startups1.drop(startups1.index[[49]],axis=0).reset_index(drop=True)
startups2

# **Model Deletion Diagnostics and Final Model**

In [None]:
# Refit on startups2 and remove the single most influential row, repeating
# until no observation has a Cook's distance above 0.5.
# Each pass fits first and checks the distances of *that* fit before dropping
# anything, so a row is only removed when it really exceeds the cutoff, and
# re-running this cell on already-clean data changes nothing.
while True:
    final_model = smf.ols("Profit~RDS+ADMS+MKTS", data=startups2).fit()
    (c, _) = final_model.get_influence().cooks_distance
    if not np.max(c) > 0.5:
        # No influencer left: this fit is the final model.
        break
    # Drop the row with the largest Cook's distance and renumber the rows.
    startups2 = startups2.drop(startups2.index[[np.argmax(c)]], axis=0).reset_index(drop=True)

print("Thus model accuracy is improved to",final_model.rsquared)

In [None]:
# R-squared and AIC of the final model
(final_model.rsquared , final_model.aic)

In [None]:
# R-squared of the final model on its own
final_model.rsquared

In [None]:
# Cook's distance of every observation under the final model.
# The influence object is bound to `model_influence`, the name that is read on
# the next line, so the distances come from final_model and not from an
# earlier fit.
model_influence = final_model.get_influence()
(c, _) = model_influence.cooks_distance
c


# **Final model predictions**

In [None]:
# Take new data for prediction
new_data=pd.DataFrame({'RDS':75000,"ADMS":80000,"MKTS":120000},index=[0])
new_data

In [None]:
# Predicted Profit for the hypothetical startup in new_data
final_model.predict(new_data)

In [None]:
# Predictions of the final model for the rows it was fitted on
pred_y = final_model.predict(exog=startups2)
pred_y