In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the preprocessed train/test splits saved by the preprocessing step.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('preprocess_model.pkl', 'rb') as pkl_file:
    preprocess_model = pickle.load(pkl_file)

Restoring the data from the pickle file into variables

In [3]:
# Positions 0..3 of the restored object hold, in order:
# train features, train target, test features, test target.
x_train, y_train, x_test, y_test = (preprocess_model[pos] for pos in range(4))

# Linear regression

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
# Ordinary least-squares model, constructed with no arguments (library defaults).
regression = LinearRegression()

In [6]:
# Fit the linear model on the training split.
regression.fit(X=x_train, y=y_train)

LinearRegression()

In [7]:
# Show the fitted coefficients followed by the intercept, one per line.
print(regression.coef_, regression.intercept_, sep='\n')

[ 241.43824282    0.29956773    1.93156076 -133.79361719]
307.33452006980804


In [8]:
# Predict the target for the held-out test features with the linear model.
reg_pred = regression.predict(X=x_test)

In [9]:
# Display the linear model's predictions for the test split.
reg_pred

array([369.47692213, 149.75793751, 364.75716305, 295.94196972,
       133.65255732, 276.42539732, 260.44763676, 609.40538758,
       444.66114621, 124.8005163 , 444.79045205, 244.64269497,
       325.39516569, 502.21348636, 268.66000089, 536.18146691,
       308.98216506, 168.49080865, 224.28260447, 329.80977285,
       226.58223691, 148.7669609 , 305.27007723, 293.00498774,
       193.79969775, 204.35897771, 359.14290197, 405.55392131,
       177.25768787, 389.11201975, 294.11360717, 314.32646274,
       384.78165165, 393.61268264, 577.15535637, 205.76917261,
        95.87003768, 182.68695399, 517.98734606, 273.45562209,
       546.12806327, 284.10643202,   6.85850795, 352.98987346,
       386.08617774, 271.32714779, 338.90794874, 259.45676405,
       316.26399964, 373.39965688, 140.05210812, 341.31584266,
       361.36864058, 478.50058876, 236.91697016, 334.00199934,
       339.79019663, 192.21597953, 166.2109765 , 213.34955829,
       259.85086372, 367.61953943,  59.21195042, 353.65

In [10]:
from sklearn.metrics import r2_score

R square score

In [11]:
from sklearn.metrics import r2_score
# R^2 of the linear model on the test split (ground truth first, predictions second).
linear_r2 = r2_score(y_true=y_test, y_pred=reg_pred)
print(linear_r2)

0.9818148575003433


adjusted R square score

In [13]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n = number of test samples and p = number of features.
n_samples, n_features = len(y_test), x_test.shape[1]
adjusted_linear_r2 = 1 - (1 - linear_r2) * (n_samples - 1) / (n_samples - n_features - 1)
adjusted_linear_r2

0.9816587618565694

# Ridge regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge (L2-penalised) regression, constructed with no arguments (library defaults).
ridge = Ridge()

In [None]:
# Fit the ridge model on the training split.
ridge.fit(X=x_train, y=y_train)

In [None]:
# Ridge predictions for the held-out test features.
ridge_pred = ridge.predict(X=x_test)

In [None]:
# Display the ridge model's predictions for the test split.
ridge_pred

R square score

In [None]:
from sklearn.metrics import r2_score
# R^2 of the ridge model on the test split (ground truth first, predictions second).
ridge_r2 = r2_score(y_true=y_test, y_pred=ridge_pred)
print(ridge_r2)

Adjusted R square score

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n = number of test samples and p = number of features.
n_samples, n_features = len(y_test), x_test.shape[1]
adjusted_ridge_r2 = 1 - (1 - ridge_r2) * (n_samples - 1) / (n_samples - n_features - 1)
adjusted_ridge_r2

# Lasso regression

In [None]:
from sklearn.linear_model import Lasso
# Lasso (L1-penalised) regression, constructed with no arguments (library defaults).
lasso = Lasso()

In [None]:
# Fit the lasso model on the training split.
lasso.fit(X=x_train, y=y_train)

In [None]:
# Lasso predictions for the held-out test features.
lasso_pred = lasso.predict(X=x_test)

R square score

In [None]:
from sklearn.metrics import r2_score
# R^2 of the lasso model on the test split.
# r2_score is NOT symmetric: its signature is r2_score(y_true, y_pred), so the
# ground truth must come first. The previous call passed the predictions as
# y_true, which normalises by the variance of the predictions and yields a
# different (incorrect) score that is not comparable with the other models.
lasso_r2 = r2_score(y_true=y_test, y_pred=lasso_pred)
lasso_r2

Adjusted R square

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n = number of test samples and p = number of features.
n_samples, n_features = len(y_test), x_test.shape[1]
adjusted_lasso_r2 = 1 - (1 - lasso_r2) * (n_samples - 1) / (n_samples - n_features - 1)
adjusted_lasso_r2

# Elasticnet

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Elastic-net regression, constructed with no arguments (library defaults).
elastic = ElasticNet()

In [None]:
# Fit the elastic-net model on the training split.
elastic.fit(X=x_train, y=y_train)

In [None]:
# Elastic-net predictions for the held-out test features.
elastic_pred = elastic.predict(X=x_test)

In [None]:
# Display the elastic-net model's predictions for the test split.
elastic_pred

R square

In [None]:
from sklearn.metrics import r2_score
# R^2 of the elastic-net model on the test split.
# Two fixes versus the previous call:
#   1. score the elastic-net predictions (elastic_pred), not the lasso ones —
#      the old code silently reported lasso's score under the elastic-net name;
#   2. pass the ground truth first: the signature is r2_score(y_true, y_pred)
#      and the metric is not symmetric in its arguments.
elasticnet_r2 = r2_score(y_true=y_test, y_pred=elastic_pred)
elasticnet_r2

Adjusted R square

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n = number of test samples and p = number of features.
n_samples, n_features = len(y_test), x_test.shape[1]
adjusted_elasticnet_r2 = 1 - (1 - elasticnet_r2) * (n_samples - 1) / (n_samples - n_features - 1)
adjusted_elasticnet_r2

# Support Vector Regressor

from sklearn.svm import SVR

In [None]:
# Support vector regressor, constructed with no arguments (library defaults).
svr = SVR()

In [None]:
# Fit the support vector regressor on the training split.
svr.fit(X=x_train, y=y_train)

In [None]:
# SVR predictions for the held-out test features.
svr_pred = svr.predict(X=x_test)

R square

In [None]:
from sklearn.metrics import r2_score
# R^2 of the support vector regressor on the test split.
# Two fixes versus the previous call:
#   1. score the SVR predictions (svr_pred), not the lasso ones — the old code
#      silently reported lasso's score under the SVR name;
#   2. pass the ground truth first: the signature is r2_score(y_true, y_pred)
#      and the metric is not symmetric in its arguments.
svr_r2 = r2_score(y_true=y_test, y_pred=svr_pred)
svr_r2

Adjusted R square

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n = number of test samples and p = number of features.
# The previous version referenced an undefined name `score`, which raises
# NameError on a fresh kernel; the SVR R^2 computed above is `svr_r2`.
adjusted_svr_r2 = 1 - (1-svr_r2)*(len(y_test)-1)/(len(y_test)-x_test.shape[1]-1)
adjusted_svr_r2

Applying hyperparameter tuning

In [None]:
# Search space for the grid search: the SVR kernel is the only tuned hyperparameter.
params = dict(kernel=['linear', 'poly', 'sigmoid', 'rbf'])

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over `params` for the SVR, using 10-fold cross-validation
# and n_jobs=-1 for parallel execution.
grid = GridSearchCV(
    estimator=svr,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated grid search on the training split.
grid.fit(X=x_train, y=y_train)

In [None]:
# Show the parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Show the `best_score_` attribute of the fitted grid search.
grid.best_score_

In [None]:
# Collect the test-set metrics of every model into one column-oriented mapping
# (one list per column, rows aligned by position).
results = dict(
    models=['linear', 'ridge', 'lasso', 'elasticnet', 'svr'],
    r_square=[linear_r2, ridge_r2, lasso_r2, elasticnet_r2, svr_r2],
    adjusted_r_square=[
        adjusted_linear_r2,
        adjusted_ridge_r2,
        adjusted_lasso_r2,
        adjusted_elasticnet_r2,
        adjusted_svr_r2,
    ],
)

In [None]:
# Tabulate the collected metrics: one row per model, one column per metric.
df_score = pd.DataFrame(data=results)

In [None]:
# Display the model comparison table.
df_score

# Checking the assumptions of linear regression

1. linear relation between input and output feature.

2. No multicollinearity.

3. residuals should be Normally distributed.

4. Homoscedasticity - spread of residual in graph should be equal.

5. No autocorrelation of errors - no autocorrelation in plot of residuals.

1. linear relation between input and output feature

In [None]:
# Scatter every training feature against the training target to eyeball
# whether the relationship looks linear.
# The loop bound now comes from x_train (the data actually plotted) rather
# than x_test, so the cell no longer depends on the two splits having the
# same number of columns.
n_features = x_train.shape[1]
n_cols = 3
# Keep the original 3x3 layout for up to 9 features, but add rows beyond that
# so plt.subplot is never asked for a position outside the grid.
n_rows = max(3, -(-n_features // n_cols))  # ceiling division
plt.figure(figsize=(15,15))
for i in range(n_features):
    plt.subplot(n_rows, n_cols, i+1)
    plt.xlabel('x_train')
    plt.ylabel('y_train')
    # x_train.T[i] selects the i-th feature column of the training matrix
    plt.scatter(x_train.T[i],y_train)

2. No multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of each training feature, in column order.
vif = [
    variance_inflation_factor(x_train, col_idx)
    for col_idx in range(x_train.shape[1])
]

In [None]:
# Display the per-feature variance inflation factors.
vif

3. residuals should be Normally distributed.

In [None]:
# Residuals of the linear model on the test split: actual minus predicted.
residuals = y_test-reg_pred

In [None]:
# Kernel density estimate of the residuals, used to judge normality by eye.
sns.displot(data=residuals, kind='kde')

4. Homoscedasticity - spread of residuals should be equal on the x axis

In [None]:
# Residuals against predicted values: x = predictions, y = residuals.
plt.scatter(x=reg_pred, y=residuals)

5. No autocorrelation of errors.

In [None]:
# Line plot of the residuals in sequence, to look for autocorrelation patterns.
# NOTE(review): if `residuals` is a pandas Series, matplotlib presumably uses its
# index as the x-axis; confirm the index order is the intended observation order.
plt.plot(residuals)