In [1]:
import pandas as pd   
import matplotlib.pyplot as plt
import time

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline

from numpy import mean

import warnings 
warnings.filterwarnings('ignore')



IMPORTING DATA

In [2]:
# Machine-specific location of the ToyotaCorolla workbook. Kept in one named
# constant so the notebook can be pointed at another copy by editing one line.
DATA_PATH = r'C:\Users\ayesha.amjad\OneDrive - Astera Software\Data Science for Starters\Resource Folder - Data Science for Starters\Datasets for HandsOn\ToyotaCorolla.xls'
# Fixed seed so the 70/30 hold-out split is identical on every Restart & Run All.
SPLIT_SEED = 42

df2 = pd.read_excel(DATA_PATH)
# Keep every column except 'Id'.
df2 = df2.loc[:, df2.columns != 'Id']

# Working frame: the target ('Price') plus the predictors used in this notebook.
df = df2[['Price', 'Age_08_04', 'KM', 'Quarterly_Tax', 'Mfr_Guarantee', 'BOVAG_Guarantee',
          'ABS', 'Airco', 'Metallic_Rim', 'Fuel_Type', 'Color']]

# Predictors are everything except the target column.
X2 = df.loc[:, df.columns != 'Price']
# The target is deliberately kept as a one-column DataFrame (double brackets),
# exactly as the rest of the notebook receives it.
y = df[['Price']]
# Expand non-numeric predictors into indicator columns
# (presumably Fuel_Type and Color - confirm against the workbook).
X = pd.get_dummies(X2)

# 30% of the rows are held out; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

FIT & COMPARE MULTIPLE MODELS AT ONCE

In [3]:
def fitting_models_CV2(random_state=None):
    """Cross-validate several regressors on the notebook-level X / y and print a comparison.

    Each model is scored with ``cross_val_score`` (estimator's default scorer)
    over a 10-fold, single-repeat ``RepeatedKFold``. One line is printed per
    model in the form ``<name> : <mean score> - <elapsed seconds>``, both
    numbers formatted to four decimals.

    Parameters
    ----------
    random_state : int or None, default None
        Forwarded to ``RepeatedKFold``. ``None`` keeps the original behaviour
        (folds are reshuffled each time they are generated). Pass an int to
        make the folds reproducible and shared by all models.

    Returns
    -------
    None
        Results are only printed.
    """
    cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=random_state)

    # (label, estimator) pairs, evaluated and printed in this order.
    models = [
        ('Linear Regression', LinearRegression()),
        ('Lasso Regression', Lasso(alpha=10.5)),
        # loss='huber' can be passed to GradientBoostingRegressor as an alternative loss.
        ('Gradient Boosting', GradientBoostingRegressor()),
        ('KNearest Neighbor', KNeighborsRegressor(n_neighbors=5)),
        # Same KNN, but with features rescaled by MinMaxScaler inside the pipeline.
        ('Scaled KNearest', Pipeline([("scaler", MinMaxScaler()),
                                      ("knr", KNeighborsRegressor(n_neighbors=5))])),
    ]

    for name, model in models:
        # Time only the cross-validation call itself.
        start = time.perf_counter()
        scores = cross_val_score(model, X, y, cv=cv)
        end = time.perf_counter()
        print("{} : {:.4f} - {:.4f}".format(name, mean(scores), end - start))

In [4]:
# Run the comparison; prints one "name : mean CV score - seconds" line per model.
fitting_models_CV2()

Linear Regression : 0.8224 - 0.0652
Lasso Regression : 0.8236 - 0.0526
Gradient Boosting : 0.8946 - 1.1096
KNearest Neighbor : 0.4205 - 0.0688
Scaled KNearest : 0.6998 - 0.0890


In [5]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor

BAGGING ALGORITHM

In [6]:
# Splitter shared with the voting and stacking cells, which reuse `cv`.
# Left unseeded as in the original; add random_state=<int> for reproducible folds.
cv = RepeatedKFold(n_splits=10, n_repeats=1)

# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot is the base estimator in both.
reg_bg = BaggingRegressor(Ridge(alpha=1), n_estimators=20, random_state=0)

scores = cross_val_score(reg_bg, X, y, cv=cv)
# Mean score across the 10 folds, as a 4-decimal string.
score = format(mean(scores), '.4f')
print(score)

0.8246


VOTING ALGORITHM

In [7]:
# Three base learners from different model families for the voting ensemble.
r1 = LinearRegression()
r2 = GradientBoostingRegressor()
# KNN is wrapped in a pipeline so its inputs are min-max scaled first.
r3 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("knr", KNeighborsRegressor(n_neighbors=5)),
])

# Combine the learners and score the ensemble with the `cv` splitter
# created in the bagging cell.
reg_vr = VotingRegressor([
    ('lr', r1),
    ('gb', r2),
    ('knn', r3),
])
scores = cross_val_score(reg_vr, X, y, cv=cv)
score = "{:.4f}".format(mean(scores))
print(score)

0.8668


STACKING ALGORITHM

In [12]:
# Level-0 learners: linear model, gradient boosting, and a min-max-scaled KNN.
estimators = [
    ('lr', LinearRegression()),
    ('gb', GradientBoostingRegressor()),
    ('knn', Pipeline(steps=[
        ("scaler", MinMaxScaler()),
        ("knr", KNeighborsRegressor(n_neighbors=5)),
    ])),
]

# A seeded random forest is used as the final estimator on top of the base learners.
reg_sr = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=100, random_state=42),
)
# Scored with the `cv` splitter created in the bagging cell.
scores = cross_val_score(reg_sr, X, y, cv=cv)
score = "{:.4f}".format(mean(scores))
print(score)

0.5853
