In [1]:
import pandas as pd 
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import VotingRegressor, StackingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR


from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

1. Get - Clean - Impute data

In [21]:
data = sns.load_dataset(name="mpg")

In [22]:
data.drop("name", axis=1, inplace=True)

In [23]:
data = pd.get_dummies(data, dtype=int)

In [24]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
0,18.0,8,307.0,130.0,3504,12.0,70,0,0,1
1,15.0,8,350.0,165.0,3693,11.5,70,0,0,1
2,18.0,8,318.0,150.0,3436,11.0,70,0,0,1
3,16.0,8,304.0,150.0,3433,12.0,70,0,0,1
4,17.0,8,302.0,140.0,3449,10.5,70,0,0,1
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,0,0,1
394,44.0,4,97.0,52.0,2130,24.6,82,1,0,0
395,32.0,4,135.0,84.0,2295,11.6,82,0,0,1
396,28.0,4,120.0,79.0,2625,18.6,82,0,0,1


In [30]:
pd.DataFrame(data.isnull().sum().sort_values(ascending=False))

Unnamed: 0,0
horsepower,6
mpg,0
cylinders,0
displacement,0
weight,0
acceleration,0
model_year,0
origin_europe,0
origin_japan,0
origin_usa,0


In [33]:
data["horsepower"]=data["horsepower"].fillna(data["horsepower"].mean())

In [36]:
X = data.drop("mpg", axis=1)

In [38]:
y = data["mpg"]

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

2. Train Models: lr, rfr, ridge, gbr, sr, vr, sr - vr as final

In [41]:
# linear reg
lr = LinearRegression()

In [44]:
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("R2 score: ",r2_score(y_test, y_pred))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred))

R2 score:  0.8362207232996565
Mean Squeared Error:  9.273755618336786


In [45]:
# rfr
rfr = RandomForestRegressor(n_estimators=50, criterion="squared_error")

In [47]:
rfr.fit(X_train, y_train)

y_pred2 = rfr.predict(X_test)

print("R2 score: ",r2_score(y_test, y_pred2))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred2))

R2 score:  0.9162758372262575
Mean Squeared Error:  4.740755000000007


In [48]:
# ridge with param tunning
ridge = Ridge()

In [49]:
param_grid_ridge = {
    "alpha": [0.1,0.2, 0.5, 0.7, 0.9]
}

In [52]:
CV_ridge = GridSearchCV(ridge, param_grid=param_grid_ridge, cv=3, n_jobs=-1, scoring="neg_mean_absolute_error")

In [53]:
CV_ridge.fit(X_train, y_train)

In [59]:
print("best estimator", CV_ridge.best_estimator_)
y_pred3 = CV_ridge.best_estimator_.predict(X_test)

print("R2 score: ",r2_score(y_test, y_pred3))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred3))

best estimator Ridge(alpha=0.9)
R2 score:  0.8364057084312276
Mean Squeared Error:  9.263281113028302


In [60]:
# gradient boositng regressor
gbr = GradientBoostingRegressor()

In [61]:
gbr.fit(X_train, y_train)

y_pred4 = gbr.predict(X_test)

print("R2 score: ",r2_score(y_test, y_pred4))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred4))

R2 score:  0.9166059183731692
Mean Squeared Error:  4.722064650693602


In [64]:
#  stacking regressor
estimators = [
    ("lr", lr),
    ("rfr", rfr),
    ("ridge", CV_ridge.best_estimator_),  
]

In [65]:
sr = StackingRegressor(estimators, final_estimator=gbr, cv=3, n_jobs=-1)

In [66]:
sr.fit(X_train, y_train)

In [67]:
y_pred5 = sr.predict(X_test)

print("R2 score: ",r2_score(y_test, y_pred5))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred5))

R2 score:  0.8707600427421114
Mean Squeared Error:  7.318018517854623


In [71]:
#  voting regressor
estimators_v = [
    ("gbr", gbr),
    ("rfr", rfr),
    ("ridge", CV_ridge.best_estimator_),  
]

In [74]:
# voting regressor
vr = VotingRegressor(estimators=estimators_v, weights=(3,2,1), n_jobs=-1)

In [75]:
vr.fit(X_train, y_train)

y_pred6 = vr.predict(X_test)

print("R2 score: ", r2_score(y_test, y_pred6))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred6))

R2 score:  0.9168167525497856
Mean Squeared Error:  4.710126482023393


In [86]:
#  stacking regressor 2
estimators_2 = [
    ("gbr", gbr),
    ("rfr", rfr),
    ("ridge", CV_ridge.best_estimator_),  
]

In [87]:
# stack with voting as final
sr2 = StackingRegressor(estimators_2, final_estimator=vr, cv=3, n_jobs=-1)

In [88]:
sr2.fit(X_train, y_train)

In [89]:
y_pred7 = sr2.predict(X_test)

print("R2 score: ", r2_score(y_test, y_pred7))
print("Mean Squeared Error: ", mean_squared_error(y_test,y_pred7))

R2 score:  0.900265729226093
Mean Squeared Error:  5.647303325331521


In [90]:
#  param tunning with svr
svr = SVR(C=1.0, kernel='linear')

In [91]:
svr.fit(X_train, y_train)

y_pred8 = svr.predict(X_test)

print("R2 score: ", r2_score(y_test, y_pred8))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred8))

R2 score:  0.7332924509933078
Mean Squeared Error:  15.101914484449896


In [93]:
#  stacking regressor 2
estimators_3 = [
    ("gbr", gbr),
    ("rfr", rfr),
    ("lr", lr),
    ("svr", svr)  
]

In [95]:
sr3 = StackingRegressor(estimators_3, final_estimator=Ridge())


In [92]:
param_grid_sr = {
    "rfr__n_estimators": [10,20,40, 50, 80,100, 110],
    "svr__C": [0.1, 1.0, 10.0],
    "final_estimator__alpha": [0.01, 0.1, 0.2, 0.5,0.9] 
}

In [96]:
RCV_sr = RandomizedSearchCV(sr3, param_grid_sr, cv=5, n_iter=3, n_jobs=-1)

In [97]:
RCV_sr.fit(X_train , y_train)
print("best estimator", RCV_sr.best_estimator_)
y_pred9 = RCV_sr.best_estimator_.predict(X_test)

print("R2 score: ",r2_score(y_test, y_pred9))
print("Mean Squeared Error: ",mean_squared_error(y_test,y_pred9))

best estimator StackingRegressor(estimators=[('gbr', GradientBoostingRegressor()),
                              ('rfr', RandomForestRegressor(n_estimators=40)),
                              ('lr', LinearRegression()),
                              ('svr', SVR(kernel='linear'))],
                  final_estimator=Ridge(alpha=0.9))
R2 score:  0.9131278814523293
Mean Squeared Error:  4.919003268846348
