In [122]:
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [2]:
mpg = sns.load_dataset('mpg')

## Explore the DataSet

In [3]:
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [4]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


## Data Cleaning

### Drop Unnecessary columns

In [5]:
#drop names column
mpg = mpg.drop('name', axis =1)

### Encode Binary Numbers to Categorical Column

In [7]:
mpg = pd.get_dummies(mpg)

In [8]:
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
0,18.0,8,307.0,130.0,3504,12.0,70,0,0,1
1,15.0,8,350.0,165.0,3693,11.5,70,0,0,1
2,18.0,8,318.0,150.0,3436,11.0,70,0,0,1
3,16.0,8,304.0,150.0,3433,12.0,70,0,0,1
4,17.0,8,302.0,140.0,3449,10.5,70,0,0,1


### Check for NaN Values

In [11]:
mpg.isnull().sum()

mpg              0
cylinders        0
displacement     0
horsepower       6
weight           0
acceleration     0
model_year       0
origin_europe    0
origin_japan     0
origin_usa       0
dtype: int64

#### There are 6 NaN values under the horsepower column

### Filling out Null Values With the Overall Mean Value of Horsepower

In [15]:
mpg['horsepower'].mean()

104.46938775510205

In [16]:
## Replace the Null values with 104.47
mpg['horsepower'].fillna(mpg['horsepower'].mean(), inplace= True)

## Modeling

### Split the Data Between X & Y.
#### X = Features
#### Y = Targets

In [17]:
X = mpg.drop('mpg', axis = 1)

In [18]:
y = mpg['mpg']

### Train Test Split 80/20

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 0.2, random_state = 42)

### Linear Regression 

In [25]:
lr = LinearRegression()

In [27]:
lr.fit(X_train, y_train)

LinearRegression()

In [28]:
y_pred = lr.predict(X_test)

In [34]:
mean_squared_error(y_test, y_pred)

8.339142500255903

In [36]:
r2_score(y_test, y_pred)

0.8449006123776617

### Random Forest model

In [41]:
rfr = RandomForestRegressor(random_state = 42)

In [42]:
rfr.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [43]:
y_pred2 =rfr.predict(X_test)

In [44]:
mean_squared_error(y_test, y_pred2)

4.5866399625

In [45]:
r2_score(y_test, y_pred2)

0.9146932613987512

### Ridge Regression

In [47]:
ridge = Ridge()

In [50]:
param_grid_ridge = {
    'alpha' : [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 10, 15, 30, 50, 75],
}

In [116]:
ridge_cv =GridSearchCV(ridge, param_grid_ridge, cv=5, n_jobs = -1, scoring='neg_mean_squared_error')

In [117]:
%%time
ridge_cv.fit(X_train, y_train)

Wall time: 2.12 s


GridSearchCV(cv=5, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 10, 15, 30,
                                   50, 75]},
             scoring='neg_mean_squared_error')

In [54]:
### The alpha value that worked the best
ridge_cv.best_estimator_

Ridge(alpha=10)

In [55]:
y_pred3 =ridge_cv.predict(X_test)

In [56]:
mean_squared_error(y_test, y_pred3)

8.366781015266866

In [57]:
r2_score(y_test, y_pred3)

0.8443865647099477

### Gradient Boosting Regression

In [60]:
gbr = GradientBoostingRegressor()

In [61]:
gbr.fit(X_train, y_train)

GradientBoostingRegressor()

In [62]:
y_pred4 = gbr.predict(X_test)

In [63]:
mean_squared_error(y_test, y_pred4)

5.600520791306055

In [64]:
r2_score(y_test, y_pred4)

0.895836131224393

### Stacking Regressing Model

In [85]:
estimators = [
    ('lr', lr),
    ('ridge',ridge_cv.best_estimator_),
    ('gbr',gbr),
]

In [68]:
sr = StackingRegressor(
    estimators = estimators,
    final_estimator = rfr
)

In [70]:
sr.fit(X_train,y_train)

StackingRegressor(estimators=[('lr', LinearRegression()),
                              ('ridge', Ridge(alpha=10)),
                              ('gbr', GradientBoostingRegressor())],
                  final_estimator=RandomForestRegressor(random_state=42))

In [71]:
y_pred5 = sr.predict(X_test)

In [72]:
mean_squared_error(y_test, y_pred5)

6.3753337750000005

In [73]:
r2_score(y_test, y_pred5)

0.8814254146202481

### Comparing the best performers

1. Mean Squared Error (MSE):

Lower MSE is Better: For the MSE, a lower value indicates a better model fit. It measures the average squared difference between the predicted values and the actual values. Smaller MSE means the model's predictions are closer to the actual data points.

2. R-squared (R^2) Score:

Higher R^2 is Better: R-squared is a measure of how well the independent variables explain the variation in the dependent variable. It ranges from 0 to 1, and a higher R^2 score indicates a better model fit. An R^2 score of 1 means that the model explains all the variability in the data, while an R^2 score of 0 means that the model does not explain any of the variability.

In summary:

For MSE, lower values are better.

For R^2, higher values (closer to 1) are better.



In [86]:
data = {
    'Model': ['Linear Regressor', 'Random Forest', 'Ridge Regressor','Gradient Boosting Regression',],
    'MSE': [mean_squared_error(y_test, y_pred), mean_squared_error(y_test, y_pred2), mean_squared_error(y_test, y_pred3),mean_squared_error(y_test, y_pred4)],
    'R-squared': [r2_score(y_test, y_pred), r2_score(y_test, y_pred2), r2_score(y_test, y_pred3),r2_score(y_test, y_pred4)]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Sort the DataFrame by 'MSE' column in descending order
sorted_df = df.sort_values(by='MSE', ascending=True)

print(sorted_df)

                          Model       MSE  R-squared
1                 Random Forest  4.586640   0.914693
3  Gradient Boosting Regression  5.600521   0.895836
0              Linear Regressor  8.339143   0.844901
2               Ridge Regressor  8.366781   0.844387


Random Forest, Gradient Boosting Regression and Linear Regressor and performed the best in the order of lowest MSE

### Voting Regressor

In [91]:
### Weighting the estomators by the level of best performers
vr = VotingRegressor([
    ('lr', lr),
    ('rfr', rfr),
    ('gbr',gbr),
], weights=[2,3,1])

In [92]:
vr.fit(X_train,y_train)

VotingRegressor(estimators=[('lr', LinearRegression()),
                            ('rfr', RandomForestRegressor(random_state=42)),
                            ('gbr', GradientBoostingRegressor())],
                weights=[2, 3, 1])

In [93]:
y_pred6 = vr.predict(X_test)

In [94]:
mean_squared_error(y_test, y_pred6)

4.9625939733020425

In [95]:
r2_score(y_test, y_pred6)

0.9077009073470284

### Stacking and voting regressor combined

In [96]:
sr2 = StackingRegressor(
    estimators = estimators,
    final_estimator = vr
)

In [97]:
sr2.fit(X_train,y_train)

StackingRegressor(estimators=[('lr', LinearRegression()),
                              ('ridge', Ridge(alpha=10)),
                              ('gbr', GradientBoostingRegressor())],
                  final_estimator=VotingRegressor(estimators=[('lr',
                                                               LinearRegression()),
                                                              ('rfr',
                                                               RandomForestRegressor(random_state=42)),
                                                              ('gbr',
                                                               GradientBoostingRegressor())],
                                                  weights=[2, 3, 1]))

In [98]:
y_pred7 = sr2.predict(X_test)

In [99]:
mean_squared_error(y_test, y_pred7)

5.627069675885674

In [100]:
r2_score(y_test, y_pred7)

0.8953423495507702

In [102]:
### comparing all the models 
data = {
    'Model': ['Linear Regressor', 'Random Forest', 'Ridge Regressor','Gradient Boosting Regression','StackingRegressor','VotingRegressor','StackingRegressor'],
    'MSE': [mean_squared_error(y_test, y_pred), mean_squared_error(y_test, y_pred2), mean_squared_error(y_test, y_pred3),mean_squared_error(y_test, y_pred4),mean_squared_error(y_test, y_pred5),mean_squared_error(y_test, y_pred6),mean_squared_error(y_test, y_pred7)],
    'R-squared': [r2_score(y_test, y_pred), r2_score(y_test, y_pred2), r2_score(y_test, y_pred3),r2_score(y_test, y_pred4),r2_score(y_test, y_pred5),r2_score(y_test, y_pred6),r2_score(y_test, y_pred7)]
}

# Create a DataFrame
df2 = pd.DataFrame(data)

# Sort the DataFrame by 'MSE' column in descending order
sorted_df2 = df2.sort_values(by='MSE', ascending=True)

print(sorted_df2)

                          Model       MSE  R-squared
1                 Random Forest  4.586640   0.914693
5               VotingRegressor  4.962594   0.907701
3  Gradient Boosting Regression  5.600521   0.895836
6             StackingRegressor  5.627070   0.895342
4             StackingRegressor  6.375334   0.881425
0              Linear Regressor  8.339143   0.844901
2               Ridge Regressor  8.366781   0.844387


### Hyperparameter Tuning

In [104]:
estimators2 = [
    ('ridge',ridge_cv.best_estimator_),
    ('gbr',gbr),
    ('svr',SVR(C =1.0, kernel= 'linear')),
    ('random_forest',RandomForestRegressor())
]

In [105]:
sr3 = StackingRegressor(
    estimators = estimators2,
    final_estimator = Ridge(alpha = 1.0)
)

In [106]:
param_grid_sr= {
    'random_forest__n_estimators':[50,100,250],
    'svr__C': [0.1,1.0,10.0],
    'final_estimator__alpha': [0.1,1.0,10.0]
    
}

In [123]:
sr_cv = RandomizedSearchCV(sr3, param_grid_sr, n_iter = 5, cv=3, scoring='neg_mean_squared_error' )

In [124]:
%%time
sr_cv.fit(X_train, y_train)

Wall time: 7min 16s


RandomizedSearchCV(cv=3,
                   estimator=StackingRegressor(estimators=[('ridge',
                                                            Ridge(alpha=10)),
                                                           ('gbr',
                                                            GradientBoostingRegressor()),
                                                           ('svr',
                                                            SVR(kernel='linear')),
                                                           ('random_forest',
                                                            RandomForestRegressor())],
                                               final_estimator=Ridge()),
                   n_iter=5,
                   param_distributions={'final_estimator__alpha': [0.1, 1.0,
                                                                   10.0],
                                        'random_forest__n_estimators': [50, 100,
                       

In [125]:
y_pred8 = sr_cv.predict(X_test)

In [126]:
mean_squared_error(y_test, y_pred8)

5.234766061102006

In [127]:
r2_score(y_test, y_pred8)

0.9026387892522275

In [129]:
sr_cv.best_params_

{'svr__C': 0.1,
 'random_forest__n_estimators': 250,
 'final_estimator__alpha': 0.1}