In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [2]:
column_names = ['mpg', 'cylinders','displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
auto_mpg_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data', delim_whitespace=True, header=None,names=column_names)
auto_mpg_data['horsepower'].unique()
auto_mpg_data = auto_mpg_data.drop('car_name', axis = 1)
auto_mpg_data = auto_mpg_data.replace('?', -1)
auto_mpg_data.horsepower = auto_mpg_data.horsepower.astype('float')

In [3]:
X = auto_mpg_data.iloc[:, 1:]
y = auto_mpg_data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
numeric_features = ['displacement', 'horsepower', 'weight', 'acceleration']
categorical_features = ['model_year', 'origin']
ordinal_features = ['cylinders']

In [4]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(missing_values = -1, strategy="mean")), ("scaler", StandardScaler())]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
ordinal_transformer = OrdinalEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("ord", ordinal_transformer, ordinal_features),
    ]
)
reg = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", BaggingRegressor())]
)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.7485216848317473


In [5]:
r2 = cross_val_score(estimator = reg, X = X_train, y = y_train, scoring='r2', cv = 10)
X_train_new = preprocessor.fit_transform(X_train)

In [6]:
RF_reg = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", RandomForestRegressor())]
)

RF_reg.fit(X_train, y_train)
y_pred = RF_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.7925406483485963


In [7]:
GB_reg = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", GradientBoostingRegressor())]
)

GB_reg.fit(X_train, y_train)
y_pred = GB_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.8173538432105059


In [8]:
AB_reg = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", AdaBoostRegressor())]
)

AB_reg.fit(X_train, y_train)
y_pred = AB_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.7249188825105819


In [9]:
lr = LinearRegression()
dt = DecisionTreeRegressor()
svm= SVR()
knn= KNeighborsRegressor()
VR_reg = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", VotingRegressor(estimators=[('lr', lr), ('dt', dt), ('svc', svm), ('knn',knn)]))]
)

VR_reg.fit(X_train, y_train)
y_pred = VR_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.8313415103683222
