In [48]:
#! conda install -c conda-forge sklearn-contrib-py-earth

In [49]:
# Helper packages
import numpy as np
import pandas as pd
from plotnine import *

# Modeling packages
from pyearth import Earth
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.inspection import partial_dependence
from sklearn.pipeline import Pipeline


import pickle

In [50]:
# Load the cleaned full dataset and the train / test sets from the EDA directory.
cars, cars_Train, cars_Test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../EDA/cleaned_data.csv",
        "../EDA/train.csv",
        "../EDA/test.csv",
    )
)

In [51]:
# Split the training frame into target and predictors. Every column other than
# "Price" is kept as a predictor (no dtype filtering happens here), and y_train
# stays a one-column DataFrame.
y_train = cars_Train[["Price"]]
X_train = cars_Train.drop(columns="Price")

In [52]:
# Load the pickled target transformer. The context manager closes the file
# even when unpickling raises, which the manual open()/close() pair did not.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project.
with open("../pickles/target_transformer", "rb") as infile:
    targetTrans = pickle.load(infile)

# Display the loaded object as the cell output
targetTrans

TransformedTargetRegressor(transformer=PowerTransformer(method='box-cox'))

In [53]:
# Load the pickled feature preprocessor. The context manager closes the file
# even when unpickling raises, which the manual open()/close() pair did not.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project.
with open("../pickles/features_preprocessor", "rb") as infile:
    preprocesser = pickle.load(infile)

# Display the loaded object as the cell output
preprocesser

ColumnTransformer(remainder='passthrough',
                  transformers=[('nzv_encode', VarianceThreshold(threshold=0.1),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fa908a43b50>),
                                ('norm', PowerTransformer(method='box-cox'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fa908b4a8b0>),
                                ('std', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fa90899a1c0>),
                                ('one-hot', OneHotEncoder(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fa90899a280>),
                                ('rare_encode',
                                 RareLabelEncoder(replace_with='other',
                                                  tol=0.01),
                       

In [60]:
# Base MARS estimator (py-earth) with default settings
earth_mod = Earth()

# Scoring metric name used for model selection (negated RMSE, so larger is better)
loss = "neg_root_mean_squared_error"

# 10-fold cross-validation, shuffled with a fixed seed for reproducibility
kfold = KFold(n_splits=10, shuffle=True, random_state=123)

# Candidate hyperparameter values:
#   max_terms  -> 1, 4, 7, ..., 28
#   max_degree -> 1, 2
hyper_grid = dict(
    max_terms=range(1, 30, 3),
    max_degree=range(1, 3),
)

In [61]:
# Chain the loaded feature preprocessor with the MARS estimator so both are
# fitted together as a single estimator.
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocesser),
    ("earth", earth_mod),
])

# Parameters of a pipeline step are addressed as "<step>__<param>". Route every
# grid entry to the "earth" step; keys that already carry the prefix are kept.
pipeline_grid = {
    (name if name.startswith("earth__") else f"earth__{name}"): values
    for name, values in hyper_grid.items()
}

grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=pipeline_grid,
    cv=kfold,
    scoring=loss,
)

# Fit the grid search itself, not the bare pipeline: only the fitted
# GridSearchCV object exposes best_estimator_ and best_score_.
# NOTE(review): the recorded run failed inside the preprocessor with
# "For a sparse output, all columns should be a numeric or convertible to a
# numeric." Presumably the pickled ColumnTransformer still emits non-numeric
# columns next to sparse one-hot output; that has to be fixed where the
# preprocessor is built. TODO confirm.
results = grid_search.fit(X_train, y_train)

# Best pipeline found by the grid search
results.best_estimator_




ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [None]:
# Cross-validated RMSE of the best model. The scoring metric is negated RMSE,
# so take its magnitude before rounding to two decimals.
best_cv_rmse = abs(results.best_score_)
round(best_cv_rmse, 2)