In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

np.random.seed(42)

df = pd.read_csv('1_ML_Datas/car-sales-extended-missing-data.csv')
df.dropna(subset=['Price'], inplace=True)
X = df.drop(columns='Price')
y = df.Price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
cat_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = ['Odometer (KM)']
door_features = ['Doors']
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

door_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=4))])

numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='mean'))])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, cat_features), 
    ('door', door_transformer, door_features),
    ('num', numerical_transformer, num_features)])

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', RandomForestRegressor())])

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

In [None]:
pipe_grid = {
    'preprocessor__num__imputer__strategy':['mean', 'median'],
    'model__n_estimators':[100, 400, 500, 900],
    'model__max_depth':[None, 5, 10, 12],
    'model__max_features':['sqrt', 'log2'],
    'model__min_samples_split':[2, 10, 15, 20],
    'model__min_samples_leaf':[2, 10, 12, 20]
}
gs_model = GridSearchCV(model, param_grid=pipe_grid, cv=5)
gs_model.fit(X_train, y_train)