In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the car-sales dataset (this copy still contains missing values).
car_sales = pd.read_csv("car-sales-extended-missing-data.csv")

In [4]:
# Count the missing entries in each column.
car_sales.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

## Steps
### Doing the previous work using Pipeline
1. Fill missing data
2. Convert non-numerical data into numerical data
3. Build a machine learning model on it

In [6]:
# Preprocessing building blocks
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Estimator and model-selection helpers
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global RNG up front; no explicit random_state is passed to the
# split or to the forest further down.
np.random.seed(42)

# Re-read the raw CSV and discard every row that has no label (Price).
car_sales = pd.read_csv("car-sales-extended-missing-data.csv").dropna(subset=["Price"])

# One small pipeline per group of columns. Each pipeline describes how that
# group is modified before it reaches the estimator.

# Make / Colour: fill gaps with the literal "missing", then one-hot encode.
# Step names are part of the nested parameter paths, so they are kept as-is.
categorical_features = ["Make", "Colour"]
categorical_transformers = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("One Hot Encodeer", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Doors: fill gaps with the constant 4.
door_features = ["Doors"]
door_transformers = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
    ]
)

# Odometer (KM): fill gaps with the column mean.
numeric_features = ["Odometer (KM)"]
numeric_transformers = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_transformers, categorical_features),
        ("door", door_transformers, door_features),
        ("num", numeric_transformers, numeric_features),
    ]
)

# Full workflow: preprocessing followed by a random-forest regressor.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)

# Separate the label (Price) from the features.
y = car_sales["Price"]
X = car_sales.drop(columns="Price")

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training split, then show the score on the held-out split.
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

 ### It is also possible to use GridSearchCV and RandomizedSearchCV with Pipeline

In [7]:
# Hyperparameter search over the whole pipeline with GridSearchCV.

# Hyperparameter grid. Each key is a path into the pipeline whose parts are
# joined by double underscores, e.g. "preprocessor" -> "num" -> "imputer" ->
# "strategy". 2 * 2 * 2 * 1 * 2 = 16 combinations in total.
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    "model__max_features": ["sqrt"],
    "model__min_samples_split": [2, 4],
}

# 5-fold cross-validation for every combination; verbose=2 logs each fit.
gs_model = GridSearchCV(
    estimator=model,
    param_grid=pipe_grid,
    cv=5,
    verbose=2,
)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_sampl