In [1]:
import pandas as pd

In [2]:
# Load the car-sales CSV (the variant that contains missing values) and
# preview the first five rows.
data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
# Check the dtype pandas inferred for each column.
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
# Count the missing values in each column.
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

# Steps we want to do (all in one cell):
1. Fill missing data
2. Convert data to numbers
3. Build a model on the data

In [8]:
# --- Getting the data ready ---
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# --- Modelling ---
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

# Seed NumPy's global RNG. The split and the forest below are created without
# an explicit random_state, so they draw from this global state.
np.random.seed(42)

# Load the data and discard rows whose label (Price) is missing.
data = pd.read_csv("data/car-sales-extended-missing-data.csv").dropna(subset=["Price"])

# Categorical columns: fill gaps with the placeholder 'missing', then one-hot encode.
catagorical_features = ["Make", "Colour"]
catagorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Door count: fill gaps with the constant 4.
door_features = ["Doors"]
door_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value=4))]
)

# Odometer reading: fill gaps with the column mean.
numeric_featues = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# Preprocessing: route each column group through its own transformer so that
# missing values are filled and text columns are converted to numbers.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", catagorical_transformer, catagorical_features),
        ("door", door_transformer, door_features),
        ("num", numeric_transformer, numeric_featues),
    ]
)

# Full pipeline: preprocessing followed by a random-forest regressor.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)

# Separate features from the label, then hold out 20% of the rows for testing.
x = data.drop(columns="Price")
y = data["Price"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit the whole pipeline on the training split.
model.fit(x_train, y_train)

In [9]:
# Score the fitted pipeline on the held-out test split
# (for a regressor, scikit-learn's default score is R^2).
model.score(x_test,y_test)

0.22188417408787875

In [10]:
# Tune the regression pipeline with an exhaustive, 5-fold cross-validated grid search.
from sklearn.model_selection import GridSearchCV

# Keys address nested parameters as <step>__<param>: the numeric imputer's
# strategy plus four random-forest hyperparameters are varied.
pipe_grid = dict(
    preprocessor__num__imputer__strategy=["mean", "median"],
    model__n_estimators=[100, 1000],
    model__max_depth=[None, 5],
    model__max_features=["sqrt", "log2", None],
    model__min_samples_split=[2, 4],
)

# verbose=2 logs one line per individual fit.
gs_model = GridSearchCV(estimator=model, param_grid=pipe_grid, cv=5, verbose=2)

gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samp

In [11]:
# Score the grid search's refit best pipeline on the held-out test split.
gs_model.score(x_test,y_test)

0.28660438125948107