## Putting it all together 

In [53]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so repeated runs of this cell draw the same random numbers.
np.random.seed(42)

# Load the raw table and drop rows with no Price: a row without a target value
# cannot be used for supervised training. Chaining avoids mutating a frame in place.
data = pd.read_csv("car-sales-extended-missing-data.csv").dropna(subset=["Price"])


# Categorical columns: label missing entries explicitly, then one-hot encode.
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[  # steps is a list of (name, transformer) tuples
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    # handle_unknown="error" (the default) raises when it meets a category that was
    # not seen during fit; "ignore" encodes such a value as all zeros instead.
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Door count is a discrete feature, so fill gaps with the most common value.
# The fill value is learned by the imputer during fit (i.e. from the rows the
# pipeline is trained on) instead of being precomputed from the whole dataset,
# which would leak information from the held-out rows into preprocessing.
door_features = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

# Continuous numeric column: fill gaps with the mean learned during fit.
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

# Preprocessing: fill missing values, then convert every column to numbers.
# Each transformer is applied only to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_features),
        ("num", numeric_transformer, numeric_features),
    ],
    n_jobs=-1,  # run the per-column transformers in parallel on all cores
)

# Full estimator: preprocessing followed by the regressor, so both are fitted
# together and applied consistently to any data passed to fit/predict/score.
model = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("model", RandomForestRegressor()),
])

# Separate the predictors from the target column.
x = data.drop(columns=["Price"])
y = data["Price"]

# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit the whole pipeline on the training rows, then report its score on the held-out rows.
model.fit(x_train, y_train)
model.score(x_test, y_test)


 


0.22188417408787875

In [None]:
# Use GridSearchCV with our regression Pipeline


In [43]:
# Average door count over the rows that have a value, rounded to the nearest integer.
round(data["Doors"].mean())
      

4

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [19]:
# Show the dtype of every column to see which ones are text and which are numeric.
data.dtypes


Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [33]:
# Count the missing values in each column.
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64