# 

# Putting it all together

In [None]:
# Basic imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk

In [8]:
# Getting Ready
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Creating a random seed
np.random.seed(42)

# Import data and drop the rows with the missing labels
data = pd.read_csv('data/car-sales-extended-missing-data.csv')
data.dropna(subset=["Price"], inplace=True)

# Define Features & Transformer pipeline
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='constant', fill_value=4))])

numerical_feature = ["Odometer (KM)"]
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='mean'))])

# Setup preprocessing steps (fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(
                transformers=[
                    ("cat", categorical_transformer, categorical_features),
                    ("door", categorical_transformer, categorical_features),
                    ("num", categorical_transformer, categorical_features)])

# Create a preprocessing and modelling pipeline
model = Pipeline(steps=[('preprocessor', preprocessor), 
                         ('model', RandomForestRegressor())])

# Split data
X = data.drop('Price', axis=1)
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit and Score model
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.12064933209558326