In [3]:
from __future__ import print_function, division
import numpy as np
import scipy as sp
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [4]:
# Load the raw car-sales table from the local dataset folder.
data = pd.read_csv(filepath_or_buffer='dataset/car-sales-extended-missing-data.csv')

In [7]:
# 'Price' is used as the target (y) further down, so rows without a price are
# discarded. Reassigning instead of using inplace=True avoids mutating the
# loaded frame in place and keeps the step chainable.
# NOTE: the original row labels are kept, so the index has gaps afterwards.
data = data.dropna(subset=['Price'])

In [13]:
# Feature frame: every column of `data` except the 'Price' column.
X = data.drop(columns=['Price'])
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [15]:
# Target series: the 'Price' column of `data`.
y = data.loc[:, 'Price']
y.head()

0    15323.0
1    19943.0
2    28343.0
3    13434.0
4    14043.0
Name: Price, dtype: float64

In [45]:
# Impute missing feature values, using one strategy per column group.

# Column groups handled by each imputer.
cat_featuers = ['Make', 'Colour']
door_feature = ['Doors']
num_features = ['Odometer (KM)']

# Gaps in the categorical columns become the literal label 'missing',
# gaps in 'Doors' become 4, and gaps in the odometer column take the column mean.
cat_imputer = SimpleImputer(fill_value='missing', strategy='constant')
door_imputer = SimpleImputer(fill_value=4, strategy='constant')
num_imputer = SimpleImputer(strategy='mean')

# The transformed array lists its columns in the order of these entries:
# Make, Colour, Doors, Odometer (KM).
imputer = ColumnTransformer(
    transformers=[
        ('cat_imputer', cat_imputer, cat_featuers),
        ('door_imputer', door_imputer, door_feature),
        ('num_imputer', num_imputer, num_features),
    ]
)

filled_X = imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [46]:
# Rebuild a labelled frame from the imputer's array output. The array's columns
# follow the order of the transformers inside `imputer`, so the labels are
# derived from the same feature lists instead of being typed out a second time;
# this keeps names and values in sync if those lists are ever edited.
filled_XX = pd.DataFrame(filled_X, columns=cat_featuers + door_feature + num_features)
filled_XX.dtypes

Make             object
Colour           object
Doors            object
Odometer (KM)    object
dtype: object

In [47]:
# Every column of the array-backed frame is object-typed; convert the two
# numeric columns back to float in a single astype call.
filled_XX = filled_XX.astype({'Doors': 'float', 'Odometer (KM)': 'float'})
filled_XX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 773 entries, 0 to 772
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           773 non-null    object 
 1   Colour         773 non-null    object 
 2   Doors          773 non-null    float64
 3   Odometer (KM)  773 non-null    float64
dtypes: float64(2), object(2)
memory usage: 24.3+ KB


In [42]:
# Print the index range, column dtypes and non-null counts of the feature frame X.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 773 entries, 0 to 999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           773 non-null    object 
 1   Colour         773 non-null    object 
 2   Odometer (KM)  773 non-null    float64
 3   Doors          773 non-null    float64
dtypes: float64(2), object(2)
memory usage: 30.2+ KB


In [48]:
# One-hot encode the categorical columns. ColumnTransformer takes a list of
# (name, transformer, columns) tuples; 'Doors' is numeric but is encoded as a
# category here. The remaining column (the odometer reading) is passed through
# unchanged and ends up after the encoded columns.

categorical_features = ['Make', 'Colour', 'Doors']
one_hot_enc = OneHotEncoder()

transformer = ColumnTransformer(
    transformers=[('one_hot_enc', one_hot_enc, categorical_features)],
    remainder='passthrough',
)

transformed_X = transformer.fit_transform(filled_XX)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [49]:
# Wrap the encoded array in a DataFrame (default integer column labels) so it
# renders as a table.
transformed_X = pd.DataFrame(data=transformed_X)
transformed_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,163322.0
769,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
770,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
771,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0
