### 1.2 Dealing with missing values

There are two possible solutions:
1. Fill them with some value (also known as imputation).
2. Remove the samples with missing data.

In [19]:
import pandas as pd

In [34]:
# Load the car-sales dataset that contains missing values and preview its first rows
car_sales_missing = pd.read_csv(filepath_or_buffer='dataset/car-sales-extended-missing-data.csv')
car_sales_missing.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [35]:
# Count the missing (NaN) entries in every column
car_sales_missing.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

### Handling missing values with pandas

In [16]:

# Fill missing feature values in one pass.
# `df[col].fillna(..., inplace=True)` acts on a temporary column object: it is
# deprecated chained assignment and stops updating the frame under pandas
# Copy-on-Write, so a column -> fill-value mapping is passed to DataFrame.fillna.
feature_fill_values = {
    'Make': 'missing',
    'Colour': 'missing',
    'Doors': 4,  # constant used for a missing door count
    'Odometer (KM)': car_sales_missing['Odometer (KM)'].mean(),  # mean of the observed readings
}
car_sales_missing = car_sales_missing.fillna(feature_fill_values)

# Every feature column is filled now, so the rows dropped here are the ones
# whose label (Price) is missing
car_sales_missing = car_sales_missing.dropna()

In [17]:
# The label is Price; the features are every other column
y = car_sales_missing['Price']
X = car_sales_missing.drop(columns=['Price'])

### Handling missing values with scikit-learn


In [36]:
# Start this section from a fresh copy of the raw data: the pandas section above
# has already filled/dropped the missing values in `car_sales_missing`, which
# would leave nothing for the sklearn imputers to do when the notebook is run
# from top to bottom.
car_sales_missing = pd.read_csv('dataset/car-sales-extended-missing-data.csv')

# Rows without a label (Price) cannot be used for supervised training, so drop them first
car_sales_missing = car_sales_missing.dropna(subset=["Price"])


In [37]:
# Separate the target (y) from the features (X) and keep the feature names
y = car_sales_missing['Price']
X = car_sales_missing.drop(labels='Price', axis='columns')
columns = X.columns

In [38]:
# Fill missing values with sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Feature groups, one per imputation strategy
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Categorical gaps become the literal "missing", door gaps become 4,
# numerical gaps become the column mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Create an imputer (something that fills in missing data), one strategy per group
imputer = ColumnTransformer([('cat_imputer', cat_imputer, cat_features),
                             ('num_imputer', num_imputer, num_features),
                             ('door_imputer', door_imputer, door_feature)],
                           remainder="passthrough")

X_array = imputer.fit_transform(X)

# ColumnTransformer emits its columns in transformer order (cat, num, door) and
# appends passthrough columns afterwards, which is not necessarily the order of
# X.columns. Build the labels from that order so they cannot be attached to the
# wrong data, and keep the original index so X stays aligned with y.
imputed_columns = cat_features + num_features + door_feature
imputed_columns += [col for col in X.columns if col not in imputed_columns]
X = pd.DataFrame(X_array, columns=imputed_columns, index=X.index)

## Run and Test The Model

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the split, the forest, and therefore the score are reproducible
RANDOM_SEED = 42

# Columns that are one-hot encoded; Doors is encoded alongside the text columns
cat_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, cat_features)], remainder='passthrough')

# NOTE(review): X is rebound to the encoded matrix, so this cell cannot be
# re-run without first re-running the cell that builds the X DataFrame.
# NOTE(review): the encoder (like the imputer earlier) is fitted on the full
# dataset before the split; fitting on the training split only would avoid
# leaking test-set information.
X = transformer.fit_transform(X)

# Split into training and test set (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED)

# Choose the model
model = RandomForestRegressor(random_state=RANDOM_SEED)

# Train the model
model.fit(X_train, y_train)

# Evaluate on the held-out set; for a regressor score() returns R^2, not accuracy
model.score(X_test, y_test)


0.273010595559243