## What if there were missing values?
* Fill them with some value (also known as imputation).
* Remove the samples with missing data altogether.

In [160]:
import pandas as pd
import numpy as np

In [161]:
# Load the car-sales dataset that contains missing values and preview its first two rows.
car_sales = pd.read_csv(filepath_or_buffer='1_ML_Datas/car-sales-extended-missing-data.csv')
car_sales.head(n=2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0


In [162]:
# Number of missing entries in each column.
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [163]:
# Separate the features (every column except the target) from the target column 'Price'.
x = car_sales.drop(columns='Price')
y = car_sales['Price']


In [164]:
# Preview the first two feature rows.
x.head(n=2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0


In [165]:
import warnings
warnings.filterwarnings('ignore')

## Fill missing values with Pandas

In [166]:
# Fill the missing feature values in a single, non-chained call.
# The pattern `car_sales[col].fillna(..., inplace=True)` mutates an intermediate
# Series selected from the frame: pandas 2.x warns about it and, with
# Copy-on-Write enabled (the default from pandas 3.0), it leaves `car_sales`
# unchanged. Passing a column -> value mapping to DataFrame.fillna and
# rebinding the result avoids both problems. 'Price' is intentionally not filled.
car_sales = car_sales.fillna({
    # Make column: explicit placeholder category
    'Make': 'missing',
    # Colour column: one colour is drawn at random and used for every missing row
    'Colour': np.random.choice(['Skyblue', 'Hotpink', 'Silver']),
    # Odometer column: mean of the observed readings
    'Odometer (KM)': car_sales['Odometer (KM)'].mean(),
    # Doors column: constant 4
    'Doors': 4,
})

In [167]:
# Discard every row that still contains a missing value.
car_sales = car_sales.dropna()

In [168]:
# Confirm that no column has missing entries any more.
car_sales.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [169]:
# (rows, columns) remaining after the fill and drop steps.
car_sales.shape

(950, 5)

In [170]:
# Rebuild features and target from the cleaned frame.
x = car_sales.drop(columns='Price')
y = car_sales['Price']

In [178]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns treated as categories; 'Doors' is numeric but is encoded as a category here.
category_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()

# One-hot encode the categorical columns and pass every other column through unchanged.
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, category_features)],
    remainder='passthrough',
)
transformed_x = transformer.fit_transform(x)
transformed_x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [180]:
from sklearn.model_selection import train_test_split
# NOTE: these classification metrics are imported but not used in this cell.
from sklearn.metrics import accuracy_score, confusion_matrix, class_likelihood_ratios
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG so that the train/test split and the random forest
# (both called without random_state) give the same result on every run.
# This matches the seed used by the scikit-learn imputation section of this notebook.
np.random.seed(42)

model = RandomForestRegressor()
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)
model.fit(X_train, y_train)

In [181]:
# Record the installed scikit-learn version alongside the results.
import sklearn
print(sklearn.__version__)

1.5.2


In [182]:
# R^2 (coefficient of determination) of the regressor on the training split.
model.score(X_train, y_train)

0.8670271357086201

In [183]:
# R^2 (coefficient of determination) of the regressor on the held-out test split.
model.score(X_test, y_test)

0.2277083350805621

## Filling missing values with scikit-learn

In [103]:
# Reload the raw data so this section starts from the unmodified file.
car_sales_missing = pd.read_csv(filepath_or_buffer='1_ML_Datas/car-sales-extended-missing-data.csv')

In [104]:
# Preview the first five rows.
car_sales_missing.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [105]:
# Number of missing entries in each column of the raw data.
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [106]:
# Rows without a target value cannot be used for supervised training, so drop them.
car_sales_missing = car_sales_missing.dropna(subset=['Price'])

In [107]:
# Missing entries per column after removing the rows that had no 'Price'.
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [108]:
# Separate the features (still containing missing values) from the target column 'Price'.
x = car_sales_missing.drop(columns='Price')
y = car_sales_missing['Price']

In [109]:
# Preview the first five feature rows.
x.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [129]:
#Fill missing values with scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# One imputer per kind of column, declared next to the columns it handles:
#   - text columns are filled with the placeholder string 'missing'
#   - 'Doors' is filled with the constant 4
#   - the odometer reading is filled with the column mean
cat_features = ['Make', 'Colour']
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')

door_features = ['Doors']
door_imputer = SimpleImputer(strategy='constant', fill_value=4)

num_features = ['Odometer (KM)']
num_imputer = SimpleImputer(strategy='mean')

# Combine the imputers. The output columns follow the order of this list:
# Make, Colour, Odometer (KM), Doors.
imputer = ColumnTransformer(
    transformers=[
        ('cat_imputer', cat_imputer, cat_features),
        ('num_imputer', num_imputer, num_features),
        ('door_imputer', door_imputer, door_features),
    ],
)
filled_x = imputer.fit_transform(x)
filled_x

array([['Honda', 'White', 35431.0, 4.0],
       ['BMW', 'Blue', 192714.0, 5.0],
       ['Honda', 'White', 84714.0, 4.0],
       ...,
       ['Nissan', 'Blue', 66604.0, 4.0],
       ['Honda', 'White', 215883.0, 4.0],
       ['Toyota', 'Blue', 248360.0, 4.0]], dtype=object)

In [130]:
# Wrap the imputed array in a DataFrame again. The names are listed in the same order
# as the imputer's transformers; note the colour column is labelled 'Color' from here on.
car_sales_filled = pd.DataFrame(
    data=filled_x,
    columns=['Make', 'Color', 'Odometer (KM)', 'Doors'],
)

In [131]:
# Preview the first five imputed rows.
car_sales_filled.head(n=5)

Unnamed: 0,Make,Color,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [132]:
# Confirm that the imputed frame has no missing entries left.
car_sales_filled.isna().sum()

Make             0
Color            0
Odometer (KM)    0
Doors            0
dtype: int64

In [133]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns treated as categories (using the 'Color' spelling of car_sales_filled).
categorical_feature = ['Make', 'Color', 'Doors']
one_hots = OneHotEncoder()

# One-hot encode the categorical columns and pass every other column through unchanged.
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hots, categorical_feature)],
    remainder='passthrough',
)
transformed_x = transformer.fit_transform(car_sales_filled)

In [134]:
# Display a summary of the encoded feature matrix.
transformed_x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [149]:
# transformed_x holds the one-hot encoded features built from the imputed data.
# Building model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, class_likelihood_ratios

# Seed NumPy's global RNG before anything random happens (the split, then the fit).
np.random.seed(42)

# Hold out 25% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.25)

reg = RandomForestRegressor()
reg.fit(X_train, y_train)

In [150]:
# R^2 (coefficient of determination) of the regressor on the training split.
reg.score(X_train, y_train)

0.8868862778702971

In [151]:
# R^2 (coefficient of determination) of the regressor on the held-out test split.
reg.score(X_test, y_test)

0.21020722035944994

In [153]:
# Refit the forest with 10, 20, ..., 70 trees and print the test-split score (times 100) for each.
# `reg` is rebound on every pass, so after the loop it refers to the 70-tree model.
for n_trees in range(10, 80, 10):
    print(f'Model executed in {n_trees}_estimator.')
    reg = RandomForestRegressor(n_estimators=n_trees).fit(X_train, y_train)
    print(f'Score: {reg.score(X_test, y_test)*100:.2f}')

Model executed in 10_estimator.
Score: 14.86
Model executed in 20_estimator.
Score: 21.94
Model executed in 30_estimator.
Score: 21.05
Model executed in 40_estimator.
Score: 19.59
Model executed in 50_estimator.
Score: 20.80
Model executed in 60_estimator.
Score: 21.51
Model executed in 70_estimator.
Score: 23.24
