BOSTON HOUSES PREDICTION WITH SCIKIT (PART 2)

In [65]:
# We continue using the Boston houses dataset, trying to improve the results in the previous code. 
# This time, instead of dropping all the rows with missing values, we will be imputating the values. Sometimes, it also helps to get 
# better results, adding a new column and indicating if the value was imputated or not.

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split

dir = 'https://raw.githubusercontent.com/AleGL92/Scikit-Learn/main/melb_data.csv'
melb_data = pd.read_csv(dir)
# print(melb_data.describe())
# print(melb_data.head())
# When we use describe, object type columns don't appear. We could use them to get information like if a Suburb or a Street is good or bad,
# making the predicted price of the property vary. But for simplicity, we're leaving them out this time.
print(melb_data.dtypes)

y = melb_data.Price
melb_predictors = melb_data.drop(['Price'], axis=1)         # This drops the entire column
X = melb_predictors.select_dtypes(exclude=['object'])       # include = ['float64'] would have also been an option
# print(X.dtypes)

train_X, val_X, train_y, val_y = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 0)


Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object


1. HANDLING MISSING VALUES

In [67]:
# Defining the model and the error measurement function.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def get_mae(train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(random_state = 0)          # n_estimators means the number of trees for the forest
    model.fit(train_X, train_y)
    preds = model.predict(val_X)
    return mean_absolute_error(val_y, preds)

1.1 Dropping rows with missing values

In [68]:
melb_data2 = melb_data.dropna(axis = 0)
y2 = melb_data2.Price

melb_predictors2 = melb_data2.drop(['Price'], axis=1)         # This drops the entire column
X2 = melb_predictors2.select_dtypes(exclude=['object']) 

train_X2, val_X2, train_y2, val_y2 = train_test_split(X2, y2, train_size = 0.8, test_size = 0.2, random_state = 0)
print('Dropping rows we get: {}'.format(get_mae(train_X2, val_X2, train_y2, val_y2)))

# Our best result (in the previous code) was 173864 so far. This time, we didn't do that well and we got 182586.
# It also looks like the selected cols previously: 
# m_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
# give a better result than just excluding those cols that aren't objects.

Dropping rows we get: 182586.1992983871


1.2 Dropping columns with missing values

In [69]:
cols_missing_vals = [col for col in train_X.columns if train_X[col].isnull().any()]
# print(cols_missing_vals)

rtrain_X = train_X.drop(cols_missing_vals, axis = 1)
rval_X = val_X.drop(cols_missing_vals, axis = 1)
print('Dropping columns we get: {}'.format(get_mae(rtrain_X, rval_X, train_y, val_y)))

Dropping columns we get: 175703.48185157913


1.3 Using simple imputation of missing values

In [70]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
itrain_X = pd.DataFrame(imputer.fit_transform(train_X))
ival_X = pd.DataFrame(imputer.transform(val_X))

# We have to name back the columns. The imputer removes them.
itrain_X.columns = train_X.columns
ival_X.columns = val_X.columns

print('Using a simple imputer we get: {}'.format(get_mae(itrain_X, ival_X, train_y, val_y)))
# We got the best results so far (169237)

Using a simple imputer we get: 169237.0268668034


1.4 Using extended imputation

In [71]:
train_X_ext = train_X.copy()
val_X_ext = val_X.copy()

for col in cols_missing_vals:
    train_X_ext[col + '_was_missing'] = train_X_ext[col].isnull()       #Reminder: Isnull() returns 1 if null, 0 if not
    val_X_ext[col + '_was_missing'] = val_X_ext[col].isnull()

imputer = SimpleImputer()
itrain_X_ext = pd.DataFrame(imputer.fit_transform(train_X_ext))
ival_X_ext = pd.DataFrame(imputer.transform(val_X_ext))

# We have to name back the columns. The imputer removes them.
itrain_X_ext.columns = train_X_ext.columns
ival_X_ext.columns = val_X_ext.columns

print('Using an extended imputer we get: {}'.format(get_mae(itrain_X_ext, ival_X_ext, train_y, val_y)))

# The results were good but just slightly better than the simple imputer (169795). Making new columns to get almost the same result is not 
# worth it. Maybe in a different dataset this could perform better, so it's still a valid method.

Using an extended imputer we get: 169795.45249719475


In [72]:
# It's useful to check how many missing values there are:
# Shape of training data (num_rows, num_columns)
print(train_X.shape)
# Number of missing values in each column of training data
missing_val_count_by_column = (train_X.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])
# We can see that the columns Car, BuildingArea and Yearbuilt, where the ones with missing values. The last 2 with 5156 and 4307 missing
# values respectively. Car has just 49 missing. But in any those, dropping the entire columns means losing more than half of the data.

(10864, 12)
Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64


2. CATEGORICAL VARIABLES

In [73]:
# Here we will see different aproaches to handle categorical variables, such as the ones in the 'object' columns.
# melb_predictors.head()
print(melb_data.dtypes)
# We can see there are some columns with type object, that were excluded in the predictions before. Now we'll keep some of them.

y3 = melb_data.Price
X3 = melb_data.drop(['Price'], axis=1)
X3_train_full, X3_valid_full, y3_train, y3_valid = train_test_split(X3, y3, train_size=0.8, test_size=0.2)

# For simplicity, we'll continue dropping columns with missing values. We could also apply on of the imputations methods above.
# ['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']
cw_missing = [col for col in X3_train_full.columns if X3_train_full[col].isnull().any()]
print(f'Columns with missing values: {cw_missing} (This are excluded)' )
X3_train_full_d = X3_train_full.drop(cw_missing, axis = 1)      # ,inplace = True to modify directly in the original DF.
X3_valid_full_d = X3_valid_full.drop(cw_missing, axis = 1)

# Also for simplicity, we'll just convert to categorical values, those columns having less than 10 unique values. This is called 
# low cardinality. ['Type', 'Method', 'Regionname']
lc_cols = [col for col in X3_train_full_d.columns if (X3_train_full_d[col].nunique() < 10 and X3_train_full_d[col].dtype == 'object')]
print('Columns with low cardinality: ', lc_cols)

# Now we select numerical columns
n_cols = [col for col in X3_train_full_d.columns if X3_train_full_d[col].dtype in ['int64', 'float']]
print('Columns with numerical values: ', n_cols)

# Now we have all the columns that will be used; those that are numerical and those that are categorical, but dont have more than 10
# different categorical values.
my_cols = lc_cols + n_cols
X_train_f = X3_train_full_d[my_cols].copy()
X_val_f = X3_valid_full_d[my_cols].copy()
X_train_f.head()


Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object
Columns with missing values: ['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea'] (This are excluded)
Columns with low cardinality:  ['Type', 'Method', 'Regionname']
Columns with numerical values:  ['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'Propertycount']


Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
433,h,S,Southern Metropolitan,2,12.2,3147.0,2.0,1.0,586.0,-37.8683,145.1082,2894.0
7628,t,S,Northern Metropolitan,2,4.5,3057.0,2.0,1.0,3886.0,-37.7762,144.9771,5533.0
2,h,SP,Northern Metropolitan,3,2.5,3067.0,3.0,2.0,134.0,-37.8093,144.9944,4019.0
6889,h,S,Northern Metropolitan,3,12.4,3060.0,3.0,1.0,684.0,-37.7174,144.9664,5070.0
2169,h,S,Eastern Metropolitan,3,13.9,3108.0,3.0,1.0,811.0,-37.7927,145.1386,9028.0


In [74]:
# We define the model again and the measure error function (MAE)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def get_mae(X_train, X_val, y_train, y_val):
    model = RandomForestRegressor(n_estimators = 100, random_state = 1)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    mae = mean_absolute_error(y_val, preds)
    return mae

2.1 Drop categorical values

In [75]:
# We'll actually use this just as a reference to compare with the folowing methods. We are dropping columns here.
dX_train_f = X_train_f.select_dtypes(exclude = ['object'])
dX_val_f = X_val_f.select_dtypes(exclude = ['object'])

# print(dX_train_f.describe())
# print(dX_val_f.describe())
# print(y3_train.describe())
# print(y3_valid.describe())

print('MAE dropping categorical values: ', get_mae(dX_train_f, dX_val_f, y3_train, y3_valid))


MAE dropping categorical values:  177651.28653005994


2.2 Ordinal Encoding

In [76]:
# This time we'll use an ordinal encoder, which asigns a number to each categorical value, so that they can be classified.
from sklearn.preprocessing import OrdinalEncoder
# It's useful to make a copy of the original DS, to avoind changing it during the encoding
eX_train = X_train_f.copy()
eX_val = X_val_f.copy()

# Apply ordinal encoder to each column with categorical data, but just to those selected before, with low cardinality.
encoder = OrdinalEncoder()
eX_train[lc_cols] = encoder.fit_transform(eX_train[lc_cols])
eX_val[lc_cols] = encoder.fit_transform(eX_val[lc_cols])

print('MAE using ordinal encoding: ', get_mae(eX_train, eX_val, y3_train, y3_valid))
# Using the ordinal encoder, we got 176296, which is better than just dropping the categorical values

MAE using ordinal encoding:  167099.28154618497


2.3 One-hot encoding

In [77]:
# This time we'll add columns representing each categorical value. The row value will be 1 if it's the right categorical value or 
# 0 if it's not. This way we can classify all the categorical values.
from sklearn.preprocessing import OneHotEncoder
oh_e = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
# handle_unknown; When this parameter is set to ‘ignore’ and an unknown category is encountered during transform, the resulting one-hot encoded 
# columns for this feature will be all zeros.
# sparse; will return sparse matrix if set True else will return an array.

# This time, we'll just encode the categorical columns, because the OneHot encoded columns will be returned in addition.
# Then we remove from the DS the categorical columns and, finally, we add the result columns from the encoding.

oh_cols_train = pd.DataFrame(oh_e.fit_transform(X_train_f[lc_cols]))
oh_cols_val = pd.DataFrame(oh_e.fit_transform(X_val_f[lc_cols]))

# One-hot encoding removed index; put it back
oh_cols_train.index = X_train_f.index
oh_cols_val.index = X_val_f.index

# Remove categorical columns (will replace with one-hot encoding)
pre_X_train = X_train_f.drop(lc_cols, axis = 1)
pre_X_val = X_val_f.drop(lc_cols, axis = 1)

# Add one-hot encoded columns to numerical features
ohX_train = pd.concat([pre_X_train, oh_cols_train], axis = 1)
ohX_val = pd.concat([pre_X_val, oh_cols_val], axis = 1)

print('MAE using one-hot enconding: ', get_mae(ohX_train, ohX_val, y3_train, y3_valid))
# The MAE is 173099, which is slightly better than ordinal encoding. In this case I wouldnt choose one-hot encoding, because the results
# are very similar, but we're adding more columns, making the DS heavier and slower computing. However, one-hot encoding is supposed to 
# return better results than ordinal encoding. Not in this case.

MAE using one-hot enconding:  167627.49990247274


3. PIPELINES

In [78]:
# This time, we'll be applying some of the previous methods in a few lines. It's much better for clarity and it also helps avoiding mistakes.
# The pipeline will have 2 parts, one for numerical columns and another one for categorical columns.
# As a remainder, in step 2 we had separated both of them and finally concatenated as my_cols = lc_cols + n_cols
# Numerical columns were all that had format int64 or float and for categorical columns we just those with a cardinality lower than 10.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transform = SimpleImputer(strategy = 'mean')

# Preprocessing for categorical data
categorical_transform = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers = [
    ('num', numerical_transform, n_cols),
    ('cat', categorical_transform, lc_cols) 
])


In [79]:
# Creating the model, the pipeline and, evaluating the pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

model = RandomForestRegressor(n_estimators = 100, random_state = 0)
my_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', model)
])

# With the pipeline, we supply the unprocessed features in X_valid to the predict() command, and the pipeline automatically preprocesses 
# the features before generating predictions. (However, without a pipeline, we have to remember to preprocess the validation data before 
# making predictions.)

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train_f, y3_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_val_f)

# Evaluate the model
MAE = mean_absolute_error(y3_valid, preds)
print('MAE using a pipeline:', MAE)
# We got good results, similar to the best so far, 

MAE using a pipeline: 166988.97910290578
