In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import cupy as cp
import pickle

# Data preprocessing
Clean => Transform => Reduce
1. Spliting into features (`X`) and labels (`y`)
2. Handling missing data: fill (`imputation`) or discard.
3. Feature encoding (converting non-numerical values to numerical ones)

In [3]:
sales = pd.read_csv('./car-sales-extended-missing-data.csv')
sales.isna().sum()    # Shows how many rows are missing data for each feature

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

## Handling missing data

### Either fill wid pandas...

In [4]:
# Fill columns

# FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
# The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

# For example, when doing 'df[col].method(value, inplace=True)',
# try using 'df.method({col: value}, inplace=True)'
# or df[col] = df[col].method(value) instead,
# to perform the operation inplace on the original object.


sales.fillna({'Make': 'missing'}, inplace=True)
sales.fillna({'Colour': 'missing'}, inplace=True)
sales.fillna({'Odometer (KM)': sales['Odometer (KM)'].mean()}, inplace=True)
sales.fillna({'Doors': 4}, inplace=True)
# sales.fillna({'Price': sales['Price'].mean()}, inplace=True)
# sales.dropna(subset=['Price'] inplace=True)
sales.dropna(inplace=True)    # Same as previous line because at this point all other columns do not contain missing values
sales.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [5]:
X = sales.drop('Price', axis=1)

In [6]:
y = sales['Price']

### ... or using Scikit-Learn

In [7]:
sales = pd.read_csv('./car-sales-extended-missing-data.csv')
sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [8]:
sales.dropna(subset=['Price'], inplace=True)
sales.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [15]:
X = sales.drop('Price', axis=1)
y = sales['Price']

# Fill categorical values with 'missing' & numerical values with mean
categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy='mean')

categorical_features = ['Make', 'Colour']
doors_features = ['Doors']
num_features = ['Odometer (KM)']

# create an imputer
preprocessor = ColumnTransformer([
    ('cat_imputer', categorical_imputer, categorical_features),
    ('door_imputer', door_imputer, doors_features),
    ('num_imputer', num_imputer, num_features)
])

# Transform the data
filled_X = preprocessor.fit_transform(X)

X = pd.DataFrame(filled_X,
            columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])

y = y


## Splitting data

## Feature encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [13]:
categorical_features = ['Make', 'Colour', 'Doors']

# Doors, while being an int/float, is categorical because in the dataset we have only the values 5, 4 or 3
sales['Doors'].value_counts()

one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')

In [9]:
X.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [10]:
transformed_X = transformer.fit_transform(X)
pd.DataFrame(transformed_X).shape

(950, 1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [12]:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (760, 15)
X_test shape: (190, 15)
y_train shape: (760,)
y_test shape: (190,)


### Using a pipeline

In [21]:
from sklearn.pipeline import Pipeline

one_hot_enc = OneHotEncoder(handle_unknown='ignore')

categorical_transformer = Pipeline(steps=[
    ('fill_missing_values', categorical_imputer),
    ('feature_encoding', one_hot_enc)
])

door_transformer = Pipeline(steps=[
    ('fill_missing_values', door_imputer),
])

numeric_transformer = Pipeline(steps=[
    ('fill_missing_values', num_imputer)
])

preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_features),
    ('door', door_transformer, doors_features),
    ('numerical', numeric_transformer, num_features)
])

reg = RandomForestRegressor()

# Preprocessing and modeling pipeline
model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', reg)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


model.fit(X_train, y_train)
model.score(X_test, y_test)

0.3604853760243467

# Fitting the model

In [13]:
cp.random.seed(0)

model = RandomForestRegressor()
model_params = model.get_params()
n_estimators = model_params['n_estimators']

model.fit(X_train, y_train)
score = model.score(X_test, y_test)
score

0.3159162394578

In [14]:
for val in range(1, 151, 10):
    global n_estimators
    global score
    
    if val == model_params['n_estimators']:
        continue
    print(f'Trying {val} estimators...')
    local_model = RandomForestRegressor(n_estimators=val)
    local_model.fit(X_train, y_train)
    local_score = local_model.score(X_test, y_test)

    print(f'local_score: {local_score}')
    
    if local_score > score:
        # replace the n_estimator value
        n_estimators = val
        score = local_score
        print(f'Obtained better score ({local_score}) with {n_estimators} estimators')

Trying 1 estimators...
local_score: -0.24343230280124017
Trying 11 estimators...
local_score: 0.23728812929282905
Trying 21 estimators...
local_score: 0.318281110545404
Obtained better score (0.318281110545404) with 21 estimators
Trying 31 estimators...
local_score: 0.32136167475523736
Obtained better score (0.32136167475523736) with 31 estimators
Trying 41 estimators...
local_score: 0.3198768661056357
Trying 51 estimators...
local_score: 0.3082695962998957
Trying 61 estimators...
local_score: 0.31301893686629934
Trying 71 estimators...
local_score: 0.32143013960988465
Obtained better score (0.32143013960988465) with 71 estimators
Trying 81 estimators...
local_score: 0.3171072007142496
Trying 91 estimators...
local_score: 0.318359780593364
Trying 101 estimators...
local_score: 0.2990279259441113
Trying 111 estimators...
local_score: 0.31186079453951665
Trying 121 estimators...
local_score: 0.3040813619262749
Trying 131 estimators...
local_score: 0.31482108090351935
Trying 141 estimator

In [15]:
print(n_estimators)
new_model = RandomForestRegressor(n_estimators=n_estimators)
new_model.fit(X_train, y_train)
print(score)
score = new_model.score(X_test, y_test)
score

71
0.32143013960988465


0.31361814938861843