In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("bmw.csv")
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,19000,Automatic,40818,Diesel,150,54.3,2.0
10777,5 Series,2016,14600,Automatic,42947,Diesel,125,60.1,2.0
10778,3 Series,2017,13100,Manual,25468,Petrol,200,42.8,2.0
10779,1 Series,2014,9930,Automatic,45000,Diesel,30,64.2,2.0


## Data Cleaning

In [3]:
# Unique values per column
data.nunique()

model             24
year              25
price           3777
transmission       3
mileage         8086
fuelType           5
tax               38
mpg              102
engineSize        17
dtype: int64

In [4]:
# Check for missing values
missing_values = [np.nan, "", " ", None]
data.isin(missing_values).mean().sort_values(ascending=False)*100

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [5]:
# Seperating out the target column
X = data.drop(columns="price")
y = data["price"]

### Feature Engineering

In [6]:
# Classify every value in the "model" column as a car type

car_type = {"5 Series":"sedan",
            "6 Series":"coup",
            "1 Series":"coup",
            "7 Series":"sedan",
            "2 Series":"coup",
            "4 Series":"coup",
            "X3":"suv",
            "3 Series":"sedan",
            "X5":"suv",
            "X4":"suv",
            "i3":"electric",
            "X1":"suv",
            "M4":"sports",
            "X2":"suv",
            "X6":"suv",
            "8 Series":"coup",
            "Z4":"convertible",
            "X7":"suv",
            "M5":"sports",
            "i8":"electric",
            "M2":"sports",
            "M6":"sports",
            "Z3":"convertible"}

X["model"] = X["model"].str.strip()
X["car_type"] = X["model"].map(car_type)
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,car_type
0,5 Series,2014,Automatic,67068,Diesel,125,57.6,2.0,sedan
1,6 Series,2018,Automatic,14827,Petrol,145,42.8,2.0,coup
2,5 Series,2016,Automatic,62794,Diesel,160,51.4,3.0,sedan
3,1 Series,2017,Automatic,26676,Diesel,145,72.4,1.5,coup
4,7 Series,2014,Automatic,39554,Diesel,160,50.4,3.0,sedan
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,Automatic,40818,Diesel,150,54.3,2.0,suv
10777,5 Series,2016,Automatic,42947,Diesel,125,60.1,2.0,sedan
10778,3 Series,2017,Manual,25468,Petrol,200,42.8,2.0,sedan
10779,1 Series,2014,Automatic,45000,Diesel,30,64.2,2.0,coup


### Data Encoding

Encode the data using OneHotEncoding

In [7]:
X = pd.get_dummies(X, drop_first=True, dtype=int)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,car_type_coup,car_type_electric,car_type_sedan,car_type_sports,car_type_suv
0,2014,67068,125,57.6,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2018,14827,145,42.8,2.0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
2,2016,62794,160,51.4,3.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,2017,26676,145,72.4,1.5,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2014,39554,160,50.4,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,40818,150,54.3,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10777,2016,42947,125,60.1,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
10778,2017,25468,200,42.8,2.0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
10779,2014,45000,30,64.2,2.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,car_type_coup,car_type_electric,car_type_sedan,car_type_sports,car_type_suv
0,0.750000,0.313399,0.215517,0.111971,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.916667,0.069281,0.250000,0.080163,0.303030,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.833333,0.293427,0.275862,0.098646,0.454545,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.875000,0.124650,0.250000,0.143778,0.227273,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.750000,0.184828,0.275862,0.096497,0.454545,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,0.833333,0.190735,0.258621,0.104879,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10777,0.833333,0.200683,0.215517,0.117344,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10778,0.875000,0.119005,0.344828,0.080163,0.303030,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
10779,0.750000,0.210277,0.051724,0.126155,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Train-Test Split

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

## Model Implementation and Evaluation: Regression

* To evaluate the accuracy of the model we will use Mean Squared Error (MSE)  metric.

### Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

linear_regressor = LinearRegression()

linear_regressor.fit(X_train, y_train)

linear_predictions = linear_regressor.predict(X_test)

mean_absolute_error(y_test, linear_predictions)

2886.932121661721

In [13]:
# Percentage above below mean price in y_test data

mean_absolute_error(y_test, linear_predictions) / y_test.mean()

0.12672650943618186

We can see that thee model can predict a price of a bmw car within 12% above or below the true value. This isn't amazing but it is a good start, however this can be further improved.

### Random Forest

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

rf_regressor = RandomForestRegressor(n_estimators=1000)

rf_regressor.fit(X_train, y_train)

rf_predictions = rf_regressor.predict(X_test)

mean_absolute_error(y_test, rf_predictions)


1560.4434033254363

In [21]:
# Percentage above below mean price in y_test data

mean_absolute_error(y_test, rf_predictions) / y_test.mean()

0.06849816252774374

Using the random forest algorithm has reduced the predictions error by almost 50%. This is a significant improvement from the linear regression model. Let us see if we can improve this further with the next algorithm.

### XGBoost

We will implement a boosting algorithm now on this dataset. These algorithms have become very popular in the last few years, especially during Kaggle Data Science Competitions as they are fast, lightweight and relatively easy to implement.

In [22]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

boost_model = XGBRegressor()

boost_model.fit(X_train, y_train)

boost_predictions = boost_model.predict(X_test)

mean_absolute_error(y_test, boost_predictions)

1539.893030998615

In [23]:
# Percentage above below mean price in y_test data

mean_absolute_error(y_test, boost_predictions) / y_test.mean()

0.06759607101923507

We can see that the XGBoost algorithm has performed similary to the random forest algorithm in terms of accuracy. However, it is worth noting that the boosting algorithm took significantly less time to execute compared to the random forest algorithm. This is one of the many reasons this algorithm has become very popular amongst the data science community due to its relatively high accuracy and fast runtimes. The 6% error can be further improved upon using HyperParamater Tuning.

## Hyperparameter Tuning the Random Forest Algorithm

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minumum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create random grid
random_grid = {"n_estimators":n_estimators,
               "max_features":max_features,
               "max_depth":max_depth,
               "min_samples_split":min_samples_split,
               "min_samples_leaf":min_samples_leaf,
               "bootstrap":bootstrap}


pprint(random_grid)




{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [20]:
# Use the random grid to search for best hyperparameters
# First create base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation
# Search across 100 different combinations and use all available cores

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train, y_train)



Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [24]:
pprint(rf_random.best_params_)

{'bootstrap': False,
 'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 800}


### Evaluate Random Search

To determine if random search yielded an improved model, we compare the base model with the best random search model.

In [29]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    mae = mean_absolute_error(test_labels, predictions)
    mape = 100 * np.mean(mae/test_labels)
    accuracy = 100 - mape
    print("Model Performance")
    print("Average Error: {:0.3f} GBP.".format(mae))
    print("Accuracy = {:0.2f}%.".format(accuracy))

    return accuracy

In [30]:
base_accuracy = evaluate(rf_regressor, X_test, y_test)

Model Performance
Average Error: 1560.443 GBP.
Accuracy = 91.31%.


In [31]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Average Error: 1536.952 GBP.
Accuracy = 91.44%.


We achieved an improvement of 0.13% using the random search. This isn't a significant improvement but the random search has given us hyperparamters which we can explore to further improve our results in further work.