<img src="images/img.png" />

# CS5228 Project, Group 32

In [1]:
# Auto reload
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preprocessing
In this part, we are going to perform some data preprocessing steps. This may include:
* Data cleaning: handle missing values, duplicates, inconsistant or invalid vallues, outliers

* Data reduction: reduce number of attributes, reduce number of attribute values

* Data transformation: attribute construction, normalization

* Data discretization: encode to numerical attributes

### Load the train dataset

In [3]:
# Load file into pandas dataframe
df = pd.read_csv('./data/train.csv')

num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 25000 data points in training data, each with 30 attributes.


### Data Cleaning

Before data cleaning, remove the known attributes that are not meaningful to our prediction model:
  * Meaningless idendifier: listing_id 
  * Attributes in free text: title, description, features, accessories
  * Attribute with the same value: eco_category, indicative_price
  * Attribute unlikely to affect price: curb_weight

In [4]:
columns_to_drop = [
    'listing_id',          # Meaningless identifier
    'description',
    'features',
    'accessories',
    'eco_category',        # Attribute with the same value
    'indicative_price',
    'curb_weight',         # Attribute unlikely to affect price
    'transmission',
    'original_reg_date',
    'lifespan',
]

df = df.drop(columns=columns_to_drop)

num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 25000 data points in training data, each with 20 attributes.


### Handle Missing Values
Firstly, for each of the columns with missing value, check the number of rows with NaN values.
There are 3 scenarios:
1. NaN value is the major (e.g. fuel_type has 19121 rows with NaN values), we remove the corresponding attritubes.
2. NaN value is the minor. We can choose to fill or delete related data points. 

In [5]:
columns_to_check = [
    'make',
    'fuel_type',
    'manufactured',
    'power',
    'engine_cap',
    'mileage',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'dereg_value',
    'omv',
    'arf',
    'opc_scheme'
]

# Calculate the number of NaN values in each specified column
nan_counts = df[columns_to_check].isna().sum()

# Print the number of NaN values for each column
print('Training data')
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} rows with NaN values.")

Training data
Column 'make' has 1316 rows with NaN values.
Column 'fuel_type' has 19121 rows with NaN values.
Column 'manufactured' has 7 rows with NaN values.
Column 'power' has 2640 rows with NaN values.
Column 'engine_cap' has 596 rows with NaN values.
Column 'mileage' has 5304 rows with NaN values.
Column 'no_of_owners' has 18 rows with NaN values.
Column 'depreciation' has 507 rows with NaN values.
Column 'road_tax' has 2632 rows with NaN values.
Column 'dereg_value' has 220 rows with NaN values.
Column 'omv' has 64 rows with NaN values.
Column 'arf' has 174 rows with NaN values.
Column 'opc_scheme' has 24838 rows with NaN values.


### We first drop columns with TOO MANY NaN values and unlikely to help prediction

In [6]:
columns_to_drop_nan = [
    'fuel_type',
    'opc_scheme',
    'power'
]

for col in columns_to_drop_nan:
    if col in df.columns:
        df = df.drop(columns=[col])

### Transform categorical value to numerical values

Transform date time attributes to numerical values
This step is required to fill up the missing values.

In [7]:
from util.DataPreprocess import CalculateCarAge
    

df = CalculateCarAge(df)
num_records, num_attributes = df.shape
print("There are {} data points, each with {} attributes.". format(num_records, num_attributes))

There are 25000 data points, each with 18 attributes.


In [8]:
df['title'] = df['title'].str.replace(r'\s*\(.*?\)', '', regex=True)

In [9]:
categorical_columns = [
    'model',
    'make',
    'type_of_vehicle',
    'title'
]

encode_dict = {}
le = LabelEncoder()
for column in categorical_columns:
    df[column] = le.fit_transform(df[column])
    encode_dict[column] = {str(label): int(index) for index, label in enumerate(le.classes_)}

with open('./data/encode.json', 'w') as file:
    json.dump(encode_dict, file, indent=4)

### Handle category attribute

In [10]:
from util.DataPreprocess import HandlingCategoryAttribute

if 'category' in df.columns:
    df = HandlingCategoryAttribute(df)
    
print(df.columns)

Number of unique categories: 15
Unique categories: {'rare & exotic', 'opc car', 'almost new car', 'electric cars', 'hybrid cars', 'low mileage car', 'consignment car', 'imported used vehicle', 'parf car', 'vintage cars', 'coe car', 'premium ad car', 'sta evaluated car', 'sgcarmart warranty cars', 'direct owner sale'}
There are 25000 data points, each with 32 attributes.
Index(['title', 'make', 'model', 'manufactured', 'type_of_vehicle',
       'engine_cap', 'no_of_owners', 'depreciation', 'coe', 'road_tax',
       'dereg_value', 'mileage', 'omv', 'arf', 'price', 'reg_year', 'car_age',
       'almost new car', 'coe car', 'consignment car', 'direct owner sale',
       'electric cars', 'hybrid cars', 'imported used vehicle',
       'low mileage car', 'opc car', 'parf car', 'premium ad car',
       'rare & exotic', 'sgcarmart warranty cars', 'sta evaluated car',
       'vintage cars'],
      dtype='object')


In [11]:
columns_to_check = [
    'title',
    'make',
    'manufactured',
    'engine_cap',
    'mileage',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'dereg_value',
    'omv',
    'arf',
]

# Calculate the number of NaN values in each specified column
nan_counts = df[columns_to_check].isna().sum()

# Print the number of NaN values for each column
print('Training data')
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} rows with NaN values.")

df = df.dropna(subset=[
    'manufactured',
])

num_records, num_attributes = df.shape
print("There are {} data points, each with {} attributes.". format(num_records, num_attributes))

Training data
Column 'title' has 0 rows with NaN values.
Column 'make' has 0 rows with NaN values.
Column 'manufactured' has 7 rows with NaN values.
Column 'engine_cap' has 596 rows with NaN values.
Column 'mileage' has 5304 rows with NaN values.
Column 'no_of_owners' has 18 rows with NaN values.
Column 'depreciation' has 507 rows with NaN values.
Column 'road_tax' has 2632 rows with NaN values.
Column 'dereg_value' has 220 rows with NaN values.
Column 'omv' has 64 rows with NaN values.
Column 'arf' has 174 rows with NaN values.
There are 24993 data points, each with 32 attributes.


### Then we try to fill up other missing values.

In [12]:
from util.DataPreprocess import HandlingMissingValue
from util.DataPreprocess import HandlingMissingValueWithImpute

columns = [
    'model',
    'mileage',
    'engine_cap',
    'no_of_owners',
    'coe', 
    'road_tax',
    'omv',
    'arf',
    'price',
]

print(df.columns)

df = HandlingMissingValue(df)
df = HandlingMissingValueWithImpute(df, columns)

Index(['title', 'make', 'model', 'manufactured', 'type_of_vehicle',
       'engine_cap', 'no_of_owners', 'depreciation', 'coe', 'road_tax',
       'dereg_value', 'mileage', 'omv', 'arf', 'price', 'reg_year', 'car_age',
       'almost new car', 'coe car', 'consignment car', 'direct owner sale',
       'electric cars', 'hybrid cars', 'imported used vehicle',
       'low mileage car', 'opc car', 'parf car', 'premium ad car',
       'rare & exotic', 'sgcarmart warranty cars', 'sta evaluated car',
       'vintage cars'],
      dtype='object')
title                         0
make                          0
model                         0
manufactured                  0
type_of_vehicle               0
engine_cap                  589
no_of_owners                 18
depreciation                  0
coe                           0
road_tax                   2529
dereg_value                   0
mileage                    5094
omv                          57
arf                         165
price   

In [13]:
df.head()

Unnamed: 0,title,make,model,manufactured,type_of_vehicle,engine_cap,no_of_owners,depreciation,coe,road_tax,...,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars
0,1452,43,595.0,2018.0,8,2995.0,2.0,34270.0,48011.0,2380.0,...,0,0,0,0,1,0,0,0,0,0
1,1928,51,192.0,2017.0,2,1991.0,2.0,21170.0,47002.0,1202.0,...,0,0,0,0,1,1,0,0,0,0
2,1012,29,546.0,2007.0,4,2354.0,3.0,12520.0,50355.0,2442.0,...,0,0,1,0,0,1,0,0,0,0
3,3204,88,156.0,2008.0,3,1598.0,3.0,10140.0,27571.0,1113.0,...,0,0,0,0,0,1,0,0,0,0
4,1474,44,398.0,2006.0,2,2995.0,6.0,13690.0,48479.0,3570.0,...,0,0,0,0,0,1,0,0,0,0


### Remove Exact Duplicates
We remove duplicated data points here.

In [14]:
df = df.drop_duplicates()

num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 24417 data points in training data, each with 32 attributes.


### Data Calculation

In [15]:
from util.DataPreprocess import DataCalculation

df = DataCalculation(df)
print(df.head())

   title  make  model  manufactured  type_of_vehicle  engine_cap  \
0   1452    43  595.0        2018.0                8      2995.0   
1   1928    51  192.0        2017.0                2      1991.0   
2   1012    29  546.0        2007.0                4      2354.0   
3   3204    88  156.0        2008.0                3      1598.0   
4   1474    44  398.0        2006.0                2      2995.0   

   no_of_owners  depreciation      coe  road_tax  ...  imported used vehicle  \
0           2.0       34270.0  48011.0    2380.0  ...                      0   
1           2.0       21170.0  47002.0    1202.0  ...                      0   
2           3.0       12520.0  50355.0    2442.0  ...                      0   
3           3.0       10140.0  27571.0    1113.0  ...                      0   
4           6.0       13690.0  48479.0    3570.0  ...                      0   

   low mileage car  opc car  parf car  premium ad car  rare & exotic  \
0                0        0         1 

### Save the preprocessed data

In [16]:
file_name = './data/train_preprocessed_impute.csv'

# Check if the file exists
if os.path.exists(file_name):
    # Delete the file
    os.remove(file_name)
    print(f"Existing file '{file_name}' has been deleted.")

# Save the DataFrame to CSV
df.to_csv(file_name, index=False)
print(f"DataFrame has been saved to '{file_name}'.")

Existing file './data/train_preprocessed_impute.csv' has been deleted.
DataFrame has been saved to './data/train_preprocessed_impute.csv'.


## Data Mining

### Load preprocessed training data

In [17]:
# Load file into pandas dataframe, we saved our preprocessed file at path 'output_file'
training_file = './data/train_preprocessed_impute.csv'
df = pd.read_csv(training_file)

columns_to_keep = [
    'make',
    'model',
    'manufactured',
    'type_of_vehicle',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'mileage',
    'omv',
    'arf',
    'omv_arf_ratio',
    'dereg_coe_ratio',
    'price',
    'car_age',
    'almost new car',
    'coe car',
    'consignment car',
    'direct owner sale',
    'electric cars',
    'hybrid cars',
    'imported used vehicle',
    'low mileage car',
    'opc car',
    'parf car',
    'premium ad car',
    'rare & exotic',
    'sgcarmart warranty cars',
    'sta evaluated car',
    'vintage cars'
]
df = df[columns_to_keep]
columns_to_keep = [col for col in df.columns if col != 'price']

print(df.columns)
num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

Index(['make', 'model', 'manufactured', 'type_of_vehicle', 'engine_cap',
       'no_of_owners', 'depreciation', 'road_tax', 'mileage', 'omv', 'arf',
       'omv_arf_ratio', 'dereg_coe_ratio', 'price', 'car_age',
       'almost new car', 'coe car', 'consignment car', 'direct owner sale',
       'electric cars', 'hybrid cars', 'imported used vehicle',
       'low mileage car', 'opc car', 'parf car', 'premium ad car',
       'rare & exotic', 'sgcarmart warranty cars', 'sta evaluated car',
       'vintage cars'],
      dtype='object')
There are 24417 data points in training data, each with 30 attributes.


### Data Augmentation, copy rows with less than 5 samples by group

In [18]:
from util.DataPreprocess import DataAugmentation

df_aug = DataAugmentation(df)

num_records, num_attributes = df_aug.shape
print("There are {} data points after augmentation, each with {} attributes.". format(num_records, num_attributes))

There are 40701 data points after augmentation, each with 30 attributes.


### Save the augmentation data

In [19]:
file_name = './data/train_preprocessed_augmentation.csv'

# Check if the file exists
if os.path.exists(file_name):
    # Delete the file
    os.remove(file_name)
    print(f"Existing file '{file_name}' has been deleted.")

# Save the DataFrame to CSV
df_aug.to_csv(file_name, index=False)
print(f"DataFrame has been saved to '{file_name}'.")

Existing file './data/train_preprocessed_augmentation.csv' has been deleted.
DataFrame has been saved to './data/train_preprocessed_augmentation.csv'.


### Load test data and preprocess

In [20]:
test_file = './data/test.csv'
df_test = pd.read_csv(test_file)

### Convert date

In [21]:
from util.DataPreprocess import CalculateCarAge
df_test = CalculateCarAge(df_test)

num_records, num_attributes = df_test.shape
print("There are {} data points, each with {} attributes.". format(num_records, num_attributes))

There are 10000 data points, each with 30 attributes.


### Convert category data

In [22]:
from util.DataPreprocess import HandlingCategoryAttribute

if 'category' in df_test.columns:
    df_test = HandlingCategoryAttribute(df_test)

Number of unique categories: 15
Unique categories: {'rare & exotic', 'almost new car', 'opc car', 'electric cars', 'hybrid cars', 'low mileage car', 'consignment car', 'imported used vehicle', 'parf car', 'vintage cars', 'coe car', 'premium ad car', 'sta evaluated car', 'sgcarmart warranty cars', 'direct owner sale'}
There are 10000 data points, each with 44 attributes.


### Data calculation

In [23]:
from util.DataPreprocess import DataCalculation

df_test = DataCalculation(df_test)

   listing_id                               title    make   model  \
0     1303772                  Honda Vezel 1.5A X   honda   vezel   
1     1323166  Mazda 3 1.6A SP (COE till 10/2027)   mazda       3   
2     1308405       MINI Cooper S Countryman 2.0A    mini  cooper   
3     1216706                  Toyota Vios 1.5A G  toyota    vios   
4     1298206                     Mazda 3 HB 1.5A   mazda       3   

                                         description  manufactured  \
0                                               4614        2015.0   
1  extremely well maintained and in pristine cond...        2007.0   
2  1 owner! beautiful island blue color! eurokars...        2019.0   
3  fully agent maintain! genuine low mileage at 5...        2019.0   
4  workshop check/sta evaluation available. accid...        2015.0   

  original_reg_date  type_of_vehicle transmission  curb_weight  ...  \
0               NaN              suv         auto       1190.0  ...   
1               NaN  m

### Select attributes

In [24]:
columns_to_keep = df.columns
columns_to_keep = [col for col in columns_to_keep if col != 'price']

df_test = df_test[columns_to_keep]
total_nulls = df_test.isnull().sum()
print(total_nulls)

make                        541
model                         0
manufactured                  3
type_of_vehicle               0
engine_cap                  235
no_of_owners                  8
depreciation                201
road_tax                   1082
mileage                    2166
omv                          29
arf                          65
omv_arf_ratio                65
dereg_coe_ratio              83
car_age                       0
almost new car                0
coe car                       0
consignment car               0
direct owner sale             0
electric cars                 0
hybrid cars                   0
imported used vehicle         0
low mileage car               0
opc car                       0
parf car                      0
premium ad car                0
rare & exotic                 0
sgcarmart warranty cars       0
sta evaluated car             0
vintage cars                  0
dtype: int64


### Encode attributes on test data

In [25]:
num_records, num_attributes = df_test.shape
print("There are {} data points, each with {} attributes.". format(num_records, num_attributes))

categorical_columns = [
    'make',
    'model',
    'type_of_vehicle',
    'transmission',
]

with open('./data/encode.json', 'r') as file:
    data = json.load(file)

for col, cate_dict in data.items():
    if col in df_test.columns:
        df_test.loc[:, col] = df_test[col].map(cate_dict)

num_records, num_attributes = df_test.shape
print("There are {} data points in test data, each with {} attributes.". format(num_records, num_attributes))

There are 10000 data points, each with 29 attributes.
There are 10000 data points in test data, each with 29 attributes.


In [26]:
total_nulls = df_test.isnull().sum()
print(total_nulls)

make                        544
model                        77
manufactured                  3
type_of_vehicle               0
engine_cap                  235
no_of_owners                  8
depreciation                201
road_tax                   1082
mileage                    2166
omv                          29
arf                          65
omv_arf_ratio                65
dereg_coe_ratio              83
car_age                       0
almost new car                0
coe car                       0
consignment car               0
direct owner sale             0
electric cars                 0
hybrid cars                   0
imported used vehicle         0
low mileage car               0
opc car                       0
parf car                      0
premium ad car                0
rare & exotic                 0
sgcarmart warranty cars       0
sta evaluated car             0
vintage cars                  0
dtype: int64


### Handle missing values on test data

In [27]:
columns_to_check = [
    'make',
    'model',
    'manufactured',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'mileage',
    'omv_arf_ratio',
    'dereg_coe_ratio'
]

# Calculate the number of NaN values in each specified column
nan_counts = df_test[columns_to_check].isna().sum()

# Print the number of NaN values for each column
print('Test data')
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} rows with NaN values.")

Test data
Column 'make' has 544 rows with NaN values.
Column 'model' has 77 rows with NaN values.
Column 'manufactured' has 3 rows with NaN values.
Column 'engine_cap' has 235 rows with NaN values.
Column 'no_of_owners' has 8 rows with NaN values.
Column 'depreciation' has 201 rows with NaN values.
Column 'road_tax' has 1082 rows with NaN values.
Column 'mileage' has 2166 rows with NaN values.
Column 'omv_arf_ratio' has 65 rows with NaN values.
Column 'dereg_coe_ratio' has 83 rows with NaN values.


In [28]:
from util.DataPreprocess import HandlingCategoryAttribute

if 'category' in df_test.columns:
    df_test = HandlingCategoryAttribute(df_test)

In [29]:
from util.DataPreprocess import HandlingMissingValueWithImputeReference
from util.DataPreprocess import HandlingMissingValueWithReference

columns = [
    'make',
#     'model',
    'manufactured',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'mileage',
    'omv_arf_ratio',
    'dereg_coe_ratio'
]

# df_test = HandlingMissingValueWithReference(df, df_test)
total_nulls = df_test.isnull().sum()
print(total_nulls)
df_test = HandlingMissingValueWithImputeReference(df_test, df, columns)

make                        544
model                        77
manufactured                  3
type_of_vehicle               0
engine_cap                  235
no_of_owners                  8
depreciation                201
road_tax                   1082
mileage                    2166
omv                          29
arf                          65
omv_arf_ratio                65
dereg_coe_ratio              83
car_age                       0
almost new car                0
coe car                       0
consignment car               0
direct owner sale             0
electric cars                 0
hybrid cars                   0
imported used vehicle         0
low mileage car               0
opc car                       0
parf car                      0
premium ad car                0
rare & exotic                 0
sgcarmart warranty cars       0
sta evaluated car             0
vintage cars                  0
dtype: int64
   make  model  manufactured type_of_vehicle  engine_cap  n

In [30]:
df_test.head()

Unnamed: 0,make,model,manufactured,type_of_vehicle,engine_cap,no_of_owners,depreciation,road_tax,mileage,omv,...,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars
0,29.0,746.0,2015.0,8,1496.0,2.0,17660.0,682.0,112000.0,19229.0,...,0,0,0,0,1,0,0,0,0,0
1,49.0,41.0,2007.0,3,1598.0,1.0,10920.0,1113.0,120000.0,14347.0,...,0,0,1,0,0,1,0,0,0,0
2,53.0,235.0,2019.0,6,1998.0,1.0,22120.0,1210.0,43000.0,39863.0,...,0,0,0,0,1,0,0,0,0,0
3,88.0,748.0,2019.0,3,1496.0,3.0,13700.0,682.0,53300.0,15573.0,...,0,0,0,0,1,1,0,0,0,0
4,49.0,41.0,2015.0,1,1496.0,3.0,14190.0,682.0,149000.0,18097.0,...,0,0,0,0,1,1,0,0,0,0


In [31]:
print(df_test.columns)
print(df.columns)

Index(['make', 'model', 'manufactured', 'type_of_vehicle', 'engine_cap',
       'no_of_owners', 'depreciation', 'road_tax', 'mileage', 'omv', 'arf',
       'omv_arf_ratio', 'dereg_coe_ratio', 'car_age', 'almost new car',
       'coe car', 'consignment car', 'direct owner sale', 'electric cars',
       'hybrid cars', 'imported used vehicle', 'low mileage car', 'opc car',
       'parf car', 'premium ad car', 'rare & exotic',
       'sgcarmart warranty cars', 'sta evaluated car', 'vintage cars'],
      dtype='object')
Index(['make', 'model', 'manufactured', 'type_of_vehicle', 'engine_cap',
       'no_of_owners', 'depreciation', 'road_tax', 'mileage', 'omv', 'arf',
       'omv_arf_ratio', 'dereg_coe_ratio', 'price', 'car_age',
       'almost new car', 'coe car', 'consignment car', 'direct owner sale',
       'electric cars', 'hybrid cars', 'imported used vehicle',
       'low mileage car', 'opc car', 'parf car', 'premium ad car',
       'rare & exotic', 'sgcarmart warranty cars', 'sta eval

### Check if train data has all models in test data

In [32]:
models_in_df = set(df['model'].unique())
models_in_df_test = set(df_test['model'].unique())

if models_in_df_test.issubset(models_in_df):
    print("df includes all models in df_test")
else:
    missing_models = models_in_df_test - models_in_df
    print("df does not include", missing_models)

df does not include {nan, 36.0, 651.0, 659.0, 411.0}


### Mining code here

In [33]:
from util.DataMining import split_dataframe, split_dataframe_flex
from util.DataMining import (
    RandomForestMining,
    RandomForestMiningByModel,
    GradientBoostingMining,
    GradientBoostingMiningByModel,
    XGBoostMining,
    LinearRegressionMining,
    LinearRegressionMiningByModel,
    CombinedDataMiningRandomForestAndLinearRegression
)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [34]:
# run_times, rmse_sum = 1, 0
# for i in tqdm(range(run_times), desc='Running Random Forest'):
#     target_col = 'price'
#     x_train, x_test, y_train, y_test = split_dataframe(df, target_col)
#     rmse_sum += RandomForestMining(x_train, x_test, y_train, y_test)
# print('Average RMSE:', round(rmse_sum / run_times))

In [35]:
# run_times, rmse_sum = 1, 0
# for i in tqdm(range(run_times), desc='Running Random Forest'):
#     train_drop_cols = ['price']
#     test_cols = ['price', 'model']
#     x_train, x_test, y_train, y_test = split_dataframe_flex(df_aug, train_drop_cols, test_cols)
#     rmse_sum += RandomForestMiningByModel(x_train, x_test, y_train, y_test)
# print('Average RMSE:', round(rmse_sum / run_times))

In [36]:
selected_columns = [
    'make',
    'model',
#     'manufactured',
    'type_of_vehicle',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'mileage',
    'omv_arf_ratio',
    'dereg_coe_ratio',
    'car_age',
    'omv',
    'arf',
#     'almost new car',
#     'coe car',
#     'consignment car',
#     'direct owner sale',
#     'electric cars',
#     'hybrid cars',
#     'imported used vehicle',
#     'low mileage car',
#     'opc car',
#     'parf car',
#     'premium ad car',
#     'rare & exotic',
#     'sgcarmart warranty cars',
#     'sta evaluated car',
#     'vintage cars',
    'price'
]

df_aug_filtered = df_aug[selected_columns]

run_times, rmse_sum = 5, 0
for i in tqdm(range(run_times), desc='Running XGBoost'):
    train_drop_cols = ['price']
    x_train, x_test, y_train, y_test = split_dataframe(df_aug_filtered, train_drop_cols)
    rmse_sum += XGBoostMining(x_train, x_test, y_train, y_test)
print('Average RMSE:', round(rmse_sum / run_times))

Running XGBoost:  20%|███████████████████▌                                                                              | 1/5 [00:00<00:01,  2.91it/s]

Running not in develop mode
RMSE on test data: 13140.46003938495


Running XGBoost:  40%|███████████████████████████████████████▏                                                          | 2/5 [00:00<00:00,  3.09it/s]

Running not in develop mode
RMSE on test data: 14096.101051966554


Running XGBoost:  60%|██████████████████████████████████████████████████████████▊                                       | 3/5 [00:00<00:00,  3.11it/s]

Running not in develop mode
RMSE on test data: 16576.65168696996


Running XGBoost:  80%|██████████████████████████████████████████████████████████████████████████████▍                   | 4/5 [00:01<00:00,  3.17it/s]

Running not in develop mode
RMSE on test data: 17132.915678012374


Running XGBoost: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.11it/s]

Running not in develop mode
RMSE on test data: 13580.462872042624
Average RMSE: 14905





In [37]:
df_test.head()

Unnamed: 0,make,model,manufactured,type_of_vehicle,engine_cap,no_of_owners,depreciation,road_tax,mileage,omv,...,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars
0,29.0,746.0,2015.0,8,1496.0,2.0,17660.0,682.0,112000.0,19229.0,...,0,0,0,0,1,0,0,0,0,0
1,49.0,41.0,2007.0,3,1598.0,1.0,10920.0,1113.0,120000.0,14347.0,...,0,0,1,0,0,1,0,0,0,0
2,53.0,235.0,2019.0,6,1998.0,1.0,22120.0,1210.0,43000.0,39863.0,...,0,0,0,0,1,0,0,0,0,0
3,88.0,748.0,2019.0,3,1496.0,3.0,13700.0,682.0,53300.0,15573.0,...,0,0,0,0,1,1,0,0,0,0
4,49.0,41.0,2015.0,1,1496.0,3.0,14190.0,682.0,149000.0,18097.0,...,0,0,0,0,1,1,0,0,0,0


In [38]:
df_selected = df[selected_columns]

x_train, y_train = df_selected.drop(columns=['price']), df[['price']]
x_test = df_test[x_train.columns]

res_nomodel = XGBoostMining(x_train, x_test, y_train, dev=True)
res_nomodel.reset_index(inplace=True)
res_nomodel.rename(columns={'index': 'Id'}, inplace=True)
print(res_nomodel.head(20))

    Id      Predicted
0    0   22187.541016
1    1   33934.449219
2    2  158490.984375
3    3   76098.468750
4    4   24278.529297
5    5   75574.937500
6    6   75925.726562
7    7  204159.671875
8    8   67015.765625
9    9   68682.593750
10  10   55705.699219
11  11  119554.281250
12  12  109528.281250
13  13   79220.890625
14  14   68134.585938
15  15   14353.547852
16  16   78719.882812
17  17   52718.089844
18  18  134685.000000
19  19  102051.070312


In [39]:
x_train.columns

Index(['make', 'model', 'type_of_vehicle', 'engine_cap', 'no_of_owners',
       'depreciation', 'road_tax', 'mileage', 'omv_arf_ratio',
       'dereg_coe_ratio', 'car_age', 'omv', 'arf'],
      dtype='object')

In [40]:
res_nomodel.to_csv('./data/xgb_result.csv', index=False)

In [41]:
print(df_test.columns)
print(x_train.columns)

Index(['make', 'model', 'manufactured', 'type_of_vehicle', 'engine_cap',
       'no_of_owners', 'depreciation', 'road_tax', 'mileage', 'omv', 'arf',
       'omv_arf_ratio', 'dereg_coe_ratio', 'car_age', 'almost new car',
       'coe car', 'consignment car', 'direct owner sale', 'electric cars',
       'hybrid cars', 'imported used vehicle', 'low mileage car', 'opc car',
       'parf car', 'premium ad car', 'rare & exotic',
       'sgcarmart warranty cars', 'sta evaluated car', 'vintage cars'],
      dtype='object')
Index(['make', 'model', 'type_of_vehicle', 'engine_cap', 'no_of_owners',
       'depreciation', 'road_tax', 'mileage', 'omv_arf_ratio',
       'dereg_coe_ratio', 'car_age', 'omv', 'arf'],
      dtype='object')


### This cell do prediction model by model

In [44]:
x_train, y_train = df.drop(columns=['price']), df[['price', 'model']]
x_test = df_test[x_train.columns].dropna(subset=['model'])

null_counts = x_test.isnull().sum().sum()
print(null_counts)

79


In [45]:
res_model = RandomForestMiningByModel(x_train, x_test, y_train, dev=True)
print(res_model.head())

  y_pred = pd.concat([y_pred, temp_df])


KeyError: 411.0

### This cell do prediction on test data with 'model' attribute missing

In [None]:
x_train, y_train = df.drop(columns=['price', 'model']), df[['price']]
df_test_unmapped = df_test[df_test['model'].isna()]
x_test = df_test_unmapped[x_train.columns]

res_nomodel = RandomForestMining(x_train, x_test, y_train, dev=True)
print(res_nomodel.head())

In [None]:
print(len(res_model))
print(len(res_nomodel))
res = pd.concat([res_model, res_nomodel])
res.to_csv('./data/res_by_model_original.csv')
res.reset_index(inplace=True)
res.rename(columns={'index': 'Id'}, inplace=True)
res_sorted = res.sort_values(by='Id')
res_sorted.to_csv('./data/res_by_model_rf.csv', index=False)

In [None]:
res.head()