<img src="images/img.png" />

# CS5228 Project, Group 32

In [1]:
# Auto reload
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preprocessing
In this part, we are going to perform some data preprocessing steps. This may include:
* Data cleaning: handle missing values, duplicates, inconsistent or invalid values, outliers

* Data reduction: reduce number of attributes, reduce number of attribute values

* Data transformation: attribute construction, normalization

* Data discretization: encode to numerical attributes

### Load the train dataset

In [3]:
# Load the raw training CSV into a pandas DataFrame
df = pd.read_csv('./data/train.csv')

# Report the size of the raw table (rows, columns)
num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 25000 data points in training data, each with 30 attributes.


### Data Cleaning

Before data cleaning, remove the known attributes that are not meaningful to our prediction model:
  * Meaningless identifier: listing_id
  * Attributes in free text: title, description, features, accessories
  * Attributes with the same value in every row: eco_category, indicative_price
  * Attribute unlikely to affect price: curb_weight
  * Other attributes also dropped in the cell below: transmission, original_reg_date, lifespan

In [4]:
# Columns removed up front because they are not used by the prediction model.
columns_to_drop = [
    'listing_id',          # Meaningless identifier
    'title',               # Attributes in free text
    'description',
    'features',
    'accessories',
    'eco_category',        # Attribute with the same value
    'indicative_price',
    'curb_weight',         # Attribute unlikely to affect price
    # NOTE(review): the three columns below are dropped as well, but no
    # rationale is recorded for them -- confirm the intent.
    'transmission',
    'original_reg_date',
    'lifespan',
]

# drop() raises KeyError when a listed column is absent, so this cell is not
# safe to re-run on an already-trimmed frame.
df = df.drop(columns=columns_to_drop)

# Report the shape after removing the columns above
num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 25000 data points in training data, each with 19 attributes.


### Handle Missing Values
First, for each column that may contain missing values, count the rows with NaN values.
There are 2 scenarios:
1. NaN values are the majority (e.g. fuel_type has 19121 rows with NaN values): we remove the corresponding attributes.
2. NaN values are the minority: we can choose to fill in or delete the affected data points.

In [5]:
# Columns inspected for missing values in the training data
columns_to_check = [
    'make',
    'fuel_type',
    'manufactured',
    'power',
    'engine_cap',
    'mileage',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'dereg_value',
    'omv',
    'arf',
    'opc_scheme'
]

# Calculate the number of NaN values in each specified column
# (the result is a Series indexed by column name)
nan_counts = df[columns_to_check].isna().sum()

# Print the number of NaN values for each column
print('Training data')
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} rows with NaN values.")

Training data
Column 'make' has 1316 rows with NaN values.
Column 'fuel_type' has 19121 rows with NaN values.
Column 'manufactured' has 7 rows with NaN values.
Column 'power' has 2640 rows with NaN values.
Column 'engine_cap' has 596 rows with NaN values.
Column 'mileage' has 5304 rows with NaN values.
Column 'no_of_owners' has 18 rows with NaN values.
Column 'depreciation' has 507 rows with NaN values.
Column 'road_tax' has 2632 rows with NaN values.
Column 'dereg_value' has 220 rows with NaN values.
Column 'omv' has 64 rows with NaN values.
Column 'arf' has 174 rows with NaN values.
Column 'opc_scheme' has 24838 rows with NaN values.


### We first drop columns with TOO MANY NaN values and unlikely to help prediction

In [6]:
# Columns removed because of their missing values (see the NaN counts above)
columns_to_drop_nan = ['fuel_type', 'opc_scheme', 'make']

# errors='ignore' skips any name that is no longer present, which is what the
# per-column membership check did before; re-running the cell stays harmless.
df = df.drop(columns=columns_to_drop_nan, errors='ignore')

### Transform categorical value to numerical values

In [7]:
# Categorical attributes that are replaced by integer codes
categorical_columns = ['model', 'type_of_vehicle']

# column name -> {original label as str: integer code}. Written to disk so the
# same codes can be applied to other data later.
encode_dict = {}
le = LabelEncoder()
for column in categorical_columns:
    # fit_transform refits the encoder, so sharing one instance is fine
    df[column] = le.fit_transform(df[column])
    labels = [str(label) for label in le.classes_]
    encode_dict[column] = dict(zip(labels, range(len(labels))))

with open('./data/encode.json', 'w') as file:
    json.dump(encode_dict, file, indent=4)

Transform date time attributes to numerical values
This step is required to fill up the missing values.

In [8]:
# Replace the registration date by its year (a plain number)
if 'reg_date' in df.columns:
    reg_years = pd.to_datetime(df['reg_date'], format='%d-%b-%Y').dt.year
    # Drop first, then append, so 'reg_year' ends up as the last column
    df = df.drop(columns=['reg_date']).assign(reg_year=reg_years)

num_records, num_attributes = df.shape
print("There are {} data points, each with {} attributes.".format(num_records, num_attributes))

There are 25000 data points, each with 16 attributes.


### Handle category attribute

In [9]:
from util.DataPreprocess import HandlingCategoryAttribute

# Only run the helper while the raw 'category' column is still present.
# Judging by the cell output, it replaces 'category' with one 0/1 indicator
# column per category value -- confirm in util.DataPreprocess.
if 'category' in df.columns:
    df = HandlingCategoryAttribute(df)
    
# Show the resulting column set
print(df.columns)

Number of unique categories: 15
Unique categories: {'premium ad car', 'rare & exotic', 'sta evaluated car', 'sgcarmart warranty cars', 'imported used vehicle', 'coe car', 'parf car', 'opc car', 'consignment car', 'almost new car', 'vintage cars', 'hybrid cars', 'low mileage car', 'direct owner sale', 'electric cars'}
There are 25000 data points, each with 30 attributes.
Index(['model', 'manufactured', 'type_of_vehicle', 'power', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'price', 'reg_year', 'almost new car',
       'coe car', 'consignment car', 'direct owner sale', 'electric cars',
       'hybrid cars', 'imported used vehicle', 'low mileage car', 'opc car',
       'parf car', 'premium ad car', 'rare & exotic',
       'sgcarmart warranty cars', 'sta evaluated car', 'vintage cars'],
      dtype='object')


Then we try to fill up other missing values.

In [10]:
from util.DataPreprocess import HandlingMissingValues  # imported but not used in this cell
from util.DataPreprocess import HandlingMissingValueWithImpute

# Columns handed to the imputation helper.
# NOTE(review): the list contains the target 'price' and the label-encoded
# 'model' / 'type_of_vehicle' codes -- confirm that imputing these is intended.
columns = [
    'model',
    'manufactured',
    'type_of_vehicle',
    'power',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'coe', 
    'road_tax',
    'dereg_value',
    'mileage',
    'omv',
    'arf',
    'price',
    'reg_year'
]

# Show the frame before imputation, then fill the missing values
print(df.columns)
print(df.head())
df = HandlingMissingValueWithImpute(df, columns)

Index(['model', 'manufactured', 'type_of_vehicle', 'power', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'price', 'reg_year', 'almost new car',
       'coe car', 'consignment car', 'direct owner sale', 'electric cars',
       'hybrid cars', 'imported used vehicle', 'low mileage car', 'opc car',
       'parf car', 'premium ad car', 'rare & exotic',
       'sgcarmart warranty cars', 'sta evaluated car', 'vintage cars'],
      dtype='object')
   model  manufactured  type_of_vehicle  power  engine_cap  no_of_owners  \
0    595        2018.0                8  280.0      2995.0           2.0   
1    192        2017.0                2  135.0      1991.0           2.0   
2    546        2007.0                4  118.0      2354.0           3.0   
3    156        2008.0                3   80.0      1598.0           3.0   
4    398        2006.0                2  183.0      2995.0           6.0   

   depreciation    coe  r



In [11]:
# Preview the first rows after imputation
df.head()

Unnamed: 0,model,manufactured,type_of_vehicle,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,...,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars
0,595.0,2018.0,8.0,280.0,2995.0,2.0,34270.0,48011.0,2380.0,103323.0,...,0,0,0,0,1,0,0,0,0,0
1,192.0,2017.0,2.0,135.0,1991.0,2.0,21170.0,47002.0,1202.0,45179.0,...,0,0,0,0,1,1,0,0,0,0
2,546.0,2007.0,4.0,118.0,2354.0,3.0,12520.0,50355.0,2442.0,16003.0,...,0,0,1,0,0,1,0,0,0,0
3,156.0,2008.0,3.0,80.0,1598.0,3.0,10140.0,27571.0,1113.0,12184.0,...,0,0,0,0,0,1,0,0,0,0
4,398.0,2006.0,2.0,183.0,2995.0,6.0,13690.0,48479.0,3570.0,9138.0,...,0,0,0,0,0,1,0,0,0,0


### Remove Exact Duplicates
We remove duplicated data points here.

In [12]:
# Remove rows that are identical in every column
df = df.drop_duplicates()

# Report how many rows remain
num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 24988 data points in training data, each with 30 attributes.


### Remove outliers

In [13]:
# NOTE: outlier removal is currently disabled -- nothing runs in this cell.
# The call below is kept for reference only.
# from util.DataPreprocess import OutlierRemoval

# df = OutlierRemoval(df, 'model', 'price')

### Save the preprocessed data

In [14]:
# Destination of the cleaned and imputed training data
file_name = './data/train_preprocessed_impute.csv'

# Check if the file exists
# (to_csv writes in 'w' mode by default and would overwrite it anyway; the
# explicit delete mainly drives the message below)
if os.path.exists(file_name):
    # Delete the file
    os.remove(file_name)
    print(f"Existing file '{file_name}' has been deleted.")

# Save the DataFrame to CSV
# index=False keeps the row index out of the file
df.to_csv(file_name, index=False)
print(f"DataFrame has been saved to '{file_name}'.")

Existing file './data/train_preprocessed_impute.csv' has been deleted.
DataFrame has been saved to './data/train_preprocessed_impute.csv'.


## Data Mining

### Load preprocessed training data

In [52]:
# Reload the preprocessed training data written by the save step above
training_file = './data/train_preprocessed_impute.csv'
df = pd.read_csv(training_file)

# Explicit column selection (and order) used for mining; 'price' is the target
columns_to_keep = [
    'model',
    'manufactured',
    'type_of_vehicle',
    'power',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'coe',
    'road_tax',
    'dereg_value',
    'mileage',
    'omv',
    'arf',
    'price',
    'reg_year',
    'almost new car',
    'coe car',
    'consignment car',
    'direct owner sale',
    'electric cars',
    'hybrid cars',
    'imported used vehicle',
    'low mileage car',
    'opc car',
    'parf car',
    'premium ad car',
    'rare & exotic',
    'sgcarmart warranty cars',
    'sta evaluated car',
    'vintage cars'
]
df = df[columns_to_keep]
# From here on columns_to_keep holds the feature names only (target excluded)
columns_to_keep = [col for col in df.columns if col != 'price']

print(df.columns)
num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

Index(['model', 'manufactured', 'type_of_vehicle', 'power', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'price', 'reg_year', 'almost new car',
       'coe car', 'consignment car', 'direct owner sale', 'electric cars',
       'hybrid cars', 'imported used vehicle', 'low mileage car', 'opc car',
       'parf car', 'premium ad car', 'rare & exotic',
       'sgcarmart warranty cars', 'sta evaluated car', 'vintage cars'],
      dtype='object')
There are 24988 data points in training data, each with 30 attributes.


### Data Augmentation, copy rows with less than 5 samples by group

In [53]:
from util.DataPreprocess import DataAugmentation

# Build an augmented copy of the training frame; df itself is left as is.
# Per the section heading the helper copies rows of under-represented groups
# -- the grouping key and threshold live in util.DataPreprocess (confirm there).
df_aug = DataAugmentation(df)

# Report the size after augmentation
num_records, num_attributes = df_aug.shape
print("There are {} data points after augmentation, each with {} attributes.". format(num_records, num_attributes))

There are 41980 data points after augmentation, each with 30 attributes.


### Save the augmentation data

In [54]:
# Destination of the augmented training data
file_name = './data/train_preprocessed_augmentation.csv'

# Check if the file exists
# (to_csv writes in 'w' mode by default and would overwrite it anyway; the
# explicit delete mainly drives the message below)
if os.path.exists(file_name):
    # Delete the file
    os.remove(file_name)
    print(f"Existing file '{file_name}' has been deleted.")

# Save the DataFrame to CSV
# index=False keeps the row index out of the file
df_aug.to_csv(file_name, index=False)
print(f"DataFrame has been saved to '{file_name}'.")

Existing file './data/train_preprocessed_augmentation.csv' has been deleted.
DataFrame has been saved to './data/train_preprocessed_augmentation.csv'.


### Load test data and preprocess

In [59]:
# Load the raw test CSV into its own DataFrame
test_file = './data/test.csv'
df_test = pd.read_csv(test_file)

### Convert date

In [60]:
# Same date handling as for the training data: keep only the registration year
if 'reg_date' in df_test.columns:
    reg_years = pd.to_datetime(df_test['reg_date'], format='%d-%b-%Y').dt.year
    # Drop first, then append, so 'reg_year' ends up as the last column
    df_test = df_test.drop(columns=['reg_date']).assign(reg_year=reg_years)

num_records, num_attributes = df_test.shape
print("There are {} data points, each with {} attributes.".format(num_records, num_attributes))

There are 10000 data points, each with 29 attributes.


### Convert category data

In [61]:
from util.DataPreprocess import HandlingCategoryAttribute

# Apply the same 'category' handling to the test data, guarded so the helper
# only runs while the raw column is still present.
if 'category' in df_test.columns:
    df_test = HandlingCategoryAttribute(df_test)

Number of unique categories: 15
Unique categories: {'premium ad car', 'rare & exotic', 'sta evaluated car', 'sgcarmart warranty cars', 'imported used vehicle', 'coe car', 'parf car', 'opc car', 'consignment car', 'almost new car', 'vintage cars', 'hybrid cars', 'low mileage car', 'direct owner sale', 'electric cars'}
There are 10000 data points, each with 43 attributes.


### Select attributes

In [62]:
# Use exactly the training feature columns (everything except the target
# 'price'), in the same order as the training frame.
columns_to_keep = [col for col in df.columns if col != 'price']

# .copy() makes df_test an independent frame, so later column assignments
# modify it directly instead of triggering pandas' SettingWithCopyWarning on a
# slice of the original test frame.
df_test = df_test[columns_to_keep].copy()

### Encode attributes on test data

In [63]:
num_records, num_attributes = df_test.shape
print("There are {} data points, each with {} attributes.".format(num_records, num_attributes))

# Reuse the label -> code mappings saved while encoding the training data, so
# both data sets share the same integer codes.
with open('./data/encode.json', 'r') as file:
    data = json.load(file)

for col, cate_dict in data.items():
    if col not in df_test.columns:
        continue
    mapped = df_test[col].map(cate_dict)
    # Labels that never occurred in the training data have no code and come
    # out of map() as NaN; report them instead of letting them pass silently.
    unseen = int((mapped.isna() & df_test[col].notna()).sum())
    if unseen:
        print(f"Column '{col}': {unseen} test rows have a category not seen in training (encoded as NaN).")
    # Plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values rather than keeping the original object dtype.
    df_test[col] = mapped

num_records, num_attributes = df_test.shape
print("There are {} data points in test data, each with {} attributes.".format(num_records, num_attributes))

There are 10000 data points, each with 29 attributes.
There are 10000 data points in test data, each with 29 attributes.


### Handle missing values on test data

In [64]:
# Columns inspected for missing values in the test data
columns_to_check = [
    'model',
    'manufactured',
    'power',
    'engine_cap',
    'mileage',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'dereg_value',
    'omv',
    'arf',
]

# Calculate the number of NaN values in each specified column
nan_counts = df_test[columns_to_check].isna().sum()

# Print the number of NaN values for each column.
# The banner names the test set: these counts come from df_test, not from the
# training frame.
print('Test data')
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} rows with NaN values.")

Training data
Column 'model' has 77 rows with NaN values.
Column 'manufactured' has 3 rows with NaN values.
Column 'power' has 1086 rows with NaN values.
Column 'engine_cap' has 235 rows with NaN values.
Column 'mileage' has 2166 rows with NaN values.
Column 'no_of_owners' has 8 rows with NaN values.
Column 'depreciation' has 201 rows with NaN values.
Column 'road_tax' has 1082 rows with NaN values.
Column 'dereg_value' has 83 rows with NaN values.
Column 'omv' has 29 rows with NaN values.
Column 'arf' has 65 rows with NaN values.


In [65]:
from util.DataPreprocess import HandlingCategoryAttribute

# NOTE(review): df_test was already restricted to the training feature columns
# above, which do not include 'category', so this guard is expected to be False
# and the cell a no-op -- confirm and consider removing it.
if 'category' in df_test.columns:
    df_test = HandlingCategoryAttribute(df_test)

In [66]:
# Preview the encoded test data before imputation
df_test.head()

Unnamed: 0,model,manufactured,type_of_vehicle,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,...,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars
0,746.0,2015.0,8,96.0,1496.0,2.0,17660.0,57199,682.0,9582.0,...,0,0,0,0,1,0,0,0,0,0
1,41.0,2007.0,3,79.0,1598.0,1.0,10920.0,42564,1113.0,13644.0,...,0,0,1,0,0,1,0,0,0,0
2,235.0,2019.0,6,141.0,1998.0,1.0,22120.0,32801,1210.0,54818.0,...,0,0,0,0,1,0,0,0,0,0
3,748.0,2019.0,3,79.0,1496.0,3.0,13700.0,29159,682.0,26363.0,...,0,0,0,0,1,1,0,0,0,0
4,41.0,2015.0,1,88.0,1496.0,3.0,14190.0,56001,682.0,15197.0,...,0,0,0,0,1,1,0,0,0,0


In [67]:
from util.DataPreprocess import HandlingMissingValueWithImputeReference

# Columns to impute in the test data. 'model' is deliberately not listed here,
# so its NaN values stay in place (rows without a model code are predicted
# separately further down).
columns = [
    'manufactured',
    'type_of_vehicle',
    'power',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'coe', 
    'road_tax',
    'dereg_value',
    'mileage',
    'omv',
    'arf',
    'reg_year'
]

# The training frame df is passed as the second argument -- presumably as the
# reference the imputation is fitted on; confirm in util.DataPreprocess.
df_test = HandlingMissingValueWithImputeReference(df_test, df, columns)

   model  manufactured  type_of_vehicle  power  engine_cap  no_of_owners  \
0  746.0        2015.0              8.0   96.0      1496.0           2.0   
1   41.0        2007.0              3.0   79.0      1598.0           1.0   
2  235.0        2019.0              6.0  141.0      1998.0           1.0   
3  748.0        2019.0              3.0   79.0      1496.0           3.0   
4   41.0        2015.0              1.0   88.0      1496.0           3.0   

   depreciation      coe  road_tax  dereg_value  ...  hybrid cars  \
0       17660.0  57199.0     682.0       9582.0  ...            0   
1       10920.0  42564.0    1113.0      13644.0  ...            0   
2       22120.0  32801.0    1210.0      54818.0  ...            0   
3       13700.0  29159.0     682.0      26363.0  ...            0   
4       14190.0  56001.0     682.0      15197.0  ...            0   

   imported used vehicle  low mileage car  opc car  parf car  premium ad car  \
0                      0                0       



In [68]:
# Preview the test data after imputation
df_test.head()

Unnamed: 0,model,manufactured,type_of_vehicle,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,...,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars
0,746.0,2015.0,8.0,96.0,1496.0,2.0,17660.0,57199.0,682.0,9582.0,...,0,0,0,0,1,0,0,0,0,0
1,41.0,2007.0,3.0,79.0,1598.0,1.0,10920.0,42564.0,1113.0,13644.0,...,0,0,1,0,0,1,0,0,0,0
2,235.0,2019.0,6.0,141.0,1998.0,1.0,22120.0,32801.0,1210.0,54818.0,...,0,0,0,0,1,0,0,0,0,0
3,748.0,2019.0,3.0,79.0,1496.0,3.0,13700.0,29159.0,682.0,26363.0,...,0,0,0,0,1,1,0,0,0,0
4,41.0,2015.0,1.0,88.0,1496.0,3.0,14190.0,56001.0,682.0,15197.0,...,0,0,0,0,1,1,0,0,0,0


In [69]:
# Compare the column sets side by side: the test frame should match the
# training frame except for the target 'price'
print(df_test.columns)
print(df.columns)

Index(['model', 'manufactured', 'type_of_vehicle', 'power', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'reg_year', 'almost new car', 'coe car',
       'consignment car', 'direct owner sale', 'electric cars', 'hybrid cars',
       'imported used vehicle', 'low mileage car', 'opc car', 'parf car',
       'premium ad car', 'rare & exotic', 'sgcarmart warranty cars',
       'sta evaluated car', 'vintage cars'],
      dtype='object')
Index(['model', 'manufactured', 'type_of_vehicle', 'power', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'price', 'reg_year', 'almost new car',
       'coe car', 'consignment car', 'direct owner sale', 'electric cars',
       'hybrid cars', 'imported used vehicle', 'low mileage car', 'opc car',
       'parf car', 'premium ad car', 'rare & exotic',
       'sgcarmart warranty cars', 'sta evaluated car', 'vintage car

### Check if train data has all models in test data

In [70]:
# Distinct encoded model codes in each data set
models_in_df = set(df['model'].unique())
models_in_df_test = set(df_test['model'].unique())

# Whatever is left after removing the training codes is a test value the
# training data does not contain (NaN appears here for test rows that have
# no model code).
missing_models = models_in_df_test - models_in_df
if missing_models:
    print("df does not include", missing_models)
else:
    print("df includes all models in df_test")

df does not include {nan}


### Mining code here

In [71]:
from util.DataMining import split_dataframe, split_dataframe_flex
from util.DataMining import (
    RandomForestMining,
    RandomForestMiningByModel,
    GradientBoostingMining,
    GradientBoostingMiningByModel,
    XGBoostMining,
    LinearRegressionMining,
    LinearRegressionMiningByModel,
    CombinedDataMiningRandomForestAndLinearRegression
)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [31]:
# run_times, rmse_sum = 1, 0
# for i in tqdm(range(run_times), desc='Running Random Forest'):
#     target_col = 'price'
#     x_train, x_test, y_train, y_test = split_dataframe(df, target_col)
#     rmse_sum += RandomForestMining(x_train, x_test, y_train, y_test)
# print('Average RMSE:', round(rmse_sum / run_times))

Running Random Forest: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:01<00:00, 61.83s/it]

Running not in develop mode
RMSE on test data: 20512.807808161826
Average RMSE: 20513





In [72]:
# run_times, rmse_sum = 1, 0
# for i in tqdm(range(run_times), desc='Running Random Forest'):
#     train_drop_cols = ['price']
#     test_cols = ['price', 'model']
#     x_train, x_test, y_train, y_test = split_dataframe_flex(df_aug, train_drop_cols, test_cols)
#     rmse_sum += RandomForestMiningByModel(x_train, x_test, y_train, y_test)
# print('Average RMSE:', round(rmse_sum / run_times))

In [73]:
# Hold-out evaluation of XGBoost on the augmented training frame, averaged
# over run_times repetitions.
# NOTE(review): df_aug contains copied rows (see the augmentation step). If
# split_dataframe splits rows at random, copies of the same listing can land on
# both sides of the split and make the RMSE look better than it is -- confirm
# how the helper splits.
run_times, rmse_sum = 1, 0
for i in tqdm(range(run_times), desc='Running XGBoost'):
    # NOTE(review): a list is passed here, whereas the commented-out Random
    # Forest cell passes the plain string 'price' -- confirm both are accepted.
    train_drop_cols = ['price']
    x_train, x_test, y_train, y_test = split_dataframe(df_aug, train_drop_cols)
    # The helper's return value is accumulated as the RMSE of this run
    rmse_sum += XGBoostMining(x_train, x_test, y_train, y_test)
print('Average RMSE:', round(rmse_sum / run_times))

Running XGBoost: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [02:06<00:00, 126.57s/it]

Running not in develop mode
RMSE on test data: 17117.394423159407
Average RMSE: 17117





In [80]:
# Remaining missing values per column in the test set
null_counts = df_test.isnull().sum()
print(null_counts)

# Attribute subset used for this submission
columns_to_keep = [
    'model',
    'manufactured',
    'power',
    'engine_cap',
    'no_of_owners',
    'depreciation',
    'coe',
    'road_tax',
    'dereg_value',
    'mileage',
    'omv',
    'arf',
    'price',
    'reg_year',
    'coe car',
    'low mileage car',
    'opc car',
    'parf car',
]

df_selected = df[columns_to_keep]

# 'model' is left out of the features (it still has NaN in the test set, see
# the counts printed above); 'price' is the target.
x_train = df_selected.drop(columns=['price', 'model'])
y_train = df[['price']]
x_test = df_test[x_train.columns]

# Turn the row index of the predictions into an explicit 'Id' column
res_nomodel = (
    XGBoostMining(x_train, x_test, y_train, dev=True)
    .reset_index()
    .rename(columns={'index': 'Id'})
)
print(res_nomodel.head())

model                      77
manufactured                0
type_of_vehicle             0
power                       0
engine_cap                  0
no_of_owners                0
depreciation                0
coe                         0
road_tax                    0
dereg_value                 0
mileage                     0
omv                         0
arf                         0
reg_year                    0
almost new car              0
coe car                     0
consignment car             0
direct owner sale           0
electric cars               0
hybrid cars                 0
imported used vehicle       0
low mileage car             0
opc car                     0
parf car                    0
premium ad car              0
rare & exotic               0
sgcarmart warranty cars     0
sta evaluated car           0
vintage cars                0
dtype: int64
   Id      Predicted
0   0   19610.484375
1   1   34358.296875
2   2  151862.703125
3   3   77911.984375
4   4   2638

In [81]:
# Features actually used to train the model above
x_train.columns

Index(['manufactured', 'power', 'engine_cap', 'no_of_owners', 'depreciation',
       'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'reg_year',
       'coe car', 'low mileage car', 'opc car', 'parf car'],
      dtype='object')

In [82]:
# Write the XGBoost predictions (Id, Predicted) to disk without the row index
res_nomodel.to_csv('./data/xgb_result.csv', index=False)

In [83]:
# Compare all test columns with the subset of features used for training
print(df_test.columns)
print(x_train.columns)

Index(['model', 'manufactured', 'type_of_vehicle', 'power', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'reg_year', 'almost new car', 'coe car',
       'consignment car', 'direct owner sale', 'electric cars', 'hybrid cars',
       'imported used vehicle', 'low mileage car', 'opc car', 'parf car',
       'premium ad car', 'rare & exotic', 'sgcarmart warranty cars',
       'sta evaluated car', 'vintage cars'],
      dtype='object')
Index(['manufactured', 'power', 'engine_cap', 'no_of_owners', 'depreciation',
       'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'reg_year',
       'coe car', 'low mileage car', 'opc car', 'parf car'],
      dtype='object')


### This cell does prediction model by model

In [108]:
# Features keep the 'model' code; the target frame carries 'model' next to 'price'
x_train = df.drop(columns=['price'])
y_train = df[['price', 'model']]

# Only test rows that have a model code take part in this step
x_test = df_test.loc[df_test['model'].notna(), x_train.columns]

# Total number of NaN cells left in the selected test rows
null_counts = x_test.isnull().sum().sum()
print(null_counts)

0


In [109]:
# Random Forest prediction by model for the test rows that have a model code
# (dev=True is a helper flag; see util.DataMining for its meaning)
res_model = RandomForestMiningByModel(x_train, x_test, y_train, dev=True)
print(res_model.head())

  y_pred = pd.concat([y_pred, temp_df])


    Predicted
0   20320.695
1   33179.915
2  145620.640
3   71142.640
4   26596.770


### This cell does prediction on test data with 'model' attribute missing

In [110]:
# Train without the 'model' code; 'price' is the target
x_train = df.drop(columns=['price', 'model'])
y_train = df[['price']]

# Test rows that have no model code
df_test_unmapped = df_test.loc[df_test['model'].isna()]
x_test = df_test_unmapped[x_train.columns]

# Note: this rebinds res_nomodel, replacing the earlier XGBoost result
res_nomodel = RandomForestMining(x_train, x_test, y_train, dev=True)
print(res_nomodel.head())

  return fit_method(estimator, *args, **kwargs)


         Predicted
21    61466.892653
195  282726.617879
212  164401.565762
412   71105.936234
440  178263.588893


In [111]:
# Sizes of the two partial results (rows with / without a model code)
print(len(res_model))
print(len(res_nomodel))

# Stack both parts; this first file keeps the row index as its first column
res = pd.concat([res_model, res_nomodel])
res.to_csv('./data/res_by_model_original.csv')

# Expose the row index as 'Id', then write the rows ordered by Id
res = res.reset_index().rename(columns={'index': 'Id'})
res_sorted = res.sort_values(by='Id')
res_sorted.to_csv('./data/res_by_model_rf.csv', index=False)

9923
77


In [112]:
# Preview the combined predictions (not yet sorted by Id)
res.head()

Unnamed: 0,Id,Predicted
0,0,20320.695
1,1,33179.915
2,2,145620.64
3,3,71142.64
4,4,26596.77
