<img src="images/img.png" />

# CS5228 Project, Group 32

In [1]:
# Auto reload
%load_ext autoreload
%autoreload 2

## Data Preprocessing
In this part, we are going to perform some data preprocessing steps. This may include:
* Data cleaning: handle missing values, duplicates, inconsistant or invalid vallues, outliers

* Data reduction: reduce number of attributes, reduce number of attribute values

* Data transformation: attribute construction, normalization

* Data discretization: encode to numerical attributes

### Setting up the Notebook

In [2]:
import os
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load file into pandas dataframe
df = pd.read_csv('./data/train.csv')

num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 25000 data points in training data, each with 30 attributes.


### Data Cleaning

Before data cleaning, remove the known attributes that are not meaningful to our prediction model:
  * Meaningless idendifier: listing_id 
  * Attributes in free text: title, description, features, accessories
  * Attribute with the same value: eco_category, indicative_price
  * Attribute unlikely to affect price: curb_weight

In [4]:
columns_to_drop = [
    'listing_id',          # Meaningless identifier
    'title',               # Attributes in free text
    'description',
    'features',
    'accessories',
    'eco_category',        # Attribute with the same value
    'indicative_price',
    'curb_weight',         # Attribute unlikely to affect price
    'original_reg_date',
    'lifespan',
]

df = df.drop(columns=columns_to_drop)

num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 25000 data points in training data, each with 20 attributes.


### Handle Missing Values
Firstly, for each of the columns with missing value, check the number of rows with NaN values.
There are 3 scenarios:
1. NaN value is the major (e.g. fuel_type has 19121 rows with NaN values), we remove the corresponding attritubes.
2. NaN value is the minor. We can choose to fill or delete related data points. 

In [5]:
columns_to_check = [
    'make',
    'fuel_type',
    'manufactured',
    'power',
    'engine_cap',
    'mileage',
    'no_of_owners',
    'depreciation',
    'road_tax',
    'dereg_value',
    'omv',
    'arf',
    'opc_scheme'
]

# Calculate the number of NaN values in each specified column
nan_counts = df[columns_to_check].isna().sum()

# Print the number of NaN values for each column
print('Training data')
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} rows with NaN values.")

Training data
Column 'make' has 1316 rows with NaN values.
Column 'fuel_type' has 19121 rows with NaN values.
Column 'manufactured' has 7 rows with NaN values.
Column 'power' has 2640 rows with NaN values.
Column 'engine_cap' has 596 rows with NaN values.
Column 'mileage' has 5304 rows with NaN values.
Column 'no_of_owners' has 18 rows with NaN values.
Column 'depreciation' has 507 rows with NaN values.
Column 'road_tax' has 2632 rows with NaN values.
Column 'dereg_value' has 220 rows with NaN values.
Column 'omv' has 64 rows with NaN values.
Column 'arf' has 174 rows with NaN values.
Column 'opc_scheme' has 24838 rows with NaN values.


We delete attributes with TOO many NaN value here.

In [6]:
columns_to_drop_nan = [
    'fuel_type',
    'opc_scheme'
]

df = df.drop(columns=columns_to_drop_nan)

Then we try to fill up other missing values.

In [7]:
from util.DataPreprocess import HandlingMissingValues

df = HandlingMissingValues(df)

NaN values after handling:  0


### Remove Exact Duplicates
We remove duplicated data points here.

In [8]:
df = df.drop_duplicates()

num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 24258 data points in training data, each with 18 attributes.


### Merge rows with fewer data points on specific attributes

### Transform categorical value to numerical values

In [9]:
categorical_columns = [
    'make',
    'model',
    'type_of_vehicle',
    'transmission',
]

encode_dict = {}
le = LabelEncoder()
for column in categorical_columns:
    df[column] = le.fit_transform(df[column])
    encode_dict[column] = {str(label): int(index) for index, label in enumerate(le.classes_)}

with open('./data/encode.json', 'w') as file:
    json.dump(encode_dict, file, indent=4)

### Transform date time attributes to numerical values

In [10]:
df['reg_date'] = pd.to_datetime(df['reg_date'], format='%d-%b-%Y')
df['reg_year'] = df['reg_date'].dt.year
df = df.drop(columns=['reg_date'])

num_records, num_attributes = df.shape
print("There are {} data points, each with {} attributes.". format(num_records, num_attributes))

There are 24258 data points, each with 18 attributes.


### Handle category attribute

In [11]:
from util.DataPreprocess import HandlingCategoryAttribute

df = HandlingCategoryAttribute(df)

Number of unique categories: 15
Unique categories: {'sgcarmart warranty cars', 'hybrid cars', 'rare & exotic', 'almost new car', 'direct owner sale', 'consignment car', 'electric cars', 'imported used vehicle', 'vintage cars', 'sta evaluated car', 'parf car', 'opc car', 'coe car', 'low mileage car', 'premium ad car'}
There are 24258 data points, each with 32 attributes.


### Remove outliers

In [12]:
# from util.DataPreprocess import OutlierRemoval

# df = OutlierRemoval(df, 'model', 'price')

### Saving the Data

In [13]:
file_name = './data/train_preprocessed.csv'

# Check if the file exists
if os.path.exists(file_name):
    # Delete the file
    os.remove(file_name)
    print(f"Existing file '{file_name}' has been deleted.")

# Save the DataFrame to CSV
df.to_csv(file_name, index=False)
print(f"DataFrame has been saved to '{file_name}'.")

Existing file './data/train_preprocessed.csv' has been deleted.
DataFrame has been saved to './data/train_preprocessed.csv'.


## Data Mining

### Load preprocessed training data

In [14]:
# Load file into pandas dataframe, we saved our preprocessed file at path 'output_file'
training_file = './data/train_preprocessed.csv'
df = pd.read_csv(training_file)

columns_to_keep = [
    'model',
    'mileage',
    'low mileage car',
    'manufactured',
    'reg_year',
    'type_of_vehicle',
    'dereg_value',
    'depreciation',
    'power',
    'coe',
    'arf',
    'omv',
    'price',
    'road_tax',
    'almost new car',
    'coe car',
    'parf car',
]

df = df[columns_to_keep]
columns_to_keep = [col for col in df.columns if col != 'price']

num_records, num_attributes = df.shape
print("There are {} data points in training data, each with {} attributes.". format(num_records, num_attributes))

There are 24258 data points in training data, each with 17 attributes.


### Load test data and preprocess

In [15]:
test_file = './data/test.csv'
df_test = pd.read_csv(test_file)

df_test['reg_date'] = pd.to_datetime(df_test['reg_date'], format='%d-%b-%Y')
df_test['reg_year'] = df_test['reg_date'].dt.year
df_test = df_test.drop(columns=['reg_date'])

# Replace '-' with an empty string
df_test['category'] = df_test['category'].replace('-', '')

# Split the 'category' column into lists
df_test['category_list'] = df_test['category'].str.split(', ')

# Handle empty strings by replacing them with empty lists
df_test['category_list'] = df_test['category_list'].apply(lambda x: [] if x == [''] else x)

# Import itertools for flattening lists
from itertools import chain

# Flatten the list of lists to a single list
all_categories = list(chain.from_iterable(df_test['category_list']))

# Get the unique categories
unique_categories = set(all_categories)

# Print the number of unique categories
print(f"Number of unique categories: {len(unique_categories)}")
print("Unique categories:", unique_categories)

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the category lists
category_dummies = mlb.fit_transform(df_test['category_list'])

# Create a DataFrame with the one-hot encoded categories
category_df = pd.DataFrame(category_dummies, columns=mlb.classes_, index=df_test.index)

# Concatenate the new dummy columns to the original DataFrame
df_test = pd.concat([df_test, category_df], axis=1)

# Drop the temporary 'category_list' column if desired
df_test.drop('category_list', axis=1, inplace=True)
df_test.drop('category', axis=1, inplace=True)

num_records, num_attributes = df.shape

print("There are {} data points, each with {} attributes.". format(num_records, num_attributes))

Number of unique categories: 15
Unique categories: {'sgcarmart warranty cars', 'hybrid cars', 'rare & exotic', 'almost new car', 'direct owner sale', 'imported used vehicle', 'consignment car', 'vintage cars', 'electric cars', 'sta evaluated car', 'parf car', 'opc car', 'coe car', 'low mileage car', 'premium ad car'}
There are 24258 data points, each with 17 attributes.


### Data Augmentation, copy rows with less than 5 samples by group

In [16]:
from util.DataPreprocess import DataAugmentation

df_aug = DataAugmentation(df)

num_records, num_attributes = df_aug.shape
print("There are {} data points after augmentation, each with {} attributes.". format(num_records, num_attributes))

There are 40290 data points after augmentation, each with 17 attributes.


### Select attributes on test data

In [17]:
num_records, num_attributes = df_test.shape
print("There are {} data points, each with {} attributes.". format(num_records, num_attributes))

categorical_columns = [
    'make',
    'model',
    'type_of_vehicle',
    'transmission',
]

with open('./data/encode.json', 'r') as file:
    data = json.load(file)

for col, cate_dict in data.items():
    if col in df_test.columns:
        df_test[col] = df_test[col].map(cate_dict)

df_test = df_test[columns_to_keep]

num_records, num_attributes = df_test.shape
print("There are {} data points in test data, each with {} attributes.". format(num_records, num_attributes))

There are 10000 data points, each with 43 attributes.
There are 10000 data points in test data, each with 16 attributes.


In [18]:
df_test_unmapped = df_test[df_test['model'].isna()]
nan_counts = df_test[df_test.columns].isna().sum()

      model  mileage  low mileage car  manufactured  reg_year  \
21      NaN      NaN                0        2013.0      2014   
195     NaN    652.0                1        2021.0      2022   
212     NaN  49523.0                1        2014.0      2016   
402     NaN      NaN                0        2014.0      2016   
412     NaN      NaN                0        1949.0      1974   
...     ...      ...              ...           ...       ...   
9466    NaN      NaN                0        1983.0      2021   
9579    NaN      NaN                0        1955.0      2018   
9647    NaN      NaN                0        1989.0      1990   
9704    NaN   5000.0                1        2011.0      2011   
9954    NaN    900.0                1        2024.0      2024   

      type_of_vehicle  dereg_value  depreciation  power    coe       arf  \
21                  9      33393.0       12550.0    NaN  35660    2399.0   
195                 8     118937.0       31900.0  184.0  83684   69

### Check if train data has all models in test data

In [19]:
models_in_df = set(df['model'].unique())
models_in_df_test = set(df_test['model'].unique())

if models_in_df_test.issubset(models_in_df):
    print("df includes all models in df_test")
else:
    missing_models = models_in_df_test - models_in_df
    print("df does not include", missing_models)

df does not include {np.float64(nan)}


### Mining code here

In [20]:
from util.DataMining import split_dataframe, split_dataframe_flex
from util.DataMining import (
    RandomForestMining,
    RandomForestMiningByModel,
    GradientBoostingMining,
    LinearRegressionMining,
    LinearRegressionMiningByModel,
    CombinedDataMiningRandomForestAndLinearRegression
)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [21]:
# run_times, rmse_sum = 5, 0
# for i in tqdm(range(run_times), desc='Running Random Forest'):
#     target_col = 'price'
#     x_train, x_test, y_train, y_test = split_dataframe(df, target_col)
#     rmse_sum += RandomForestMining(x_train, x_test, y_train, y_test)
# print('Average RMSE:', round(rmse_sum / run_times))

In [26]:
run_times, rmse_sum = 1, 0
for i in tqdm(range(run_times), desc='Running Random Forest'):
    train_drop_cols = ['price']
    test_cols = ['price', 'model']
    x_train, x_test, y_train, y_test = split_dataframe_flex(df_aug, train_drop_cols, test_cols)
    rmse_sum += RandomForestMiningByModel(x_train, x_test, y_train, y_test)
print('Average RMSE:', round(rmse_sum / run_times))

  y_pred = pd.concat([y_pred, temp_df])
Running Random Forest: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:24<00:00, 84.40s/it]

Data saved to results.csv
Running not in develop mode
RMSE on test data: 12594.510893969247
Average RMSE: 12595





In [23]:
# x_train, y_train = df.drop(columns=['price']), df['price']
# x_test = df_test[x_train.columns]

# res = RandomForestMining(x_train, x_test, y_train, dev=True)
# res.to_csv('./data/res.csv', index=False)

### This cell do prediction model by model

In [24]:
x_train, y_train = df.drop(columns=['price']), df[['price', 'model']]
x_test = df_test[x_train.columns].dropna(subset=['model'])

res_model = RandomForestMiningByModel(x_train, x_test, y_train, dev=True)
print(res_model.head())

  y_pred = pd.concat([y_pred, temp_df])


0     19659.175
1     34017.420
2    143951.340
3     72370.765
4     27167.950
Name: Predicted, dtype: float64


### This cell do prediction on test data with 'model' attribute missing

In [25]:
x_train, y_train = df.drop(columns=['price', 'model']), df[['price']]
x_test = df_test_unmapped[x_train.columns]

res_nomodel = RandomForestMining(x_train, x_test, y_train, dev=True)
print(res_nomodel.head())

ValueError: Found input variables with inconsistent numbers of samples: [24258, 40290]

In [None]:
print(len(res_model))
print(len(res_nomodel))
res = pd.concat([res_model, res_nomodel])
res.to_csv('./data/res_by_model_original.csv')
res.reset_index(inplace=True)
res.rename(columns={'index': 'Id'}, inplace=True)
res_sorted = res.sort_values(by='Id')
res_sorted.to_csv('./data/res_by_model.csv', index=False)