# Task 3

## Setting up the Notebook

In [1]:
import json
from collections import Counter

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestRegressor

import utils

## Load the Data
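The data can come from the training dataset or from the website. Here both `df_origin` (copied for training) and `df_predict` (copied for the output tables) are read from the training CSV.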

In [2]:
# The same CSV is read twice: `df_origin` is later copied for preprocessing and
# training, `df_predict` is later copied to build the output tables.
train_csv = '../data/train.csv'
df_origin = pd.read_csv(train_csv)
df_predict = pd.read_csv(train_csv)
df_origin.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price
0,1030324,BMW 3 Series 320i Gran Turismo M-Sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,09-dec-2013,luxury sedan,"parf car, premium ad car, low mileage car",...,73000.0,45330.0,50462.0,,,uncategorized,"5 doors gt, powerful and fuel efficient 2.0l t...","bmw i-drive, navigation, bluetooth/aux/usb inp...",,71300.0
1,1021510,Toyota Hiace 3.0M,,hiace,high loan available! low mileage unit. wear an...,2014.0,,26-jan-2015,van,premium ad car,...,110112.0,27502.0,1376.0,,25-jan-2035,uncategorized,low mileage unit. well maintained vehicle. vie...,factory radio setting. front recording camera....,,43800.0
2,1026909,Mercedes-Benz CLA-Class CLA180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,,25-jul-2016,luxury sedan,"parf car, premium ad car",...,80000.0,27886.0,26041.0,,,uncategorized,responsive and fuel efficient 1.6l inline 4 cy...,dual electric/memory seats. factory fitted aud...,,95500.0
3,1019371,Mercedes-Benz E-Class E180 Avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,,17-nov-2020,luxury sedan,"parf car, almost new car, consignment car",...,9800.0,46412.0,56977.0,,,uncategorized,"1.5l inline-4 twin scroll turbocharged engine,...",64 colour ambient lighting. active parking ass...,,197900.0
4,1031014,Honda Civic 1.6A VTi,,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,,20-sep-2019,mid-sized sedan,parf car,...,40000.0,20072.0,20101.0,,,uncategorized,"1.6l i-vtec engine, 123 bhp, earth dreams cvt ...","s/rims, premium leather seats, factory touch s...",,103200.0


## Data Preprocessing

In [3]:
def data_preprocess(df:pd.DataFrame) -> pd.DataFrame:
    """Prepare a raw listings frame for the price model.

    The frame is modified in place and also returned, so both
    ``data_preprocess(df)`` and ``df = data_preprocess(df)`` work.

    What happens, in order:
      1. A missing ``make`` is replaced by the lower-cased first word of
         ``title``; rows without a usable title keep a missing ``make``.
      2. ``make``, ``type_of_vehicle``, ``category`` and ``transmission`` are
         passed through ``utils.ordinal_encoder``.
      3. The numeric columns are passed through ``utils.fill_with_mean``;
         ``depreciation`` and ``coe`` first go through ``utils.del_outlier``,
         ``road_tax`` and ``mileage`` afterwards go through
         ``utils.data_discretization`` with ``num=15``.
      4. Columns that are not fed to the model are dropped, including the
         ``Unnamed: 0`` index column when the CSV carries one.

    Args:
        df: listings frame with the columns of ``train.csv``.

    Returns:
        The same ``df`` object, after preprocessing.

    Raises:
        KeyError: if one of the expected columns is missing.
    """
    # Vectorized fallback: NaN/empty titles simply yield NaN instead of raising.
    make_from_title = df['title'].str.split().str[0].str.lower()
    df['make'] = df['make'].fillna(make_from_title)

    for col in ('make', 'type_of_vehicle', 'category', 'transmission'):
        df[col] = utils.ordinal_encoder(df[col])

    # NOTE(review): fill_with_mean is called for its side effect only and its
    # return value (if any) is discarded. This presumably relies on the helper
    # filling the passed Series in place and on that Series sharing data with
    # `df` -- confirm against utils, especially under pandas Copy-on-Write.
    for col in ('curb_weight', 'power', 'engine_cap', 'no_of_owners'):
        utils.fill_with_mean(df[col])

    df["depreciation"] = utils.del_outlier(df["depreciation"], lower_val=0.0, upper_val=0.99)
    utils.fill_with_mean(df['depreciation'])

    df["coe"] = utils.del_outlier(df["coe"], lower_val=0.01, upper_val=1.0)
    utils.fill_with_mean(df["coe"])

    utils.fill_with_mean(df["road_tax"])
    df["road_tax"] = utils.data_discretization(df["road_tax"], num=15)

    utils.fill_with_mean(df["dereg_value"])
    utils.fill_with_mean(df["mileage"])
    df["mileage"] = utils.data_discretization(df["mileage"], num=15)

    utils.fill_with_mean(df["omv"])
    utils.fill_with_mean(df["arf"])

    # Columns that are not used as model inputs.
    unused_columns = [
        'listing_id', 'title', 'model', 'description', 'manufactured',
        'original_reg_date', 'reg_date', 'fuel_type', 'opc_scheme', 'lifespan',
        'eco_category', 'features', 'accessories', 'indicative_price',
    ]
    df.drop(columns=unused_columns, inplace=True)

    # The CSV stores its old row index as an 'Unnamed: 0' column; left in place
    # it would be handed to the model as a feature. Tolerate its absence.
    df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

    return df

## Model 

In [4]:
def train_model(model, df:pd.DataFrame):
    """Fit ``model`` to predict ``price`` from every other column of ``df``.

    The data is handed to the estimator as float64 NumPy arrays. Going through
    ``torch.Tensor`` would add nothing for a scikit-learn style estimator and
    would round the target to float32.

    Args:
        model: estimator exposing ``fit(X, y)``; it is fitted in place.
        df: all-numeric frame that contains a ``price`` column.

    Returns:
        None.
    """
    features = df.drop(columns='price').to_numpy(dtype=float)
    target = df['price'].to_numpy(dtype=float)
    model.fit(features, target)

def predict(model, df:pd.DataFrame):
    """Return predicted minus actual price for every row of ``df``.

    A positive value means the model predicts a higher price than the one
    recorded in the ``price`` column.

    Args:
        model: fitted estimator exposing ``predict(X)``.
        df: all-numeric frame that contains a ``price`` column; the remaining
            columns are used as features.

    Returns:
        Array of ``model.predict(features) - price``, one entry per row.
    """
    # Plain NumPy input: no torch round-trip is needed for the estimator.
    features = df.drop(columns='price').to_numpy(dtype=float)
    real_prices = df['price'].to_numpy()
    predict_prices = model.predict(features)
    return predict_prices - real_prices

## Training

In [5]:
# Fixed seed so the fitted forest, and therefore `result`, is the same on every run.
model = RandomForestRegressor(random_state=42)
# Work on a copy so `df_origin` keeps the raw columns.
df = data_preprocess(df_origin.copy())
train_model(model, df)
# NOTE(review): predictions are made on the same rows the model was fitted on,
# so `result` holds in-sample differences (predicted - actual price).
result = predict(model, df)


## Result

In [7]:
# Attach the per-listing difference (predicted - actual price) to the raw
# listings and rank them, largest difference first.
df_output = df_predict.copy()
df_output['bargain'] = result
df_output = df_output.sort_values(by='bargain', ascending=False)

# A missing make falls back to the lower-cased first word of the title.
# Vectorized, so a missing or empty title leaves NaN instead of raising.
df_output['make'] = df_output['make'].fillna(
    df_output['title'].str.split().str[0].str.lower()
)

# Drop listings for which at least 6 of these 11 columns are missing.
numeric_columns = ['curb_weight', 'power', 'engine_cap', 'no_of_owners',
                   'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf']
too_sparse = df_output[numeric_columns].isna().sum(axis=1) >= 6
df_output = df_output.loc[~too_sparse]

df_output.to_csv('output.csv')


In [10]:
# Share (in percent) of each make, and of each "make model" pair, among the
# listings whose `bargain` value is at least 5000.
df_temp = df_output[df_output['bargain'] >= 5000]
total = len(df_temp)

# Counter avoids a loop variable called `model`, which used to overwrite the
# fitted regressor of the same name. Rows with a missing make or model are
# left out of the counts instead of raising on `str + NaN`.
make_counts = Counter(df_temp['make'].dropna())
make_model_counts = Counter((df_temp['make'] + ' ' + df_temp['model']).dropna())

# most_common() orders by count, descending; ties keep first-seen order.
make_dict = {name: count / total * 100 for name, count in make_counts.most_common()}
make_model_dict = {name: count / total * 100 for name, count in make_model_counts.most_common()}

with open('percent.json', 'w', encoding='utf-8') as f:
    json.dump(make_dict, f, indent=2)
with open('percent-model.json', 'w', encoding='utf-8') as f:
    json.dump(make_model_dict, f, indent=2)
    

In [12]:
# Same ranking as before, but with the difference (predicted - actual price)
# expressed as a percentage of the actual price.
df_output = df_predict.copy()
df_output['bargain'] = result
df_output['bargain'] = df_output['bargain'] / df_output['price'] * 100
df_output = df_output.sort_values(by='bargain', ascending=False)

# A missing make falls back to the lower-cased first word of the title.
# Vectorized, so a missing or empty title leaves NaN instead of raising.
df_output['make'] = df_output['make'].fillna(
    df_output['title'].str.split().str[0].str.lower()
)

# Drop listings for which at least 6 of these 11 columns are missing.
numeric_columns = ['curb_weight', 'power', 'engine_cap', 'no_of_owners',
                   'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf']
too_sparse = df_output[numeric_columns].isna().sum(axis=1) >= 6
df_output = df_output.loc[~too_sparse]

df_output.to_csv('output2.csv')


In [14]:
# Share (in percent) of each make, and of each "make model" pair, among the
# listings whose relative `bargain` value is at least 20 (percent of price).
df_temp = df_output[df_output['bargain'] >= 20]
total = len(df_temp)

# Counter avoids a loop variable called `model`, which used to overwrite the
# fitted regressor of the same name. Rows with a missing make or model are
# left out of the counts instead of raising on `str + NaN`.
make_counts = Counter(df_temp['make'].dropna())
make_model_counts = Counter((df_temp['make'] + ' ' + df_temp['model']).dropna())

# most_common() orders by count, descending; ties keep first-seen order.
make_dict = {name: count / total * 100 for name, count in make_counts.most_common()}
make_model_dict = {name: count / total * 100 for name, count in make_model_counts.most_common()}

with open('percent2.json', 'w', encoding='utf-8') as f:
    json.dump(make_dict, f, indent=2)
with open('percent-model2.json', 'w', encoding='utf-8') as f:
    json.dump(make_model_dict, f, indent=2)
