In [1]:
import pandas as pd
import numpy as np

In [2]:
def to_file_text(path, lst):
    with open(path, mode='w', encoding='utf-8') as f:
        for val in lst:
            f.write(f"{val}\n")

def check_unique(df: pd.DataFrame, cols: list, display=False):
    infor = dict()
    for col in cols:
        unique = df[col].unique()
        infor[col] = f"{len(unique)} unique values"
        if display:
            print(f"{col}:      {len(unique)} unique values: {unique}")
    print(pd.Series(infor))

def check_missing(df: pd.DataFrame, cols: list):
    infor = dict()
    for col in cols:
        missing = df[col].isnull().sum()
        infor[col] = f"{missing} missing values"
    print(pd.Series(infor))

In [3]:
df = pd.read_csv("data\Clean Data_pakwheels.csv", index_col=0)
df = df.drop(columns='Location')
df.head(2)

Unnamed: 0,Company Name,Model Name,Price,Model Year,Mileage,Engine Type,Engine Capacity,Color,Assembly,Body Type,Transmission Type,Registration Status
0,Toyota,Vitz,2385000,2017,9869,Petrol,1000,Silver,Imported,Hatchback,Automatic,Un-Registered
1,Toyota,Corolla,111000,2019,11111,Petrol,1300,White,Local,Sedan,Automatic,Registered


In [4]:
num_dup = df.duplicated().sum()
duplicated_row = df[df.duplicated()]
print(f"Số dòng trong dữ liệu {len(df)}")
print(f"Số dòng trùng lặp: {num_dup}")
duplicated_row

Số dòng trong dữ liệu 46022
Số dòng trùng lặp: 1107


Unnamed: 0,Company Name,Model Name,Price,Model Year,Mileage,Engine Type,Engine Capacity,Color,Assembly,Body Type,Transmission Type,Registration Status
600,Daihatsu,Mira,1345000,2014,77000,Petrol,660,Beige,Imported,Hatchback,Automatic,Registered
1319,Suzuki,Swift,1450000,2013,81939,Petrol,1300,Silver,Local,Hatchback,Automatic,Registered
1664,Honda,City,1250000,2007,150000,Petrol,1300,Black,Local,Sedan,Manual,Registered
1684,Honda,City,2065000,2017,52000,Petrol,1300,Black,Local,Sedan,Manual,Registered
1813,Suzuki,Wagon,1300000,2011,150000,Petrol,660,Black,Imported,Hatchback,Automatic,Registered
...,...,...,...,...,...,...,...,...,...,...,...,...
45847,Suzuki,Cultus,540000,2006,126000,Petrol,1000,Black,Local,Hatchback,Manual,Registered
45890,Suzuki,Wagon,1550000,2018,38000,Petrol,1000,White,Local,Hatchback,Manual,Registered
45920,Suzuki,Mehran,1000000,2018,14000,Petrol,800,White,Local,Hatchback,Manual,Registered
45994,Toyota,Corolla,1680000,2012,28000,Petrol,1300,White,Local,Sedan,Manual,Registered


In [5]:
df = df.drop_duplicates()
len(df)

44915

In [6]:
check_missing(df, df.columns)

Company Name           0 missing values
Model Name             0 missing values
Price                  0 missing values
Model Year             0 missing values
Mileage                0 missing values
Engine Type            0 missing values
Engine Capacity        0 missing values
Color                  0 missing values
Assembly               0 missing values
Body Type              0 missing values
Transmission Type      0 missing values
Registration Status    0 missing values
dtype: object


In [7]:
check_unique(df, df.select_dtypes(include='object').columns)

Company Name            31 unique values
Model Name             196 unique values
Engine Type              3 unique values
Color                   24 unique values
Assembly                 2 unique values
Body Type                6 unique values
Transmission Type        2 unique values
Registration Status      2 unique values
dtype: object


In [8]:
df.head()

Unnamed: 0,Company Name,Model Name,Price,Model Year,Mileage,Engine Type,Engine Capacity,Color,Assembly,Body Type,Transmission Type,Registration Status
0,Toyota,Vitz,2385000,2017,9869,Petrol,1000,Silver,Imported,Hatchback,Automatic,Un-Registered
1,Toyota,Corolla,111000,2019,11111,Petrol,1300,White,Local,Sedan,Automatic,Registered
2,Suzuki,Alto,1530000,2019,17500,Petrol,660,White,Local,Hatchback,Automatic,Un-Registered
3,Suzuki,Alto,1650000,2019,9600,Petrol,660,White,Local,Hatchback,Manual,Registered
4,Toyota,Corolla,1435000,2010,120000,Petrol,1300,Black,Local,Sedan,Manual,Registered


In [9]:
df['Price'] = np.log10(df['Price'])

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

In [11]:
def my_evaluate(y_true, y_pred):
    eval = dict()
    eval['mae'] = mean_absolute_error(y_true, y_pred)
    eval['mse'] = mean_squared_error(y_true, y_pred)
    eval['rmse'] = root_mean_squared_error(y_true, y_pred)
    eval['r2'] = r2_score(y_true, y_pred)
    return eval

In [12]:
df_encoded = df.copy()

In [13]:
symbol_cols = list(df.select_dtypes(include='object').columns)
number_cols = list(df.select_dtypes(include='number').columns)
print(symbol_cols)
print(number_cols)

['Company Name', 'Model Name', 'Engine Type', 'Color', 'Assembly', 'Body Type', 'Transmission Type', 'Registration Status']
['Price', 'Model Year', 'Mileage', 'Engine Capacity']


In [14]:
check_unique(df, symbol_cols)

Company Name            31 unique values
Model Name             196 unique values
Engine Type              3 unique values
Color                   24 unique values
Assembly                 2 unique values
Body Type                6 unique values
Transmission Type        2 unique values
Registration Status      2 unique values
dtype: object


## Tiền xử lý train test riêng

In [15]:
# X = df_encoded.drop(columns='Price')
# y = df_encoded['Price']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [None]:
# preprocessor = ColumnTransformer([
#     ("oe", OrdinalEncoder(), ['Assembly', 'Transmission Type', 'Registration Status']),
#     ("ohe", OneHotEncoder(sparse_output=False),['Engine Type', 'Body Type']),
#     ("te", ce.TargetEncoder(smoothing=5), ['Company Name', 'Model Name', 'Color'])
# ], remainder='passthrough', verbose_feature_names_out=False)

# pipe = Pipeline([
#     ("pre", preprocessor),
#     ("model", RandomForestRegressor(n_estimators=200, max_depth=12, random_state=40, n_jobs=-1))
# ])

In [17]:
# pipe.fit(X_train, y_train)
# y_pred = pipe.predict(X_test)
# my_evaluate(y_test, y_pred)

## Tiền xử lý train test chung

In [22]:
df_encoded[['Assembly', 'Transmission Type', 'Registration Status']].head(10)

Unnamed: 0,Assembly,Transmission Type,Registration Status
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,1.0,0.0,1.0
3,1.0,1.0,0.0
4,1.0,1.0,0.0
5,1.0,0.0,0.0
6,1.0,1.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,1.0,0.0,0.0


In [18]:
oe = OrdinalEncoder()
ohe = OneHotEncoder(sparse_output=False)
te = ce.TargetEncoder()

df_encoded[['Assembly', 'Transmission Type', 'Registration Status']] = oe.fit_transform(df_encoded[['Assembly', 'Transmission Type', 'Registration Status']])
df_encoded[['Assembly', 'Transmission Type', 'Registration Status']].head()

Unnamed: 0,Assembly,Transmission Type,Registration Status
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,1.0,0.0,1.0
3,1.0,1.0,0.0
4,1.0,1.0,0.0


In [None]:
ohe_cols = ohe.fit_transform(df_encoded[['Engine Type', 'Body Type']])
ohe_df = pd.DataFrame(ohe_cols, columns=ohe.get_feature_names_out(), index=df_encoded.index)
df_encoded = pd.concat([df_encoded, ohe_df], axis=1)
df_encoded = df_encoded.drop(columns=['Engine Type', 'Body Type'])
df_encoded[ohe.get_feature_names_out()].head()

Unnamed: 0,Engine Type_Diesel,Engine Type_Hybrid,Engine Type_Petrol,Body Type_Cross Over,Body Type_Hatchback,Body Type_Mini Van,Body Type_SUV,Body Type_Sedan,Body Type_Van
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
46018,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
46019,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
46020,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
46021,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [28]:
te = ce.TargetEncoder(cols=['Company Name', 'Model Name', 'Color'], smoothing=5)
X = df_encoded.drop(columns='Price')
y = df_encoded['Price']
df_encoded = te.fit_transform(X, y)
df_encoded['Price'] = y
df_encoded.head(3)

Unnamed: 0,Company Name,Model Name,Model Year,Mileage,Engine Capacity,Color,Assembly,Transmission Type,Registration Status,Engine Type_Diesel,Engine Type_Hybrid,Engine Type_Petrol,Body Type_Cross Over,Body Type_Hatchback,Body Type_Mini Van,Body Type_SUV,Body Type_Sedan,Body Type_Van,Price
0,6.350955,6.227349,2017,9869,1000,6.155257,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,6.377488
1,6.350955,6.267934,2019,11111,1300,6.170158,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.045323
2,5.90415,5.935263,2019,17500,660,6.170158,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,6.184691


In [None]:
# df_encoded.to_csv("data_latest_final/data_encoded.csv")

In [None]:
X = df_encoded.drop(columns='Price')
y = df_encoded['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [None]:
model = RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
my_evaluate(y_test, y_pred)

{'mae': 0.03910873786211721,
 'mse': 0.003589614377031101,
 'rmse': 0.05991339063207073,
 'r2': 0.9658789745831396}

## Export mô hình

In [None]:
import joblib
joblib.dump(model, "rf_used_car.joblib")