In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
df = pd.read_csv("Dataset/Final/Remove null, and fill null.csv")
df = df.drop(['url', 'price', 'ad_id', 'condition'], axis=1)
df.columns

Index(['origin', 'car_model', 'mileage', 'exterior_color', 'interior_color',
       'num_of_doors', 'seating_capacity', 'engine', 'engine_capacity',
       'transmission', 'drive_type', 'fuel_consumption', 'brand', 'grade',
       'year_of_manufacture', 'car_name', 'price_in_billion'],
      dtype='object')

In [3]:
categorical_columns = ['origin', 'car_model', 'exterior_color', 'interior_color', 'engine', 'transmission', 'drive_type', 'brand', 'grade','car_name']
target_column = 'price_in_billion'

In [4]:
# Chia ma trận đặc trưng và vector nhãn
X = df.drop(columns=[target_column])
y = df[target_column]


X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.4, random_state=42)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size=0.5, random_state=42)

# Áp dụng One-Hot Encoding cho tập huấn luyện và tập kiểm tra với handle_unknown='ignore'
encoder = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_columns])

X_train_encoded = encoder.transform(X_train[categorical_columns])
X_cv_encoded = encoder.transform(X_cv[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

# Kết hợp lại với các cột không phải categorical
scaler = StandardScaler()
X_train_encoded = np.hstack((X_train.drop(columns=categorical_columns).values, X_train_encoded))
X_train_encoded = scaler.fit_transform(X_train_encoded)

X_cv_encoded = np.hstack((X_cv.drop(columns = categorical_columns).values, X_cv_encoded))
X_cv_encoded = scaler.transform(X_cv_encoded)

X_test_encoded = np.hstack((X_test.drop(columns=categorical_columns).values, X_test_encoded))
X_test_encoded = scaler.transform(X_test_encoded)





In [5]:
# Bước 3: Huấn luyện mô hình KNN Regressor
knn = KNeighborsRegressor(n_neighbors=3,  weights='distance', metric='euclidean')
knn.fit(X_train_encoded, y_train)

# Đo R2 score ban đầu trên tập kiểm tra
baseline_mse = mean_squared_error(y_cv, knn.predict(X_cv_encoded))
print(f'Baseline MSE: {baseline_mse}')

# Bước 4: Đánh giá độ quan trọng của từng đặc trưng gốc
def calculate_importance(X_train, X_cv, y_train, y_cv, column):
    
    # Loại bỏ cột từ tất cả các tập dữ liệu
    X_train_without_column = X_train.drop(columns=column)
    X_cv_without_column = X_cv.drop(columns=column)
    if column in categorical_columns:
        updated_categorical_columns = [col for col in categorical_columns if col != column]
    else:
        updated_categorical_columns = categorical_columns.copy()
	# if column in categorical_columns:
	# 	updated_categorical_columns = [col for col in categorical_column if col != column]
	# else:
	# 	updated_categorical_columns = categorical_columns.copy()

        

    # Áp dụng One-Hot Encoding cho tập huấn luyện, validation và tập kiểm tra mà không bao gồm cột này
    encoder_without_column = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
    encoder_without_column.fit(X_train_without_column[updated_categorical_columns])

    X_train_encoded = encoder_without_column.transform(X_train_without_column[updated_categorical_columns])
    X_cv_encoded = encoder_without_column.transform(X_cv_without_column[updated_categorical_columns])
    

    # Kết hợp lại với các cột không phải categorical
    X_train_encoded = np.hstack((X_train_without_column.drop(columns=updated_categorical_columns).values, X_train_encoded))
    X_train_encoded = scaler.fit_transform(X_train_encoded)
    
    X_cv_encoded = np.hstack((X_cv_without_column.drop(columns=updated_categorical_columns).values, X_cv_encoded))
    X_cv_encoded = scaler.transform(X_cv_encoded)
    
    # Huấn luyện và dự đoán
    knn.fit(X_train_encoded, y_train)
    validation_mse = mean_squared_error(y_cv, knn.predict(X_cv_encoded))

    # Tính độ quan trọng
    return validation_mse


# Tính độ quan trọng cho từng cột gốc
feature_importance = []
for column in X.columns:
    importance = calculate_importance(X_train, X_cv,y_train, y_cv, column) - baseline_mse
    if importance is not None:
        feature_importance.append((column, importance))

# Hiển thị độ quan trọng của từng đặc trưng
feature_importance_df = pd.DataFrame(feature_importance, columns=['Feature', 'Validation MSE'])
print(feature_importance_df.sort_values(by='Validation MSE', ascending=False))


Baseline MSE: 0.03219406323065822




                Feature  Validation MSE
13                grade        0.014088
10           drive_type        0.006067
1             car_model        0.005301
7                engine        0.003045
11     fuel_consumption        0.001826
14  year_of_manufacture        0.001812
2               mileage        0.001753
8       engine_capacity        0.000210
5          num_of_doors        0.000025
9          transmission        0.000006
4        interior_color       -0.000108
0                origin       -0.000310
6      seating_capacity       -0.000363
3        exterior_color       -0.001061
12                brand       -0.002012
15             car_name       -0.016198


In [6]:
# Filter features based on importance score
selected_features = feature_importance_df[feature_importance_df['Validation MSE'] > 0]['Feature'].tolist()
print(selected_features)
new_categorical_columns = [x for x in categorical_columns if x in selected_features]

# Train a new KNeighborsRegressor model using only the selected features
X_train_selected = X_train[selected_features]
X_cv_selected = X_cv[selected_features]
X_test_selected = X_test[selected_features]

# Apply OneHotEncoding to selected categorical columns
encoder_selected = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
encoder_selected.fit(X_train_selected[new_categorical_columns])

X_train_encoded_selected = encoder_selected.transform(X_train_selected[new_categorical_columns])
X_cv_encoded_selected = encoder_selected.transform(X_cv_selected[new_categorical_columns])
X_test_encoded_selected = encoder_selected.transform(X_test_selected[new_categorical_columns])

# Combine with non-categorical columns and scale the features
X_train_encoded_selected = np.hstack((X_train_selected.drop(columns=new_categorical_columns).values, X_train_encoded_selected))
X_train_encoded_selected = scaler.fit_transform(X_train_encoded_selected)

X_cv_encoded_selected = np.hstack((X_cv_selected.drop(columns=new_categorical_columns).values, X_cv_encoded_selected))
X_cv_encoded_selected = scaler.transform(X_cv_encoded_selected)

X_test_encoded_selected = np.hstack((X_test_selected.drop(columns=new_categorical_columns).values, X_test_encoded_selected))
X_test_encoded_selected = scaler.transform(X_test_encoded_selected)

['car_model', 'mileage', 'num_of_doors', 'engine', 'engine_capacity', 'transmission', 'drive_type', 'fuel_consumption', 'grade', 'year_of_manufacture']




In [7]:
# Merge X_train_encoded_selected with X_cv_encoded_selected
X_merged_encoded_selected = np.vstack((X_train_encoded_selected, X_cv_encoded_selected))

# Create labels to identify the origin of each row
y_merged = np.hstack((y_train, y_cv))

In [8]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsRegressor()
search_space = {
	'n_neighbors': [1, 3, 5, 10, 20],
    'metric':['euclidean', 'manhattan', 'minkowski', 'chebyshev'],
    'weights': ['uniform', 'distance']
}
best_model = GridSearchCV(estimator=knn, param_grid=search_space, 
                          scoring = 'neg_mean_squared_error', cv=5, n_jobs = -1)
best_model.fit(X_merged_encoded_selected, y_merged)
print(best_model.best_params_)
y_pred = best_model.predict(X_test_encoded_selected)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Mean Squared Error: 0.009516722486138515


In [11]:
from sklearn.metrics import r2_score
print('*' * 50)
print('TRAIN SET:')
print('Root mean squared root: ', np.sqrt(mean_squared_error(best_model.predict(X_merged_encoded_selected), y_merged)))
print('R2 Score: ', r2_score(best_model.predict(X_merged_encoded_selected), y_merged))
print('*' * 50)
print('TEST SET:')
print('Root mean squared root: ', np.sqrt(mean_squared_error(best_model.predict(X_test_encoded_selected), y_test)))
print('R2 Score: ', r2_score(best_model.predict(X_test_encoded_selected), y_test))

**************************************************
TRAIN SET:
Root mean squared root:  0.037554031867154614
R2 Score:  0.9896421389245187
**************************************************
TEST SET:
Root mean squared root:  0.09755369027432287
R2 Score:  0.9297841818446584
