In [None]:
pip install category_encoders

In [None]:
!pip install scikit-optimize


# Step 1 - Explore and prep data

In [None]:
import os

import pandas as pd

# Location of the raw car-prices CSV.
# Set the CAR_PRICES_CSV environment variable to point at your own copy of the file.
# When it is unset, the original local path is used so existing setups keep working.
DEFAULT_CAR_PRICES_CSV = "C:\\Users\\rmct2\\OneDrive - Sri Lanka Institute of Information Technology\\Desktop\\SLIIT\\Y3S1\\FDM\\car_prices.csv"
car_prices_csv = os.environ.get("CAR_PRICES_CSV", DEFAULT_CAR_PRICES_CSV)

df = pd.read_csv(car_prices_csv)

In [None]:
# Columns the author judged to be less related to the target; they are removed below.
cols_to_drop = ['transmission', 'vin', 'state', 'sellingprice', 'saledate']

# Original column name -> more descriptive name for the columns that are kept.
readable_names = {
    'year': 'manufacture_year',
    'make': 'brand',
    'model': 'specific_model',
    'trim': 'additional_designation',
    'interior': 'interior_color',
    'mmr': 'estimated_value',
}

# Drop first, then rename, and rebind `df` to the result.
df = df.drop(columns=cols_to_drop).rename(columns=readable_names)



In [None]:
# Preview the first rows to confirm the remaining columns and their new names.
df.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# A row is kept only if it has a value in every one of these columns.
required_columns = [
    'brand',
    'specific_model',
    'additional_designation',
    'body',
    'condition',
    'odometer',
    'color',
    'interior_color',
    'estimated_value',
]
df = df.dropna(subset=required_columns)

In [None]:
# Separate the target from the features, then hold out 20% of the rows for testing.
# The fixed random_state keeps the split repeatable between runs.
from sklearn.model_selection import train_test_split

y = df['estimated_value']
x = df.drop(columns='estimated_value')

X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=8,
)

# Step 2 - Build a training pipeline

In [None]:
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor

# Two-stage pipeline: target-encode the categorical features, then fit a seeded
# XGBoost regressor on the continuous target.
encoder_step = ('encoder', TargetEncoder())
regressor_step = ('reg', XGBRegressor(random_state=8))
estimators = [encoder_step, regressor_step]

pipe = Pipeline(steps=estimators)

# Leave the pipeline as the last expression so the notebook renders it.
pipe


# Step 3 - Set up hyperparameter tuning

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Hyperparameter ranges for the XGBoost step of the pipeline.
# The 'reg__' prefix routes each parameter to the pipeline step named 'reg'.
# Explicit Integer/Real dimensions state the type and prior of each range outright.
search_space = {
    'reg__n_estimators': Integer(100, 1000),
    'reg__max_depth': Integer(3, 10),
    'reg__learning_rate': Real(0.01, 0.3, prior='uniform'),
}

# Bayesian search over the whole pipeline, scored by negative mean squared error.
# random_state is fixed to the same seed as the split and the regressor, so the
# sequence of sampled hyperparameters is reproducible from run to run.
opt = BayesSearchCV(
    pipe,
    search_space,
    n_iter=50,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=8,
)

# Step 4 - Train the XGBoost Model

In [None]:
# Run the hyperparameter search on the training split only; the test split is untouched.
opt.fit(X_train, Y_train)

# Step 5 - Evaluate the model and make predictions

In [None]:
# Display the best pipeline found by the search.
opt.best_estimator_

In [None]:
# Cross-validation score of the best hyperparameter setting. With the
# 'neg_mean_squared_error' scoring used by `opt`, values closer to 0 are better.
opt.best_score_

In [None]:
# Score the best pipeline on the held-out test split, using the search's scoring metric.
opt.score(X_test, Y_test)

In [None]:
# Display the predicted target values for the test split.
opt.predict(X_test)

In [None]:
# Predict the continuous target for the test split and keep the result in
# `predictions` so it can be compared against Y_test.
predictions = opt.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# Score the test-split predictions with the coefficient of determination (R-squared).
predictions = opt.predict(X_test)
r2 = r2_score(y_true=Y_test, y_pred=predictions)

print(f'R-squared: {r2}')


In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true test targets and the stored `predictions`.
mae = mean_absolute_error(y_true=Y_test, y_pred=predictions)

print(f'Mean Absolute Error: {mae}')


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Predict once on the held-out split, compute both metrics, then report them together.
predictions = opt.predict(X_test)

r2 = r2_score(y_true=Y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=Y_test, y_pred=predictions)

print(f'R-squared on Test Data: {r2}')
print(f'Mean Absolute Error on Test Data: {mae}')


# Step 6 - Measure feature importance

In [None]:
# List the (name, estimator) pairs that make up the best pipeline.
opt.best_estimator_.steps

In [None]:
from xgboost import plot_importance

# The regressor is the second entry in the pipeline's (name, estimator) step list.
# Unpack that pair to reach the fitted XGBoost model, then plot its feature importances.
xgboost_step = opt.best_estimator_.steps[1]
step_name, xgboost_model = xgboost_step
plot_importance(xgboost_model)