In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
import joblib

In [27]:
df = pd.read_csv("Dataset/Final/Remove-noise-not-outlier-and-fill-null.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Dataset/Final/Remove-noise-not-outlier-and-fill-null.csv'

In [5]:
# Save'price_in_billion' individually
price_column = df['price_in_billion']
df = df.drop(['price_in_billion','ad_id','brand','url','grade','condition','price'], axis=1)

# Define columns that need encoded and scaled
columns_for_encoding = ['origin','car_model','exterior_color','interior_color','engine','transmission','drive_type','car_name']
columns_for_scaling = ['num_of_doors','seating_capacity','engine_capacity','fuel_consumption','mileage','year_of_manufacture']

# Apply StandardScaler to numerical features
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[columns_for_scaling])

# Apply OneHotEncoder to categorical features
encoder = OneHotEncoder(sparse_output=False)
df_encoded = encoder.fit_transform(df[columns_for_encoding])


# Apply Truncated SVD to the one-hot encoded categorical features
n_components_svd = min(df_encoded.shape[1] - 1, 500)  # Adjust the number of components as necessary
svd = TruncatedSVD(n_components=n_components_svd, random_state=42)
df_svd = svd.fit_transform(df_encoded)

# Combine scaled numerical features and SVD-transformed categorical features
df_combined = np.hstack((df_scaled, df_svd))

X_train, X_test, y_train, y_test = train_test_split(df_combined, price_column, test_size=0.3, random_state=42)



In [6]:
#Define pipeline
pipeline = Pipeline(steps=[
    ('svr', SVR())
])


In [7]:
#Define hyperparameters of pipeline that need tuning and the range values of them
param_grid = {'svr__C': [10,20,50,100,200,300,400,500],  
              'svr__gamma': [0.001,0.01,0.1,1.0], 
              'svr__kernel': ['poly','rbf'],
              'svr__epsilon':[0.01,0.1,0.25,1.0,4.0,16.0,64.0,256.0],
             }
  

In [8]:
# Define the halving grid search with cross-validation
halving_grid_search = HalvingGridSearchCV(pipeline, param_grid, scoring="r2", cv=5, verbose=6, factor=3,error_score='raise')
halving_grid_search.fit(X_train, y_train)


n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 88
max_resources_: 21456
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 512
n_resources: 88
Fitting 5 folds for each of 512 candidates, totalling 2560 fits
[CV 1/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.116, test=-0.049) total time=   0.0s
[CV 2/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.097, test=-0.076) total time=   0.0s
[CV 3/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.063, test=-0.179) total time=   0.0s
[CV 4/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.101, test=-0.062) total time=   0.0s
[CV 5/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kernel=poly;, score=(train=-0.078, test=-0.292) total time=   0.0s
[CV 1/5] END svr__C=10, svr__epsilon=0.01, svr__gamma=0.001, svr__kerne

In [9]:
best_params = halving_grid_search.best_params_
best_model = halving_grid_search.best_estimator_
best_score = halving_grid_search.best_score_

In [10]:
# Save the best model
joblib_file = "best_model.joblib"
joblib.dump(best_model, joblib_file)
print(f"Model saved to {joblib_file}")

# Load the model from the file
loaded_model = joblib.load(joblib_file)

Model saved to best_model.joblib


In [11]:
y_pred = loaded_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Best parameters: {best_params}")
print(f"Best scores: {best_score}")
print(f"Test MSE: {mse}")
print(f"Model score: {best_model.score(X_test, y_test)}")


Best parameters: {'svr__C': 20, 'svr__epsilon': 0.01, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
Best scores: 0.8639279857761399
Test MSE: 0.6157878085505033
Model score: 0.851871086940417
