In [30]:
import optuna
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge

In [31]:
# Matplotlib defaults for every figure in this notebook: 14x10 canvas, 24pt font.
plt.rcParams.update({'figure.figsize': (14, 10), 'font.size': 24})

# Seeds: `sampler_seed` feeds the Optuna random sampler, `random_state` feeds
# the train/val/test splits and the tree-based estimators.
sampler_seed, random_state = 99, 45

# Pre-processed dataset; the bare `df` below renders a preview as cell output.
df = pd.read_csv("data/processed/fiat/dataset.csv")
df

Unnamed: 0,engine_power,age_in_days,km,previous_owners,lat,lon,price,model_lounge,model_pop,model_sport
0,51,882,25000,1,0.640201,-1.267881,8900,1,0,0
1,51,1186,32500,1,0.996006,0.291412,8800,0,1,0
2,74,4658,142228,1,0.919579,-0.062533,4200,0,0,1
3,51,2739,160000,1,-1.363096,2.607683,6000,1,0,0
4,73,3074,106880,1,-0.767812,0.400406,5700,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1533,51,3712,115280,1,0.716337,-1.657300,5200,0,0,1
1534,74,3835,112000,1,1.080061,-1.244125,4600,1,0,0
1535,51,2223,60457,1,0.909380,-0.923442,7500,0,1,0
1536,51,2557,80750,1,0.684007,-1.667028,5990,1,0,0


In [32]:
# `price` is the regression target; every remaining column is a model input.
target = df["price"]
feature = df.drop(columns="price")

In [33]:
# Step 1: hold out 20% of all rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    target,
    test_size=0.2,
    random_state=random_state,
)

# Step 2: carve a validation set out of the remaining rows
# (25% of the remaining 80%, i.e. roughly 20% of the full dataset).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=random_state,
)

In [34]:
def knn(trial):
    """Build a KNeighborsRegressor from hyper-parameters suggested by `trial`.

    Search space (suggested in this order):
      - knn_neighbours: int in [2, 50]
      - knn_weights:    'uniform' or 'distance'
      - knn_algorithm:  'auto', 'ball_tree', 'kd_tree' or 'brute'
      - knn_leaf_size:  int in [10, 100]
      - knn_p:          int in [1, 5]

    Returns the unfitted estimator.
    """
    # Dict literals evaluate top to bottom, so the suggestion order is kept.
    hyperparams = {
        "n_neighbors": trial.suggest_int("knn_neighbours", 2, 50),
        "weights": trial.suggest_categorical('knn_weights', ['uniform', 'distance']),
        "algorithm": trial.suggest_categorical(
            'knn_algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']
        ),
        "leaf_size": trial.suggest_int("knn_leaf_size", 10, 100),
        "p": trial.suggest_int("knn_p", 1, 5),
    }
    return KNeighborsRegressor(**hyperparams)

In [35]:
def svm(trial):
    """Build an SVR from hyper-parameters suggested by `trial`.

    Search space (suggested in this order):
      - svr_c:       float in [1e-10, 1e2], log scale
      - svr_epsilon: float in [1e-2, 1e1], log scale
      - svr_kernel:  'linear', 'poly', 'rbf' or 'sigmoid'
      - svr_tol:     float in [1e-2, 1e1], log scale
      - svr_degree:  int in [1, 6]

    `max_iter` is fixed at 10000. Returns the unfitted estimator.
    """
    # Dict literals evaluate top to bottom, so the suggestion order is kept.
    suggested = {
        "C": trial.suggest_float("svr_c", 1e-10, 1e2, log=True),
        "epsilon": trial.suggest_float("svr_epsilon", 1e-2, 1e1, log=True),
        "kernel": trial.suggest_categorical(
            "svr_kernel", ["linear", "poly", "rbf", "sigmoid"]
        ),
        "tol": trial.suggest_float("svr_tol", 1e-2, 1e1, log=True),
        "degree": trial.suggest_int("svr_degree", 1, 6),
    }
    return SVR(max_iter=10000, **suggested)

In [36]:
def bayes(trial):
    """Build a BayesianRidge from hyper-parameters suggested by `trial`.

    Search space (suggested in this order):
      - bayes_n_iter: int in [100, 100], i.e. effectively fixed at 100
      - bayes_alpha_1, bayes_alpha_2, bayes_lambda_1, bayes_lambda_2:
        float in [1e-10, 1e-2], log scale

    Returns the unfitted estimator.
    """
    # NOTE(review): recent scikit-learn releases renamed BayesianRidge's
    # `n_iter` to `max_iter` -- confirm against the installed version.
    n_iterations = trial.suggest_int("bayes_n_iter", 100, 100)

    # The four prior hyper-parameters share one range; each trial parameter
    # `bayes_<x>` maps onto the estimator keyword `<x>`.
    priors = {
        keyword: trial.suggest_float(f"bayes_{keyword}", 1e-10, 1e-2, log=True)
        for keyword in ("alpha_1", "alpha_2", "lambda_1", "lambda_2")
    }
    return BayesianRidge(n_iter=n_iterations, **priors)

In [37]:
def tree(trial):
    """Build a DecisionTreeRegressor from hyper-parameters suggested by `trial`.

    Search space (suggested in this order):
      - tree_max_depth:         int in [2, 32]
      - tree_splitter:          'best' or 'random'
      - tree_min_samples_split: int in [4, 10]
      - tree_min_samples_leaf:  int in [1, 10]

    The estimator is seeded with the notebook-level `random_state`.
    Returns the unfitted estimator.
    """
    # Dict literals evaluate top to bottom, so the suggestion order is kept.
    suggested = {
        "max_depth": trial.suggest_int("tree_max_depth", 2, 32),
        "splitter": trial.suggest_categorical("tree_splitter", ["best", "random"]),
        "min_samples_split": trial.suggest_int("tree_min_samples_split", 4, 10),
        "min_samples_leaf": trial.suggest_int("tree_min_samples_leaf", 1, 10),
    }
    return DecisionTreeRegressor(random_state=random_state, **suggested)

In [38]:
def forest(trial):
    """Build a RandomForestRegressor from hyper-parameters suggested by `trial`.

    Search space (suggested in this order):
      - rf_max_depth:                int in [2, 32]
      - rf_n_estimators:             int in [50, 500]
      - rf_min_samples_split:        int in [4, 10]
      - rf_min_samples_leaf:         int in [1, 10]
      - rf_min_weight_fraction_leaf: float in [0, 0.4]

    The estimator is seeded with the notebook-level `random_state` and is
    created with `n_jobs=-1`. Returns the unfitted estimator.
    """
    # Dict literals evaluate top to bottom, so the suggestion order is kept.
    suggested = {
        "max_depth": trial.suggest_int("rf_max_depth", 2, 32),
        "n_estimators": trial.suggest_int("rf_n_estimators", 50, 500),
        "min_samples_split": trial.suggest_int("rf_min_samples_split", 4, 10),
        "min_samples_leaf": trial.suggest_int("rf_min_samples_leaf", 1, 10),
        "min_weight_fraction_leaf": trial.suggest_float(
            "rf_min_weight_fraction_leaf", 0, 0.4
        ),
    }
    return RandomForestRegressor(random_state=random_state, n_jobs=-1, **suggested)

In [39]:
def objective(trial):
    """Optuna objective that also treats the model family as a hyper-parameter.

    The trial first picks one of "KNN", "SVM", "Bayes", "Tree" or
    "RandomForest"; the matching builder then draws that model's own
    hyper-parameters from the same trial. The model is fitted on the training
    split and scored on the validation split.

    Returns the validation mean squared error.
    """
    # Label -> builder lookup; key order defines the categorical choices.
    builders = {
        "KNN": knn,
        "SVM": svm,
        "Bayes": bayes,
        "Tree": tree,
        "RandomForest": forest,
    }
    chosen = trial.suggest_categorical("regressor", list(builders))
    model = builders[chosen](trial)

    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return mean_squared_error(y_val, predictions)

In [40]:
def objective_with_reg(regressor, trial):
    """Optuna objective for a single, fixed model family.

    Parameters
    ----------
    regressor : callable
        Builder taking the trial and returning an unfitted estimator
        (e.g. `knn`, `svm`, `bayes`, `tree`, `forest`).
    trial : optuna trial
        Source of the hyper-parameter suggestions.

    Returns
    -------
    Mean squared error of the fitted estimator on the validation split.
    """
    model = regressor(trial)
    model.fit(X_train, y_train)
    return mean_squared_error(y_val, model.predict(X_val))

In [41]:
from functools import partial

attempts = 500
def find_solution(regressors=None, n_trials=None, timeout=600, n_jobs=-1):
    """Tune each regressor twice: once with TPE and once with random search.

    For every builder a fresh in-memory study is created per sampler, and both
    studies minimise the validation MSE returned by `objective_with_reg`.

    Both samplers are now seeded with `sampler_seed`. Previously only the
    random-search study was seeded, so the two sides of the comparison were
    not equally repeatable.

    Parameters
    ----------
    regressors : iterable of callables, optional
        Builders taking a trial and returning an unfitted estimator.
        Defaults to `[knn, svm, bayes, tree, forest]`.
    n_trials : int, optional
        Trial budget per study. Defaults to the notebook-level `attempts`.
    timeout : float, optional
        Wall-clock limit per study in seconds (default 600).
    n_jobs : int, optional
        Parallel workers per study (default -1, as before).
        NOTE(review): Optuna documents that seeded runs are only reproducible
        when trials run sequentially -- pass `n_jobs=1` for exact reruns.

    Returns
    -------
    list of [study, study_random]
        One pair per regressor, in the order of `regressors`.
    """
    # Defaults are resolved at call time so later edits to the notebook-level
    # `attempts` or to the builder functions are picked up.
    if regressors is None:
        regressors = [knn, svm, bayes, tree, forest]
    if n_trials is None:
        n_trials = attempts

    result = []
    for reg in regressors:
        study = optuna.create_study(
            study_name="study_linear",
            direction="minimize",
            # Explicit and seeded, so it matches the seeded random-search study.
            sampler=optuna.samplers.TPESampler(seed=sampler_seed),
        )
        study.optimize(
            partial(objective_with_reg, reg),
            n_trials=n_trials,
            n_jobs=n_jobs,
            timeout=timeout,
        )

        study_random = optuna.create_study(
            study_name="study_random",
            direction="minimize",
            sampler=optuna.samplers.RandomSampler(seed=sampler_seed),
        )
        study_random.optimize(
            partial(objective_with_reg, reg),
            n_trials=n_trials,
            n_jobs=n_jobs,
            timeout=timeout,
        )

        result.append([study, study_random])
    return result

In [42]:
# Run the full search: `sol` holds one [study, study_random] pair per regressor,
# in the order knn, svm, bayes, tree, forest. Each study is capped at 500 trials
# or 600 seconds, so this cell can take a long time.
sol = find_solution()

[32m[I 2023-03-02 16:36:47,417][0m A new study created in memory with name: study_linear[0m
[32m[I 2023-03-02 16:36:47,438][0m Trial 1 finished with value: 941681.8241238798 and parameters: {'knn_neighbours': 2, 'knn_weights': 'distance', 'knn_algorithm': 'auto', 'knn_leaf_size': 10, 'knn_p': 5}. Best is trial 1 with value: 941681.8241238798.[0m
[32m[I 2023-03-02 16:36:47,441][0m Trial 0 finished with value: 779776.0695546729 and parameters: {'knn_neighbours': 27, 'knn_weights': 'distance', 'knn_algorithm': 'auto', 'knn_leaf_size': 13, 'knn_p': 2}. Best is trial 0 with value: 779776.0695546729.[0m
[32m[I 2023-03-02 16:36:47,453][0m Trial 2 finished with value: 776157.8698013074 and parameters: {'knn_neighbours': 12, 'knn_weights': 'distance', 'knn_algorithm': 'kd_tree', 'knn_leaf_size': 24, 'knn_p': 2}. Best is trial 2 with value: 776157.8698013074.[0m
[32m[I 2023-03-02 16:36:47,475][0m Trial 3 finished with value: 858782.6428282829 and parameters: {'knn_neighbours': 30, 

In [43]:
# Show the best hyper-parameters found by each study, one regressor at a time.
for study, study_random in sol:
    # A single print with a newline separator: blank line, both labelled
    # parameter dicts, then a trailing blank line.
    print(
        "\nlinear:",
        study.best_params,
        "random:",
        study_random.best_params,
        "",
        sep="\n",
    )


linear:
{'knn_neighbours': 24, 'knn_weights': 'distance', 'knn_algorithm': 'brute', 'knn_leaf_size': 87, 'knn_p': 1}
random:
{'knn_neighbours': 24, 'knn_weights': 'distance', 'knn_algorithm': 'brute', 'knn_leaf_size': 34, 'knn_p': 1}


linear:
{'svr_c': 0.00010582747745606503, 'svr_epsilon': 2.8265412085679786, 'svr_kernel': 'linear', 'svr_tol': 0.016979364097942012, 'svr_degree': 3}
random:
{'svr_c': 0.00012385837513627555, 'svr_epsilon': 0.01603778840914802, 'svr_kernel': 'linear', 'svr_tol': 0.23359211556301784, 'svr_degree': 5}


linear:
{'bayes_n_iter': 100, 'bayes_alpha_1': 8.984672878342451e-10, 'bayes_alpha_2': 8.869396542987234e-06, 'bayes_lambda_1': 1.7680719674333326e-10, 'bayes_lambda_2': 0.009999248752659964}
random:
{'bayes_n_iter': 100, 'bayes_alpha_1': 9.872900990157328e-05, 'bayes_alpha_2': 1.196645499649013e-06, 'bayes_lambda_1': 7.494786790392616e-08, 'bayes_lambda_2': 0.00910980160429437}


linear:
{'tree_max_depth': 13, 'tree_splitter': 'random', 'tree_min_samples

In [44]:
# Display labels, in the same order as the study pairs in `sol`.
names = ["knn", "svm", "bayes", "tree", "forest"]

# One optimization-history figure per study, each preceded by its label.
for i, (study, study_random) in enumerate(sol):
    labelled_studies = ((names[i], study), (f"random {names[i]}", study_random))
    for label, current in labelled_studies:
        print(label)
        optuna.visualization.plot_optimization_history(current).show()

knn


random knn


svm


random svm


bayes


random bayes


tree


random tree


forest


random forest


In [45]:
# One hyper-parameter-importance figure per study, each preceded by its label.
for i, (study, study_random) in enumerate(sol):
    for label, current in ((names[i], study), ("random", study_random)):
        print(label)
        optuna.visualization.plot_param_importances(current).show()

knn


random


svm


random


bayes


random


tree


random


forest


random


In [53]:
for (i, (study, study_random)) in enumerate(sol):
    params = study.best_params
    params_random = study_random.best_params
    match i:
        case 0:
            regressor_obj = KNeighborsRegressor(n_neighbors=params["knn_neighbours"], weights=params["knn_weights"], algorithm=params["knn_algorithm"], leaf_size=params["knn_leaf_size"], p=params["knn_p"])
            regressor_obj_random = KNeighborsRegressor(n_neighbors=params_random["knn_neighbours"], weights=params_random["knn_weights"], algorithm=params_random["knn_algorithm"], leaf_size=params_random["knn_leaf_size"], p=params_random["knn_p"])
        case 1:
            regressor_obj = SVR(C=params["svr_c"], epsilon=params["svr_epsilon"], kernel=params["svr_kernel"], degree=params["svr_degree"], max_iter=10000, tol=params["svr_tol"])
            regressor_obj_random = SVR(C=params_random["svr_c"], epsilon=params_random["svr_epsilon"], kernel=params_random["svr_kernel"], degree=params_random["svr_degree"], max_iter=10000, tol=params_random["svr_tol"])
        case 2:
            regressor_obj = BayesianRidge(n_iter=params["bayes_n_iter"], alpha_1=params["bayes_alpha_1"], alpha_2=params["bayes_alpha_2"], lambda_1=params["bayes_lambda_1"], lambda_2=params["bayes_lambda_2"])
            regressor_obj_random = BayesianRidge(n_iter=params_random["bayes_n_iter"], alpha_1=params_random["bayes_alpha_1"], alpha_2=params_random["bayes_alpha_2"], lambda_1=params_random["bayes_lambda_1"], lambda_2=params_random["bayes_lambda_2"])
        case 3:
            regressor_obj = DecisionTreeRegressor(max_depth=params["tree_max_depth"], random_state=random_state, splitter=params["tree_splitter"], min_samples_split=params["tree_min_samples_split"], min_samples_leaf=params["tree_min_samples_leaf"])
            regressor_obj_random = DecisionTreeRegressor(max_depth=params_random["tree_max_depth"], random_state=random_state, splitter=params_random["tree_splitter"], min_samples_split=params_random["tree_min_samples_split"], min_samples_leaf=params_random["tree_min_samples_leaf"])
        case 4:
            regressor_obj = RandomForestRegressor(max_depth=params["rf_max_depth"], min_samples_split=params["rf_min_samples_split"], n_estimators=params["rf_n_estimators"], min_samples_leaf=params["rf_min_samples_leaf"], min_weight_fraction_leaf=params["rf_min_weight_fraction_leaf"], random_state=random_state, n_jobs=-1)
            regressor_obj_random = RandomForestRegressor(max_depth=params_random["rf_max_depth"], min_samples_split=params_random["rf_min_samples_split"], n_estimators=params_random["rf_n_estimators"], min_samples_leaf=params_random["rf_min_samples_leaf"], min_weight_fraction_leaf=params_random["rf_min_weight_fraction_leaf"], random_state=random_state, n_jobs=-1)
    regressor_obj.fit(X_train, y_train)
    regressor_obj_random.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_test)
    y_pred_random = regressor_obj_random.predict(X_test)
    error = mean_squared_error(y_test, y_pred)
    error_random = mean_squared_error(y_test, y_pred_random)
    print(names[i])
    print(f"{error} vs {error_random}")


knn
732055.285705702 vs 732055.285705702
svm
587793.3275795407 vs 588581.4295810001
bayes
594236.2837093169 vs 594236.6026898673
tree
692278.7486082085 vs 692278.7486082085
forest
565709.1405882371 vs 564252.4265629102
