In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('/content/drive/MyDrive/ML/datasets/car_sales_processed.csv', sep=',', index_col='ID')
test_df = pd.read_csv('/content/drive/MyDrive/ML/datasets/car_sales_test_processed.csv', sep=',', index_col='ID').drop(columns='Cena')
synthetic_df = pd.read_csv('/content/drive/MyDrive/ML/datasets/synthetic_processed.csv', sep=',', index_col='ID')
display(train_df.shape)
display(test_df.shape)
display(synthetic_df.shape)

(135397, 18)

(72907, 17)

(6717, 18)

In [3]:
!pip install xgboost
!pip install lightgbm
!pip install scikeras



In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
train, test = train_test_split(train_df, test_size=0.2, random_state=42)
X_train = train.drop(columns='Cena')
y_train = train['Cena']
X_test = test.drop(columns='Cena')
y_test = test['Cena']

In [6]:
encoder = LabelEncoder()
category_columns = X_train.select_dtypes(include=['object']).columns
for column in category_columns:
  X_train[column] = encoder.fit_transform(X_train[column])
  X_test[column] = encoder.fit_transform(X_test[column])

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.1 colorlog-6.9.0 optuna-4.2.1


In [13]:
import optuna

In [23]:
def objective(trial):
    # XGBoost parameters
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 15),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('xgb_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('xgb_min_child_weight', 1, 10),
        'gamma': trial.suggest_float('xgb_gamma', 0, 5),
        'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0, 5),
    }

    # LightGBM parameters
    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('lgb_max_depth', 3, 15),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('lgb_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('lgb_colsample_bytree', 0.6, 1.0),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0, 5),
    }

    # RandomForest parameters
    rf_params = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 50, 500),
        'max_depth': trial.suggest_int('rf_max_depth', 10, 100),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('rf_max_features', 0.5, 1.0),
    }

    # ExtraTrees parameters
    et_params = {
        'n_estimators': trial.suggest_int('et_n_estimators', 50, 500),
        'max_depth': trial.suggest_int('et_max_depth', 10, 100),
        'min_samples_split': trial.suggest_int('et_min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('et_min_samples_leaf', 1, 10),
    }

    # GradientBoosting parameters
    gbr_params = {
        'n_estimators': trial.suggest_int('gbr_n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('gbr_max_depth', 3, 15),
        'learning_rate': trial.suggest_float('gbr_learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('gbr_subsample', 0.6, 1.0),
    }

    # Tworzenie modeli z optymalizowanymi parametrami
    xgb = XGBRegressor(**xgb_params, n_jobs=-1, random_state=42)
    lgb = LGBMRegressor(**lgb_params, n_jobs=-1, random_state=42)
    rf = RandomForestRegressor(**rf_params, n_jobs=-1, random_state=42)
    et = ExtraTreesRegressor(**et_params, n_jobs=-1, random_state=42)
    gbr = GradientBoostingRegressor(**gbr_params, random_state=42)

    # Wagi dla ensemble
    weight_xgb = trial.suggest_float('weight_xgb', 0.1, 1.0)
    weight_lgb = trial.suggest_float('weight_lgb', 0.1, 1.0)
    weight_rf = trial.suggest_float('weight_rf', 0.1, 1.0)
    weight_et = trial.suggest_float('weight_et', 0.1, 1.0)
    weight_gbr = trial.suggest_float('weight_gbr', 0.1, 1.0)

    # Tworzenie ensemble modelu z odpowiednimi wagami
    ensemble = VotingRegressor([
        ('xgb', xgb),
        ('lgb', lgb),
        ('rf', rf),
        ('et', et),
        ('gbr', gbr)
    ], weights=[weight_xgb, weight_lgb, weight_rf, weight_et, weight_gbr])

    # Trenowanie ensemble modelu
    ensemble.fit(X_train, y_train)

    # Predykcje
    preds = ensemble.predict(X_test)

    # Ocena - konwertujemy z powrotem logarytmiczną cenę
    preds_original = preds
    y_test_original = y_test

    rmse = np.sqrt(mean_squared_error(y_test_original, preds_original))
    return rmse

In [24]:
# Utworzenie badania Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)  # Możesz zwiększyć liczbę prób dla lepszych wyników

print('Najlepsze parametry:')
print(study.best_params)
print(f'Najlepsze RMSE: {study.best_value}')

[I 2025-03-25 18:11:02,076] A new study created in memory with name: no-name-f53bcc56-a21b-4411-a917-5e5ab3a78282


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1522
[LightGBM] [Info] Number of data points in the train set: 108317, number of used features: 17
[LightGBM] [Info] Start training from score 63406.802118


[I 2025-03-25 18:21:26,463] Trial 0 finished with value: 33390.35587445771 and parameters: {'xgb_n_estimators': 478, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.25472457023860245, 'xgb_subsample': 0.6108170593908073, 'xgb_colsample_bytree': 0.6477262584095289, 'xgb_min_child_weight': 8, 'xgb_gamma': 1.423599549203281, 'xgb_reg_alpha': 0.11413527997449324, 'xgb_reg_lambda': 2.4155418901348185, 'lgb_n_estimators': 517, 'lgb_max_depth': 7, 'lgb_learning_rate': 0.08784849459746373, 'lgb_subsample': 0.7278679396026053, 'lgb_colsample_bytree': 0.9748989563322886, 'lgb_min_child_samples': 100, 'lgb_reg_alpha': 0.01543004392903835, 'lgb_reg_lambda': 3.62130452951253, 'rf_n_estimators': 334, 'rf_max_depth': 43, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 4, 'rf_max_features': 0.5194044390893943, 'et_n_estimators': 396, 'et_max_depth': 80, 'et_min_samples_split': 14, 'et_min_samples_leaf': 7, 'gbr_n_estimators': 977, 'gbr_max_depth': 8, 'gbr_learning_rate': 0.13494890406443835, 'gbr_subsamp

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1522
[LightGBM] [Info] Number of data points in the train set: 108317, number of used features: 17
[LightGBM] [Info] Start training from score 63406.802118


[I 2025-03-25 18:32:54,370] Trial 1 finished with value: 34135.10020452268 and parameters: {'xgb_n_estimators': 300, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.28519644544639083, 'xgb_subsample': 0.8120947538840204, 'xgb_colsample_bytree': 0.7857159232951546, 'xgb_min_child_weight': 9, 'xgb_gamma': 0.08906688213151326, 'xgb_reg_alpha': 0.804897373372569, 'xgb_reg_lambda': 2.665928846133268, 'lgb_n_estimators': 393, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.0757075829306509, 'lgb_subsample': 0.6117028056069466, 'lgb_colsample_bytree': 0.9089349844650483, 'lgb_min_child_samples': 54, 'lgb_reg_alpha': 0.6393826009415221, 'lgb_reg_lambda': 3.204235733174456, 'rf_n_estimators': 392, 'rf_max_depth': 54, 'rf_min_samples_split': 20, 'rf_min_samples_leaf': 1, 'rf_max_features': 0.7516920759392522, 'et_n_estimators': 301, 'et_max_depth': 88, 'et_min_samples_split': 12, 'et_min_samples_leaf': 10, 'gbr_n_estimators': 845, 'gbr_max_depth': 7, 'gbr_learning_rate': 0.012418484583719996, 'gbr_subsamp

KeyboardInterrupt: 

In [None]:
parameters = study.best_params
parameters

In [None]:
with open('parameters.txt', 'w') as file:
  file.write(str(parameters))

In [None]:
with open('/content/drive/MyDrive/ML/parameters.txt', 'w') as file:
  file.write(str(parameters))

In [16]:
with open('/content/drive/MyDrive/ML/parameters.txt', 'r') as file:
  parameters = eval(file.read())

In [17]:
xgb_params = {
    'n_estimators': parameters['xgb_n_estimators'],
    'max_depth': parameters['xgb_max_depth'],
    'learning_rate': parameters['xgb_learning_rate'],
    'subsample': parameters['xgb_subsample'],
    'colsample_bytree': parameters['xgb_colsample_bytree'],
    'min_child_weight': parameters['xgb_min_child_weight'],
    'gamma': parameters['xgb_gamma'],
    'reg_alpha': parameters['xgb_reg_alpha'],
    'reg_lambda': parameters['xgb_reg_lambda']
}
lgb_params = {
    'n_estimators': parameters['lgb_n_estimators'],
    'max_depth': parameters['lgb_max_depth'],
    'learning_rate': parameters['lgb_learning_rate'],
    'subsample': parameters['lgb_subsample'],
    'colsample_bytree': parameters['lgb_colsample_bytree'],
    'min_child_samples': parameters['lgb_min_child_samples'],
    'reg_alpha': parameters['lgb_reg_alpha'],
    'reg_lambda': parameters['lgb_reg_lambda']
}
rf_params = {
    'n_estimators': parameters['rf_n_estimators'],
    'max_depth': parameters['rf_max_depth'],
    'min_samples_split': parameters['rf_min_samples_split'],
    'min_samples_leaf': parameters['rf_min_samples_leaf'],
    'max_features': parameters['rf_max_features']
}
et_params = {
    'n_estimators': parameters['et_n_estimators'],
    'max_depth': parameters['et_max_depth'],
    'min_samples_split': parameters['et_min_samples_split'],
    'min_samples_leaf': parameters['et_min_samples_leaf']
}
weight_xgb = parameters['weight_xgb']
weight_lgb = parameters['weight_lgb']
weight_rf = parameters['weight_rf']
weight_et = parameters['weight_et']


In [18]:
model = VotingRegressor([
    ('xgb', XGBRegressor(**xgb_params, n_jobs=-1, random_state=42)),
    ('lgb', LGBMRegressor(**lgb_params, n_jobs=-1, random_state=42)),
    ('rf', RandomForestRegressor(**rf_params, n_jobs=-1, random_state=42)),
    ('et', ExtraTreesRegressor(**et_params, n_jobs=-1, random_state=42))
], weights=[weight_xgb, weight_lgb, weight_rf, weight_et])

In [19]:
model.fit(X_train, y_train)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1522
[LightGBM] [Info] Number of data points in the train set: 108317, number of used features: 17
[LightGBM] [Info] Start training from score 63406.802118


In [20]:
predictions = model.predict(X_test)
results = np.sqrt(mean_squared_error(y_test, predictions))
results



np.float64(31701.080782765843)

In [None]:
full_X_train = train_df.drop(columns='Cena')
full_y_train = train_df['Cena']
full_X_test = test_df

for column in category_columns:
  full_X_train[column] = encoder.fit_transform(full_X_train[column])
  full_X_test[column] = encoder.fit_transform(full_X_test[column])

full_X_train = scaler.fit_transform(full_X_train)
full_X_test = scaler.transform(full_X_test)

In [None]:
display(full_X_train)
display(full_y_train)
display(full_X_test)

Unnamed: 0_level_0,Waluta,Stan,Marka_pojazdu,Model_pojazdu,Is_Premium,Wiek_pojazdu,Przebieg_km,Moc_KM,Pojemnosc_cm3,Rodzaj_paliwa,Czy_naped_4x4,Skrzynia_biegow,Typ_nadwozia,Liczba_drzwi,Kolor,Kraj_pochodzenia,Liczba_elementow_wyposazenia
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,1,72,515,False,16,213000.0,170,1998.0,3,0,1,6,5.0,2,32,327
2,1,1,72,675,False,11,117089.0,110,1598.0,3,0,1,9,5.0,10,32,560
3,1,1,65,1122,False,6,115600.0,136,1598.0,8,0,1,6,5.0,12,6,430
4,1,1,30,458,False,14,218000.0,90,1560.0,0,0,1,3,5.0,2,32,332
5,1,0,87,205,False,8,145600.0,136,1798.0,3,0,1,1,4.0,8,21,490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135393,1,1,65,193,False,9,136931.0,150,1598.0,0,0,1,3,5.0,10,21,188
135394,1,0,57,1054,False,0,8.0,237,1950.0,0,0,0,6,4.0,12,32,707
135395,1,1,91,1122,False,9,179000.0,120,1700.0,0,0,1,6,5.0,1,32,464
135396,1,1,72,321,False,20,156000.0,60,1149.0,3,0,1,2,5.0,2,10,244


Unnamed: 0_level_0,Cena
ID,Unnamed: 1_level_1
1,13900
2,25900
3,35900
4,5999
5,44800
...,...
135393,45499
135394,269855
135395,21900
135396,4450


Unnamed: 0_level_0,Waluta,Stan,Marka_pojazdu,Model_pojazdu,Is_Premium,Wiek_pojazdu,Przebieg_km,Moc_KM,Pojemnosc_cm3,Rodzaj_paliwa,Czy_naped_4x4,Skrzynia_biegow,Typ_nadwozia,Liczba_drzwi,Kolor,Kraj_pochodzenia,Liczba_elementow_wyposazenia
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,0,56,409,False,0,1.0,163,1950.0,0,1,0,0,5.0,1,22,951
2,1,1,28,605,False,13,202585.0,145,1999.0,3,0,1,8,5.0,8,32,600
3,1,1,9,798,False,16,373000.0,218,2993.0,0,0,0,6,5.0,6,32,763
4,1,0,78,543,False,0,10.0,130,1498.0,3,0,1,2,5.0,12,32,788
5,1,1,6,122,False,7,150000.0,245,2967.0,0,1,0,8,5.0,6,9,1032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72903,1,1,92,870,True,5,53000.0,220,1984.0,3,1,0,0,5.0,1,22,1036
72904,1,1,28,523,False,12,188132.0,136,1997.0,0,1,1,0,5.0,1,9,709
72905,1,1,88,169,False,5,76786.0,143,1995.0,0,0,1,8,5.0,2,22,740
72906,1,1,72,885,False,12,204300.0,114,1995.0,0,0,1,5,4.0,2,32,221


In [None]:
model.fit(full_X_train, full_y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1508
[LightGBM] [Info] Number of data points in the train set: 135397, number of used features: 17
[LightGBM] [Info] Start training from score 63308.218794


In [None]:
predictions = model.predict(full_X_test)

In [None]:
pred_df = pd.DataFrame({
    'ID': test_df.index,
    'Cena': predictions
})

pred_df.to_csv('predictions.csv', index=False)