In [1]:
#IMPORTS
import pandas as pd
import numpy as np
import re
import warnings

#Machine Learning Related IMPORTS
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

import optuna
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler
import sklearn.metrics as mt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_1 = pd.read_csv(r"C:\Users\90545\Desktop\Emir\Personal_Files\Projects\Machine_Learning_Projects\Kaggle_Car_Price_Competetion\train.csv")
train_data_2 = pd.read_csv(r"C:\Users\90545\Desktop\Emir\Personal_Files\Projects\Machine_Learning_Projects\Kaggle_Car_Price_Competetion\cleaned_used_cars.csv")
test_data = pd.read_csv(r"C:\Users\90545\Desktop\Emir\Personal_Files\Projects\Machine_Learning_Projects\Kaggle_Car_Price_Competetion\test.csv")
train_data = pd.concat([train_data_1,train_data_2])

In [20]:
pd.set_option("display.max_column",100)
warnings.filterwarnings("ignore")
X=train_data.drop(columns=["price"])
y=train_data["price"]
X_train,X_val,y_train,y_val = train_test_split(X,y,random_state=16,train_size=0.80)

### Competetion Goal : Predict Car Prices with Lowest RMSE Possible
RMSE:Root Mean Squared Error

## Feature Engineering

In [21]:
### Whole Encoding In One Function
def preprocess_dataframe(df):
    """
    This function performs various preprocessing steps on the dataframe:
    1. Maps and fills missing values in 'accident' and 'fuel_type' columns.
    2. Extracts 'horsepower' and 'engine_size' from 'engine' description.
    3. Cleans and fills missing 'horsepower' values.
    4. Simplifies 'transmission' into broader categories.
    """
    # 1. Map and Fill Missing Values
    df["accident"] = df["accident"].map({
        "None reported": 0,
        "At least 1 accident or damage reported": 1
    })
    
    df["clean_title"].fillna("No", inplace=True)
    
    # Handle missing 'accident' values by replacing with the most probable value
    accident_counts = df["accident"].value_counts()
    top_options = accident_counts.nlargest(2).index
    option_probs = accident_counts[top_options] / accident_counts[top_options].sum()
    df["accident"] = df["accident"].apply(lambda x: x if x in top_options else np.random.choice(top_options, p=option_probs))
    
    # df.loc[df["fuel_type"].isna(), "fuel_type"] = "Electric"


    def fix_fuel_type(df):
        """
        Extracts fuel type from engine information and updates the fuel_type column.
        
        Args:
            df (pd.DataFrame): Input DataFrame with 'engine' and 'fuel_type' columns.
        
        Returns:
            pd.DataFrame: Updated DataFrame with corrected fuel_type information.
        """
        def extract_fuel_type(engine_info):
            if pd.isna(engine_info):
                return np.nan
            if 'Gasoline' in engine_info:
                return 'Gasoline'
            elif 'Hybrid' in engine_info:
                return 'Hybrid'
            elif 'Flex Fuel' in engine_info or 'E85' in engine_info:
                return 'Flex Fuel'
            elif 'Diesel' in engine_info:
                return 'Diesel'
            elif 'Electric' in engine_info:
                return 'Electric'
            else:
                return np.nan
        
        # Extract fuel type from engine information
        extracted_fuel_type = df['engine'].apply(extract_fuel_type)
        
        # Fill missing values in 'fuel_type' with the extracted fuel type
        df['fuel_type'] = df['fuel_type'].fillna(extracted_fuel_type)
        
        return df
 
    df  = fix_fuel_type(df)

    # 2. Extract Features
    def extract_horsepower(engine_desc):
        match = re.search(r'(\d+(\.\d+)?)HP', str(engine_desc), re.IGNORECASE)
        return match.group(1) if match else 'unknown'
    
    def extract_engine_size(engine_desc):
        match = re.search(r'(\d+(\.\d+)?)[ ]?(L|Liter)', str(engine_desc), re.IGNORECASE)
        return match.group(1) if match else 'unknown'
    
    df['horsepower'] = df['engine'].apply(extract_horsepower)
    df['engine_size'] = df['engine'].apply(extract_engine_size)
    
    # 3. Clean and Fill Horsepower
    def clean_and_fill_horsepower(df):
        df['horsepower'] = df['horsepower'].replace('unknown', np.nan)
        
        grouped_full = df.groupby(['brand', 'model', 'model_year'])['horsepower'].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan).to_dict()
        grouped_partial = df.groupby(['brand', 'model'])['horsepower'].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan).to_dict()

        def fill_horsepower(row):
            mode_value = grouped_full.get((row['brand'], row['model'], row['model_year']))
            if pd.isna(mode_value):
                mode_value = grouped_partial.get((row['brand'], row['model']))
            return mode_value if not pd.isna(mode_value) else 0

        df['horsepower'] = df.apply(lambda row: fill_horsepower(row) if pd.isna(row['horsepower']) else row['horsepower'], axis=1)
        return df

    df = clean_and_fill_horsepower(df)
    
    # 4. Simplify Transmission
    def simplify_transmission(df):
        patterns = {
            'Variable Transmission (CVT)': r'\b(cvt|continuously variable transmission)\b',
            'Manual': r'\b(m/t|manual)\b',
            'Automatic': r'\b(a/t|automatic|auto)\b',
            'Semi-Automatic/Automated Manual': r'\b(dual shift|auto-shift|semi-automatic)\b'
        }
        
        def classify_transmission(transmission):
            transmission = str(transmission).strip().lower()
            for category, pattern in patterns.items():
                if re.search(pattern, transmission):
                    return category
            return 'Other'

        df['transmission_type'] = df['transmission'].apply(classify_transmission)
        return df

    df = simplify_transmission(df)

    
    def extract_age_features(df, current_year=2025):
        
       df['car_age'] = current_year - df['model_year']

       bins = [0, 3, 6, 12, 20, 30, 60]
       labels = [0, 1, 2, 3, 4, 5]  

       df['age_segment'] = pd.cut(df['car_age'], bins=bins, labels=labels, right=True, include_lowest=True)                  
       return df
    df=extract_age_features(df)

    return df

In [22]:
X_train_preprocessed = preprocess_dataframe(X_train)
X_val_preprocessed = preprocess_dataframe(X_val)

### PIPELINE BUILDING

In [24]:
target = 'price'
numeric_features = ["model_year","milage","horsepower","car_age"]
categorical_features = ["brand","model","fuel_type","engine","transmission","ext_col","int_col","accident","age_segment",
                        "clean_title","transmission_type","engine_size"
                        
                      ]

numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

ValueError: A given column is not a column of the dataframe

### I used OPTUNA for finding Best Hyperparameters for Each Model
------------------------------------------------------------------------

### BEST RMSE value for Validation Set : 69765.13507066487 (XGBOOST)

 parameters = {'max_depth': 6, 'learning_rate': 0.007130145108951567, 'n_estimators': 554, 'min_child_weight': 9, 'subsample': 0.6698870925180658, 'colsample_bytree': 0.544894339837042, 'gamma': 1.3470526846778162e-05, 'reg_alpha': 0.8083435400261932, 'reg_lambda': 3.493205906307386e-05}. Best is trial 21 with value: 69765.13507066487.

In [8]:
#I normally use pipeline structure, since I use early stopping I did not use pipeline to get best params but in the other codes I used pipeline structure
# from sklearn.metrics import mean_squared_error
# def objective(trial):
#     params = {
#         'max_depth': trial.suggest_int('max_depth', 2, 15),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 600),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
#         'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
#        # 'tree_method': 'gpu_hist',  # Use GPU for training
#        # 'predictor': 'gpu_predictor',  # Use GPU for prediction as well
#     }

#     model = xgb.XGBRegressor(**params, verbosity=0, use_label_encoder=False, 
#                             )

#     X_train_transformed = preprocessor.fit_transform(X_train)
#     X_val_transformed = preprocessor.transform(X_val)
#     model.fit(X_train_transformed, y_train, 
#               eval_set=[(X_val_transformed, y_val)], 
#               verbose=False)

#     model.set_params(early_stopping_rounds=50)

#     predictions = model.predict(X_val_transformed)
#     rmse = mean_squared_error(y_val, predictions, squared=False)
#     return rmse


# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100, timeout=300)


### BEST RMSE value for Validation Set : 69857.21284136214 (CATBOOST)

parameters: {'depth': 10, 'learning_rate': 0.06545528849901149, 'iterations': 454, 'l2_leaf_reg': 5.627334494455288, 'subsample': 0.907860006008691, 'rsm': 0.6121219607102849, 'random_strength': 0.00599273019294434, 'bagging_temperature': 0.00012917747774166064, 'border_count': 96}

In [9]:
#I normally use pipeline structure, since I use early stopping I did not use pipeline to get best params but in the other codes I used pipeline structure

# def objective(trial):
#     params = {
#         'depth': trial.suggest_int('depth', 2, 15),  # equivalent to max_depth in CatBoost
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
#         'iterations': trial.suggest_int('iterations', 100, 600),  # equivalent to n_estimators
#         'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0),  # CatBoost regularization
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'rsm': trial.suggest_uniform('rsm', 0.5, 1.0),  # similar to colsample_bytree in XGBoost
#         'random_strength': trial.suggest_loguniform('random_strength', 1e-8, 10.0),
#         'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-8, 1.0),
#         'border_count': trial.suggest_int('border_count', 32, 255)  # CatBoost-specific feature binning
#     }

#     # Initialize the CatBoost model with suggested parameters
#     model = CatBoostRegressor(**params, silent=True)  # silent=True disables logging

#     # Transform the data
#     X_train_transformed = preprocessor.fit_transform(X_train)
#     X_val_transformed = preprocessor.transform(X_val)

#     # Train the model with early stopping
#     model.fit(X_train_transformed, y_train, 
#               eval_set=[(X_val_transformed, y_val)], 
#               early_stopping_rounds=50,  # Early stopping
#               verbose=False)

#     # Predictions and RMSE calculation
#     predictions = model.predict(X_val_transformed)
#     rmse = mean_squared_error(y_val, predictions, squared=False)
#     return rmse

# # Create Optuna study and optimize
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100, timeout=300)


### Best RMSE value for Validation Set: 69673.45602466534
parameters: {'max_depth': 8, 'learning_rate': 0.010782518849357521, 'n_estimators': 518, 'min_child_weight': 7, 'subsample': 0.7469651925338591, 'colsample_bytree': 0.8177106942315678, 'reg_alpha': 0.01250685433895496, 'reg_lambda': 6.287719342547071e-07, 'num_leaves': 64, 'min_child_samples': 68, 'feature_fraction': 0.5003994698412366}


In [11]:
#I normally use pipeline structure, since I use early stopping I did not use pipeline to get best params but in the other codes I used pipeline structure


# def objective(trial):
#     params = {
#         'max_depth': trial.suggest_int('max_depth', 2, 15),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 600),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 150),  # LightGBM specific
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),  # LightGBM specific
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
#         "verbose":0# LightGBM specific
#     }

#     model = lgb.LGBMRegressor(**params)

#     # Transform the training and validation data
#     X_train_transformed = preprocessor.fit_transform(X_train)
#     X_val_transformed = preprocessor.transform(X_val)

#     # Train the model with early stopping
#     model.fit(X_train_transformed, y_train,
#               eval_set=[(X_val_transformed, y_val)],  # Validation set
#               eval_metric='rmse',                    # Evaluation metric
# callbacks=(
#             [lgb.early_stopping(stopping_rounds=100)]))

#     # Make predictions
#     predictions = model.predict(X_val_transformed)
    
#     # Calculate RMSE
#     rmse = mean_squared_error(y_val, predictions, squared=False)
#     return rmse

# # Create and optimize the study
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100, timeout=300)

# # Output the best parameters and value
# print("Best trial parameters:", study.best_params)
# print("Best RMSE score:", study.best_value)


### BEST PARAMS FOR EACH MODEL

In [12]:
cb_params = {'depth': 10, 'learning_rate': 0.06545528849901149, 'iterations': 454, 'l2_leaf_reg': 5.627334494455288, 'subsample': 0.907860006008691, 'rsm': 0.6121219607102849, 'random_strength': 0.00599273019294434, 'bagging_temperature': 0.00012917747774166064, 'border_count': 96}
model_cat = cat.CatBoostRegressor(**cb_params)
pipeline_cbr = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model_cat)])
best_params={'max_depth': 8, 'learning_rate': 0.010782518849357521, 'n_estimators': 518, 'min_child_weight': 7, 'subsample': 0.7469651925338591, 'colsample_bytree': 0.8177106942315678, 'reg_alpha': 0.01250685433895496, 'reg_lambda': 6.287719342547071e-07, 'num_leaves': 64, 'min_child_samples': 68, 'feature_fraction': 0.5003994698412366}
model = lgb.LGBMRegressor(**best_params)
pipeline_lgbm = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])
xgb_params = {'max_depth': 6, 'learning_rate': 0.007130145108951567, 'n_estimators': 554, 'min_child_weight': 9, 'subsample': 0.6698870925180658, 'colsample_bytree': 0.544894339837042, 'gamma': 1.3470526846778162e-05, 'reg_alpha': 0.8083435400261932, 'reg_lambda': 3.493205906307386e-05}
xgb_model = xgb.XGBRegressor(**xgb_params)
pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', xgb_model)])

In [17]:
pipeline_cbr.fit(X_train,y_train)
pipeline_xgb.fit(X_train,y_train)
pipeline_lgbm.fit(X_train,y_train)

0:	learn: 78835.4157081	total: 46.4ms	remaining: 21s
1:	learn: 78233.8761105	total: 91.7ms	remaining: 20.7s
2:	learn: 77708.3092433	total: 135ms	remaining: 20.3s
3:	learn: 77228.2686981	total: 179ms	remaining: 20.2s
4:	learn: 76806.7834738	total: 220ms	remaining: 19.8s
5:	learn: 76421.4278068	total: 261ms	remaining: 19.5s
6:	learn: 76074.1111377	total: 304ms	remaining: 19.4s
7:	learn: 75744.3978479	total: 344ms	remaining: 19.2s
8:	learn: 75444.9249451	total: 383ms	remaining: 19s
9:	learn: 75181.6016272	total: 419ms	remaining: 18.6s
10:	learn: 74933.6364060	total: 459ms	remaining: 18.5s
11:	learn: 74708.5900677	total: 499ms	remaining: 18.4s
12:	learn: 74499.6209507	total: 541ms	remaining: 18.4s
13:	learn: 74328.0026180	total: 581ms	remaining: 18.3s
14:	learn: 74174.2427725	total: 617ms	remaining: 18.1s
15:	learn: 74004.0494160	total: 656ms	remaining: 18s
16:	learn: 73862.7074718	total: 697ms	remaining: 17.9s
17:	learn: 73718.0576064	total: 735ms	remaining: 17.8s
18:	learn: 73605.9838114

### CV SCORES FOR EACH PIPELINE

##### XGBOOST PIPELINE CV SCORES

In [40]:
# from sklearn.model_selection import KFold,cross_val_score
# skf = KFold(n_splits=10, shuffle=True, random_state=42)

# cv_scores = cross_val_score(pipeline_xgb, X_train, y_train, cv=skf,
#                             scoring="neg_root_mean_squared_error")

# for i, score in enumerate(cv_scores):
#     print(f"Fold {i + 1} score: {-score :.5f}")

# print("---" * 8)
# print(f"Mean RMSE Skoru : {-cv_scores.mean() :.5f}")
# print(f"Standart Deviation for RMSE Scores: {cv_scores.std() :.5f}")

- Fold 1 score: 73301.22221
- Fold 2 score: 87197.33259
- Fold 3 score: 66552.18724
- Fold 4 score: 67330.59766
- Fold 5 score: 63633.70289
- Fold 6 score: 78027.67577
- Fold 7 score: 68561.92834
- Fold 8 score: 68959.64954
- Fold 9 score: 85949.89528
- Fold 10 score: 69374.85767
------------------------
- Mean RMSE Score : 72888.90492
- Standart Deviation for RMSE Scores: 7778.91625

##### CATBOOST PIPELINE CV SCORES

In [22]:
# ### from sklearn.model_selection import KFold,cross_val_score
# skf = KFold(n_splits=10, shuffle=True, random_state=42)

# cv_scores = cross_val_score(pipeline_cbr, X_train, y_train, cv=skf,
#                             scoring="neg_root_mean_squared_error")

# for i, score in enumerate(cv_scores):
#     print(f"Fold {i + 1} score: {-score :.5f}%")

# print("---" * 8)
# print(f"Ortalama RMSE : {-cv_scores.mean() :.5f}")
# print(f"Standart Sapma: {-cv_scores.std() :.5f}%")


CATBOOST CV SCORES
- Fold 1 score: 73241.22
- Fold 2 score: 87120.88
- Fold 3 score: 66481.94
- Fold 4 score: 68165.06
- Fold 5 score: 63948.85
- Fold 6 score: 78080.39
- Fold 7 score: 69648.18
- Fold 8 score: 68957.25
- Fold 9 score: 85784.73
- Fold 10 score: 69134.62
------------------------
Mean RMSE Score : 73056.31
Standard Deviation: 7609.14

##### LIGHTGBM PIPELINE CV SCORES

In [21]:
# from sklearn.model_selection import KFold,cross_val_score
# skf = KFold(n_splits=10, shuffle=True, random_state=42)

# cv_scores = cross_val_score(pipeline_lgbm, X_train, y_train, cv=skf,
#                             scoring="neg_root_mean_squared_error")

# for i, score in enumerate(cv_scores):
#     print(f"Fold {i + 1} score: {-score :.5f}%")

# print("---" * 8)
# print(f"Ortalama F1-Skoru : {-cv_scores.mean() :.5f}")
# print(f"Standart Sapma: {-cv_scores.std() :.5f}%")

LIGHTGBM CV SCORES
- Fold 1 score: 73117.49
- Fold 2 score: 87092.70
- Fold 3 score: 66466.46
- Fold 4 score: 67284.22
- Fold 5 score: 63568.41
- Fold 6 score: 77928.41
- Fold 7 score: 68519.50
- Fold 8 score: 68822.25
- Fold 9 score: 85707.35
- Fold 10 score: 69457.77
------------------------
Mean RMSE Score : 72796.46
Standard Deviation: 7735.35

In [23]:
cb_preds=pipeline_cbr.predict(X_val)
lgb_preds=pipeline_lgbm.predict(X_val)
xgb_preds=pipeline_xgb.predict(X_val)



##### WEIGHT OPTIMIZATION

In [24]:
def objective(trial):
    w1 = trial.suggest_float("w1", 0, 1)
    w2 = trial.suggest_float("w2", 0, 1)
    w3 = 1 - (w1 + w2)
    
    y_pred = np.average([ cb_preds,
                         lgb_preds, xgb_preds], axis=0, 
    weights=[w1, w2,
             w3 
             ])
    
    rmse = mt.mean_squared_error(y_val, y_pred, squared=False)
    return rmse

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1000)

best_weights = study.best_params
print(best_weights)

[I 2024-09-25 11:01:03,376] A new study created in memory with name: no-name-6dd9a378-bece-450b-bc7b-8176b0df0bf8
[I 2024-09-25 11:01:03,418] Trial 0 finished with value: 69791.5351065033 and parameters: {'w1': 0.5844685723741448, 'w2': 0.7059577951521788}. Best is trial 0 with value: 69791.5351065033.
[I 2024-09-25 11:01:03,426] Trial 1 finished with value: 69706.27513829281 and parameters: {'w1': 0.33112911699584047, 'w2': 0.7646939815894718}. Best is trial 1 with value: 69706.27513829281.
[I 2024-09-25 11:01:03,432] Trial 2 finished with value: 70050.03634444444 and parameters: {'w1': 0.9527393341410635, 'w2': 0.863803764730626}. Best is trial 1 with value: 69706.27513829281.
[I 2024-09-25 11:01:03,441] Trial 3 finished with value: 69759.62183387813 and parameters: {'w1': 0.21396436615539582, 'w2': 0.03874450562885534}. Best is trial 1 with value: 69706.27513829281.
[I 2024-09-25 11:01:03,452] Trial 4 finished with value: 69745.18598472566 and parameters: {'w1': 0.2118191076870971, 

{'w1': 0.06605878511525086, 'w2': 0.8229529009865497}


In [42]:
w1 = best_weights['w1']#CAT
w2 = best_weights['w2']#LIGHT
w3 = 1 - (w1 + w2)#XGB
print("w1",w1)
print("w2",w2)
print("w3",w3)


w1 0.06605878511525086
w2 0.8229529009865497
w3 0.11098831389819941


##### VOTINGREGRESSOR(ENSEMBLE) PIPELINE CV SCORES

In [30]:
# from sklearn.ensemble import VotingRegressor
# voting_reg = VotingRegressor(
#     estimators=[('xgb', xgb.XGBRegressor(**xgb_params)), ('lgb', lgb.LGBMRegressor(**best_params,verbose=0)),("cb",cat.CatBoostRegressor(**cb_params))
#                  ],
#     weights=[ w3,w2, w1]  # Custom weights for models
# )  # Custom weights for models
# pipeline_vote = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('model', voting_reg)])
# skf = KFold(n_splits=10, shuffle=True, random_state=42)

# cv_scores = cross_val_score(pipeline_vote, X_train, y_train, cv=skf,
#                             scoring="neg_root_mean_squared_error")

# for i, score in enumerate(cv_scores):
#     print(f"Fold {i + 1} score: {-score :.2f}%")

# print("---" * 8)
# print(f"Mean RMSE : {-cv_scores.mean() :.2f}")
# print(f"Standard Deviation: {-cv_scores.std() :.2f}")

VOTING REGRESSOR WITH WEIGHTS OPTIMIZED CV
- Fold 1 score: 73111.29
- Fold 2 score: 87081.32
- Fold 3 score: 66441.91
- Fold 4 score: 67279.11
- Fold 5 score: 63556.61
- Fold 6 score: 77916.73
- Fold 7 score: 68525.37
- Fold 8 score: 68780.20
- Fold 9 score: 85706.86
- Fold 10 score: 69376.15
------------------------
Ortalama RMSE : 72777.55
Standart Sapma: -7741.55

### COMBINING FULL TRAINING SET FOR BETTER MODEL  AFTER FINDING BEST HYPERPARAMETERS IN VALIDATION SET 

In [31]:
X_train_combined = pd.concat([X_train,X_val])
y_train_combined = pd.concat([y_train,y_val])

In [32]:
from sklearn.ensemble import VotingRegressor
voting_reg = VotingRegressor(
    estimators=[('xgb', xgb.XGBRegressor(**xgb_params)), ('lgb', lgb.LGBMRegressor(**best_params,verbose_eval=-1)),("cb",cat.CatBoostRegressor(**cb_params))
                 ],
    weights=[ w3,w2, w1]  # Custom weights for models
)  # Custom weights for models
pipeline_vote = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', voting_reg)])
pipeline_vote.fit(X_train_combined,y_train_combined)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1597
[LightGBM] [Info] Number of data points in the train set: 192542, number of used features: 16
[LightGBM] [Info] Start training from score 43892.074270
0:	learn: 78150.5599670	total: 50ms	remaining: 22.6s
1:	learn: 77555.6761967	total: 96.6ms	remaining: 21.8s
2:	learn: 77021.1205561	total: 146ms	remaining: 21.9s
3:	learn: 76540.1436952	total: 197ms	remaining: 22.1s
4:	learn: 76114.1899854	total: 242ms	remaining: 21.7s
5:	learn: 75725.8140642	total: 288ms	remaining: 21.5s
6:	learn: 75385.1820074	total: 335ms	remaining: 21.4s
7:	learn: 75067.1471122	total: 379ms	remaining: 21.2s
8:	learn: 74774.7545959	total: 427ms	remaining: 21.1s
9:	learn: 74514.9021127	total: 471ms	remaining: 20.9s
10:	learn: 74286.1495284	total: 520ms	remaining: 20.9s
11:	learn: 74070.4141041	total: 565ms	remaining: 20.8s
12:

In [33]:
test=preprocess_dataframe(test_data)

In [35]:
y_pred = pipeline_vote.predict(test)



In [36]:
subb = pd.DataFrame({"id":test_data.id,"price":y_pred})

In [37]:
subb.to_csv("XGB_CAT_LGBM_10FOLD_CV_ROBUST.csv",index=0)

In [38]:
subb

Unnamed: 0,id,price
0,188533,18040.955956
1,188534,79529.938169
2,188535,58083.102938
3,188536,22935.323570
4,188537,30221.475681
...,...,...
125685,314218,27946.888464
125686,314219,46619.336539
125687,314220,22393.558808
125688,314221,16701.043530
