In [9]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.stats import uniform, randint, loguniform
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Set random seed for reproducibility
np.random.seed(42)

# Configuration
DATASET_PATH = 'dataset/'
TRAINING_DATA_PATH = DATASET_PATH + "train.csv"
TESTING_DATA_PATH = DATASET_PATH + "testFeatures.csv"
LOW_QUANTILE = 0.02
UP_QUANTILE = 0.99
CAT_THRESHOLD = 5
CAR_THRESHOLD = 10
CORRELATION_THRESHOLD = 0.40
CAT_LENGTH = 8
NUM_METHOD = "median"

# Helper Functions
def check_df(dataframe):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(3))
    print("##################### Tail #####################")
    print(dataframe.tail(3))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

def grab_col_names(dataframe, cat_th=CAT_THRESHOLD, car_th=CAR_THRESHOLD):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtype == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and dataframe[col].dtype != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and dataframe[col].dtype == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtype != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols

def cat_summary(dataframe, col_name, plot=False):
    print(pd.DataFrame({col_name: dataframe[col_name].value_counts(),
                        "Ratio": 100 * dataframe[col_name].value_counts() / len(dataframe)}))
    if plot:
        sns.countplot(x=dataframe[col_name], data=dataframe)
        plt.xticks(rotation=45)
        plt.savefig(f'{col_name}_countplot.png')
        plt.close()

def num_summary(dataframe, numerical_col, plot=False):
    quantiles = [0.05, 0.10, 0.50, 0.90, 0.95, 0.99]
    print(dataframe[numerical_col].describe(quantiles).T)
    if plot:
        dataframe[numerical_col].hist(bins=50)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.savefig(f'{numerical_col}_histogram.png')
        plt.close()
    print("#####################################")

def target_summary_with_cat(dataframe, target, categorical_col):
    print(pd.DataFrame({"TARGET_MEAN": dataframe.groupby(categorical_col)[target].mean()}), end="\n\n\n")

def high_correlated_cols(dataframe, plot=False, corr_th=CORRELATION_THRESHOLD):
    corr = dataframe.corr(numeric_only=True)
    cor_matrix = corr.abs()
    upper_triangle_matrix = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    drop_list = [col for col in upper_triangle_matrix.columns if any(upper_triangle_matrix[col] > corr_th)]
    if plot:
        sns.heatmap(corr, cmap="RdBu", annot=True)
        plt.savefig('correlation_heatmap.png')
        plt.close()
    return drop_list

def outlier_thresholds(dataframe, variable, low_quantile=LOW_QUANTILE, up_quantile=UP_QUANTILE):
    q1 = dataframe[variable].quantile(low_quantile)
    q3 = dataframe[variable].quantile(up_quantile)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return lower_bound, upper_bound

def category_outlier_thresholds(df, col, category_col='ürün kategorisi'):
    thresholds = {}
    for category in df[category_col].unique():
        cat_df = df[df[category_col] == category]
        low, up = outlier_thresholds(cat_df, col)
        thresholds[category] = (low, up)
    return thresholds

def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)

def replace_with_thresholds(dataframe, variable, category_col='ürün kategorisi'):
    thresholds = category_outlier_thresholds(dataframe, variable, category_col)
    for category, (low, up) in thresholds.items():
        dataframe.loc[(dataframe[category_col] == category) & (dataframe[variable] < low), variable] = low
        dataframe.loc[(dataframe[category_col] == category) & (dataframe[variable] > up), variable] = up
    return dataframe

def missing_values_table(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]
    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")
    if na_name:
        return na_columns

def remove_missing_values(dataframe):
    print("##################### Missing Values Before #####################")
    print(dataframe.isnull().sum())
    dataframe_cleaned = dataframe.dropna()
    print("##################### Missing Values After #####################")
    print(dataframe_cleaned.isnull().sum())
    return dataframe_cleaned

def quick_missing_imp(data, num_method=NUM_METHOD, cat_length=CAT_LENGTH, target="ürün fiyatı"):
    variables_with_na = [col for col in data.columns if data[col].isnull().sum() > 0]
    temp_target = data[target] if target in data.columns else None
    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n")
    data = data.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= cat_length) else x, axis=0)
    if num_method == "mean":
        data = data.apply(lambda x: x.fillna(x.mean()) if x.dtype != "O" else x, axis=0)
    elif num_method == "median":
        data = data.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0)
    if temp_target is not None:
        data[target] = temp_target
    print("# AFTER")
    print("Categorical variables filled with mode")
    print(f"Numerical variables filled with {num_method}")
    print(data[variables_with_na].isnull().sum(), "\n")
    return data

def rare_analyser(dataframe, target, cat_cols):
    for col in cat_cols:
        print(col, ":", len(dataframe[col].value_counts()))
        print(pd.DataFrame({"COUNT": dataframe[col].value_counts(),
                            "RATIO": dataframe[col].value_counts() / len(dataframe),
                            "TARGET_MEAN": dataframe.groupby(col)[target].mean()}), end="\n\n\n")

def rare_encoder(dataframe, rare_perc):
    temp_df = dataframe.copy()
    rare_columns = [col for col in temp_df.columns if temp_df[col].dtype == 'O'
                    and (temp_df[col].value_counts() / len(temp_df) < rare_perc).any(axis=None)]
    for var in rare_columns:
        tmp = temp_df[var].value_counts() / len(temp_df)
        rare_labels = tmp[tmp < rare_perc].index
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), 'Rare', temp_df[var])
    return temp_df

def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

# DataLoader Class
class DataLoader:
    def __init__(self, training_data_path, testing_data_path):
        self.training_data_path = training_data_path
        self.testing_data_path = testing_data_path
        print("Initializing DataLoader...")

    def get_data(self):
        print("Loading data...")
        train = pd.read_csv(self.training_data_path)
        test = pd.read_csv(self.testing_data_path)
        df = pd.concat([train, test], ignore_index=True)
        df = df.reset_index(drop=True)
        print("Data loaded successfully.")
        return df

# DataPreprocessing Class
class DataPreprocessing:
    def __init__(self, dataframe):
        self.df = dataframe.copy()

    def preprocess(self, is_test_only=False):
        self.handle_outliers()
        self.handle_missing_values()
        self.feature_engineering()
        self.drop_unnecessary_columns()
        self.encode_features()

        if is_test_only:
            test_data = self.df[self.df['ürün fiyatı'].isnull()].drop('ürün fiyatı', axis=1)
            test_ids = test_data["id"].copy()
            test_data = test_data.drop(columns=['id'])
            return test_data, test_ids
        else:
            train_data = self.df[self.df['ürün fiyatı'].notnull()]
            train_data = train_data.drop(columns=['id'])
            X = train_data.drop('ürün fiyatı', axis=1)
            y = train_data['ürün fiyatı']
            y_log = np.log1p(y)  # Log transformation for target
            X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.10, random_state=42)
            return X_train, X_val, y_train, y_val, y  # Return original y for error analysis

    def handle_outliers(self):
        num_cols = self.df.select_dtypes(include=np.number).columns
        num_cols = [col for col in num_cols if col != 'ürün fiyatı']
        for col in num_cols:
            if check_outlier(self.df, col):
                self.df = replace_with_thresholds(self.df, col)

    def handle_missing_values(self):
        num_cols = self.df.select_dtypes(include=np.number).columns
        num_cols = [col for col in num_cols if col != 'ürün fiyatı']
        self.df[num_cols] = self.df.groupby('ürün kategorisi')[num_cols].transform(lambda x: x.fillna(x.median()))
        cat_cols = self.df.select_dtypes(include='object').columns
        for col in cat_cols:
            self.df[col] = self.df[col].fillna(self.df[col].mode()[0])

    def feature_engineering(self):
        # Time-based features
        self.df['tarih'] = pd.to_datetime(self.df['tarih'])
        self.df['ay'] = self.df['tarih'].dt.month
        self.df['çeyrek'] = self.df['tarih'].dt.quarter
        self.df['haftanın_günü'] = self.df['tarih'].dt.weekday

        # Product and category-based features
        self.df['besin_değeri_log'] = np.log1p(self.df['ürün besin değeri'])
        self.df['kategori_ortalama_besin'] = self.df.groupby('ürün kategorisi')['ürün besin değeri'].transform('mean')
        self.df['ürün_ortalama_fiyat'] = self.df.groupby('ürün')['ürün fiyatı'].transform('mean')
        self.df['kategori_fiyat_std'] = self.df.groupby('ürün kategorisi')['ürün fiyatı'].transform('std')
        self.df['besin_değeri_kategori'] = pd.qcut(self.df['ürün besin değeri'], q=3, labels=['düşük', 'orta', 'yüksek'])
        self.df['ürün_freq'] = self.df['ürün'].map(self.df['ürün'].value_counts() / len(self.df))

    def drop_unnecessary_columns(self):
        columns_to_drop = ['ürün üretim yeri', 'market', 'şehir']
        self.df.drop(columns=[col for col in columns_to_drop if col in self.df.columns], inplace=True)

    def encode_features(self):
        cat_cols, cat_but_car, num_cols = grab_col_names(self.df)
        binary_cols = [col for col in cat_cols if self.df[col].nunique() <= 3]
        for col in binary_cols:
            self.df = label_encoder(self.df, col)
        high_cardinality_cols = cat_but_car + [col for col in cat_cols if col not in binary_cols]
        for col in high_cardinality_cols:
            if col in self.df.columns:
                train_data = self.df[self.df['ürün fiyatı'].notnull()]
                target_means = train_data.groupby(col)['ürün fiyatı'].mean()
                self.df[col] = self.df[col].map(target_means).fillna(train_data['ürün fiyatı'].mean())
        remaining_cat_cols = [col for col in cat_cols if col not in binary_cols and col not in high_cardinality_cols]
        if remaining_cat_cols:
            self.df = one_hot_encoder(self.df, remaining_cat_cols, drop_first=True)
        remaining_object_cols = self.df.select_dtypes(include='object').columns.tolist()
        if remaining_object_cols:
            raise ValueError(f"Categorical columns not fully encoded: {remaining_object_cols}")

# HyperTuner Class
class HyperTuner:
    def __init__(self):
        self.param_grids = {
     "CatBoost": {
    'iterations': [100],  # Sabit kalabilir, zaten yeterli
    'learning_rate': [0.01, 0.02],  # 0.01 iyi, belki biraz daha hızlı eğitim için 0.02 de denenebilir
    'depth': [6, 8],  # 8 başarılı, 6 alternatif olarak yeterli
    'l2_leaf_reg': [5, 7, 9],  # 7 iyi, biraz etrafı denenebilir
    'bagging_temperature': [0.5, 1],  # 1 işe yarıyor, ama 0.5 ile genel performans kontrolü
    'border_count': [64, 128],  # 64 işe yarıyor, 128 ile karşılaştırma yapılabilir
    'grow_policy': ['Lossguide', 'Depthwise']  # 'Lossguide' işe yarıyor, sadece bir alternatif bırak
}
,
            "XGBoost": {
                'n_estimators': randint(50, 200),
                'learning_rate': loguniform(0.005, 0.2),
                'max_depth': [3, 5, 7, 9],
                'subsample': uniform(0.6, 0.4),
                'colsample_bytree': uniform(0.6, 0.4)
            },
            "LightGBM": {
                'n_estimators': randint(50, 200),
                'learning_rate': loguniform(0.005, 0.2),
                'num_leaves': randint(20, 50),
                'max_depth': [3, 5, 7, -1],
                'subsample': uniform(0.6, 0.4),
                'colsample_bytree': uniform(0.6, 0.4)
            }
        }
        self.models = {
            "CatBoost": CatBoostRegressor(silent=True, random_state=17),
            "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=17),
            "LightGBM": LGBMRegressor(random_state=17),
        }

    def tune_model(self, model_name, X, y):
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found in models list.")
        model = self.models[model_name]
        param_grid = self.param_grids[model_name]
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            n_iter=10,
            cv=3,
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
            random_state=42
        )
        search.fit(X, y)
        return search.best_estimator_, search.best_params_

# ModelEvaluator Class
class ModelEvaluator:
    def __init__(self, output_dir="predictions"):
        self.model_names = [
            "CatBoost",
        ]
        self.best_model_name = "CatBoost"
        self.rmse_scores = {}
        self.mae_scores = {}
        self.best_params = {}
        self.tuner = HyperTuner()
        self.trained_models = {}
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def evaluate_models(self, X, y, X_val, y_val, y_val_original):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=17)
        print("Evaluating models with hyperparameter tuning...")
        for name in self.model_names:
            best_model, best_params = self.tuner.tune_model(name, X_train, y_train)
            self.trained_models[name] = best_model
            self.best_params[name] = best_params
            rmse = np.mean(np.sqrt(-cross_val_score(best_model, X, y, cv=5, scoring="neg_mean_squared_error")))
            self.rmse_scores[name] = rmse
            mae = np.mean(-cross_val_score(best_model, X, y, cv=5, scoring="neg_mean_absolute_error"))
            self.mae_scores[name] = mae
            print(f"RMSE: {round(rmse, 4)} | MAE: {round(mae, 4)} ({name})")
            if self.best_params[name]:
                print(f"Best parameters: {self.best_params[name]}")
            # Category-based error analysis
            y_pred_val = np.expm1(best_model.predict(X_val))
            val_data = X_val.copy()
            val_data['y_true'] = y_val_original
            val_data['y_pred'] = y_pred_val
            print(f"\nCategory-based MAE for {name}:")
            print(val_data.groupby('ürün kategorisi').apply(lambda x: mean_absolute_error(x['y_true'], x['y_pred'])))

        # Stacking Ensemble
        print("\nTraining Stacking Ensemble...")
        estimators = [(name, self.trained_models[name]) for name in self.model_names]
        stacking_model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
        stacking_model.fit(X_train, y_train)
        self.trained_models['Stacking'] = stacking_model
        rmse = np.mean(np.sqrt(-cross_val_score(stacking_model, X, y, cv=5, scoring="neg_mean_squared_error")))
        self.rmse_scores['Stacking'] = rmse
        mae = np.mean(-cross_val_score(stacking_model, X, y, cv=5, scoring="neg_mean_absolute_error"))
        self.mae_scores['Stacking'] = mae
        print(f"RMSE: {round(rmse, 4)} | MAE: {round(mae, 4)} (Stacking)")

        best_model_mae = min(self.mae_scores, key=self.mae_scores.get)
        print(f"\nBest model based on MAE: {best_model_mae} (MAE: {round(self.mae_scores[best_model_mae], 4)})")
        best_model_rmse = min(self.rmse_scores, key=self.rmse_scores.get)
        print(f"Best model based on RMSE: {best_model_rmse} (RMSE: {round(self.rmse_scores[best_model_rmse], 4)})")
        return X_train, X_test, y_train, y_test

    def train_and_predict(self, X_train, y_train, X_test, test_ids, output_file="submission.csv"):
        print(f"\n⏳ Training {self.best_model_name} model...")
        import time
        start_time = time.time()
        best_model, best_params = self.tuner.tune_model(self.best_model_name, X_train, y_train)
        self.best_params[self.best_model_name] = best_params
        best_model.fit(X_train, y_train)
        training_time = time.time() - start_time
        print(f"✅ Training completed in {training_time:.2f} seconds")
        print(f"🏆 Best parameters: {best_params}")
        predictions = np.expm1(best_model.predict(X_test))  # Inverse log transformation
        submission_df = pd.DataFrame({
            "id": test_ids.astype(int),
            "ürün fiyatı": predictions.astype(float)
        })
        output_path = os.path.join(self.output_dir, output_file)
        submission_df.to_csv(output_path, index=False, float_format='%.4f')
        print(f"\n📁 Predictions saved to '{output_path}'")
        print(f"Sample predictions:\n{submission_df.head()}")
        return predictions

    def get_rmse_scores(self):
        return self.rmse_scores

    def get_mae_scores(self):
        return self.mae_scores

    def get_best_params(self):
        return self.best_params

# Main Function
def main():
    import time
    start_time = time.time()
    print("🚀 Starting product price prediction pipeline...")
    print("\n📂 Loading data...")
    data_loader = DataLoader(TRAINING_DATA_PATH, TESTING_DATA_PATH)
    combined_df = data_loader.get_data()
    print(f"✅ Data loaded. Shape: {combined_df.shape}")
    print("\n🔧 Preprocessing data...")
    preprocessor = DataPreprocessing(combined_df)
    X_train, X_val, y_train, y_val, y_val_original = preprocessor.preprocess()
    print(f"✅ Training data prepared. Features: {X_train.shape[1]}, Samples: {X_train.shape[0]}")
    X_test_submission, test_ids = preprocessor.preprocess(is_test_only=True)
    print(f"✅ Test data prepared. Samples: {X_test_submission.shape[0]}")
    print("\n🧪 Evaluating models...")
    evaluator = ModelEvaluator(output_dir="predictions/Boosting")
    evaluator.evaluate_models(X_train, y_train, X_val, y_val, y_val_original)
    print("\n🔮 Making final predictions...")
    predictions = evaluator.train_and_predict(
        X_train,
        y_train,
        X_test_submission,
        test_ids,
        output_file="submission.csv"
    )
    total_time = time.time() - start_time
    print(f"\n🎉 Pipeline completed in {total_time:.2f} seconds!")

if __name__ == "__main__":
    main()

🚀 Starting product price prediction pipeline...

📂 Loading data...
Initializing DataLoader...
Loading data...
Data loaded successfully.
✅ Data loaded. Shape: (273024, 9)

🔧 Preprocessing data...
Observations: 273024
Variables: 15
cat_cols: 4
num_cols: 10
cat_but_car: 1
num_but_cat: 3
✅ Training data prepared. Features: 13, Samples: 204768


  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)
  return dataframe[(dataframe[col_name] > up_limit) | (dataframe

Observations: 273024
Variables: 15
cat_cols: 3
num_cols: 12
cat_but_car: 0
num_but_cat: 3
✅ Test data prepared. Samples: 45504

🧪 Evaluating models...
Evaluating models with hyperparameter tuning...
RMSE: 0.2893 | MAE: 0.2224 (CatBoost)
Best parameters: {'learning_rate': 0.01, 'l2_leaf_reg': 7, 'iterations': 100, 'grow_policy': 'Lossguide', 'depth': 8, 'border_count': 64, 'bagging_temperature': 1}

Category-based MAE for CatBoost:
ürün kategorisi
9.976630      2.555764
10.420985     1.792725
15.534468     3.423504
26.588236     7.292074
31.349666     8.982162
36.961375    15.158272
dtype: float64

Training Stacking Ensemble...


  print(val_data.groupby('ürün kategorisi').apply(lambda x: mean_absolute_error(x['y_true'], x['y_pred'])))


RMSE: 0.0856 | MAE: 0.0647 (Stacking)

Best model based on MAE: Stacking (MAE: 0.0647)
Best model based on RMSE: Stacking (RMSE: 0.0856)

🔮 Making final predictions...

⏳ Training CatBoost model...
✅ Training completed in 67.37 seconds
🏆 Best parameters: {'learning_rate': 0.01, 'l2_leaf_reg': 7, 'iterations': 100, 'grow_policy': 'Lossguide', 'depth': 8, 'border_count': 64, 'bagging_temperature': 1}

📁 Predictions saved to 'predictions/Boosting/submission.csv'
Sample predictions:
        id  ürün fiyatı
227520   0    39.523662
227521   1    23.880532
227522   2    25.886491
227523   3    19.666484
227524   4    29.171648

🎉 Pipeline completed in 267.67 seconds!
