In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import randint, uniform

# Seed NumPy's global random generator so repeated runs are reproducible.
np.random.seed(42)

# --- Input locations ---------------------------------------------------------
DATASET_PATH = 'dataset/'  # Adjust for Kaggle dataset path
TRAINING_DATA_PATH = f"{DATASET_PATH}train.csv"
TESTING_DATA_PATH = f"{DATASET_PATH}testFeatures.csv"

# --- Outlier handling: default quantiles for outlier_thresholds --------------
LOW_QUANTILE, UP_QUANTILE = 0.1, 0.9

# --- Column typing: default thresholds for grab_col_names --------------------
CAT_THRESHOLD = 10  # non-object columns with fewer unique values are treated as categorical
CAR_THRESHOLD = 10  # object columns with more unique values are treated as high-cardinality

# --- Imputation: defaults for quick_missing_imp ------------------------------
CAT_LENGTH = 30  # object columns with at most this many unique values get mode-filled
NUM_METHOD = "median"  # statistic used to fill non-object columns ("mean" or "median")

# Helper Functions
def check_df(dataframe):
    """Print a quick structural overview of ``dataframe``.

    The report contains, in order: shape, dtypes, column names, the first and
    last three rows, per-column null counts and a quantile table.

    Args:
        dataframe: The pandas DataFrame to describe.

    Returns:
        None. Everything is written to stdout.
    """
    def _section(title):
        print(f"##################### {title} #####################")

    _section("Shape")
    print(dataframe.shape)
    _section("Types")
    print(dataframe.dtypes)
    _section("Columns")
    print(dataframe.columns.tolist())
    _section("Head")
    print(dataframe.head(3))
    _section("Tail")
    print(dataframe.tail(3))
    _section("NA")
    print(dataframe.isnull().sum())
    _section("Quantiles")
    # Quantiles are only defined for numeric data; calling DataFrame.quantile
    # on a frame that still holds object columns raises TypeError on recent
    # pandas versions, so restrict the table to the numeric columns.
    numeric_part = dataframe.select_dtypes(include="number")
    if numeric_part.shape[1] == 0:
        print("No numeric columns.")
    else:
        print(numeric_part.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

def grab_col_names(dataframe, cat_th=CAT_THRESHOLD, car_th=CAR_THRESHOLD):
    """Split the columns of ``dataframe`` into categorical / cardinal / numeric.

    Classification rules:
      * object-dtype columns are categorical, unless they have more than
        ``car_th`` unique values, in which case they are "categorical but
        cardinal";
      * non-object columns with fewer than ``cat_th`` unique values are
        "numeric but categorical" and are reported with the categorical ones;
      * the remaining non-object columns are numeric.

    A short summary of the counts is printed.

    Args:
        dataframe: DataFrame whose columns are classified.
        cat_th: Unique-value threshold below which a non-object column is
            treated as categorical.
        car_th: Unique-value threshold above which an object column is
            treated as high-cardinality.

    Returns:
        Tuple ``(cat_cols, cat_but_car, num_cols)`` of column-name lists.
        Note the order: the high-cardinality list is the second element.
    """
    all_columns = list(dataframe.columns)
    is_object = {name: dataframe[name].dtype == "O" for name in all_columns}
    unique_count = {name: dataframe[name].nunique() for name in all_columns}

    object_cols = [name for name in all_columns if is_object[name]]
    num_but_cat = [
        name for name in all_columns
        if unique_count[name] < cat_th and not is_object[name]
    ]
    cat_but_car = [
        name for name in all_columns
        if unique_count[name] > car_th and is_object[name]
    ]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [
        name for name in all_columns
        if not is_object[name] and name not in num_but_cat
    ]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols

def cat_summary(dataframe, col_name, plot=False):
    """Print value counts and percentage share for one categorical column.

    Args:
        dataframe: Source DataFrame.
        col_name: Name of the column to summarise.
        plot: When true, additionally save a count plot to
            ``<col_name>_countplot.png`` in the working directory.

    Returns:
        None.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            "Ratio": 100 * counts / len(dataframe)})
    print(summary)

    if not plot:
        return

    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.xticks(rotation=45)
    plt.savefig(f'{col_name}_countplot.png')
    plt.close()  # release the figure so later plots start from a clean canvas

def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics for one numeric column.

    The description includes the 5th, 10th, 50th, 90th, 95th and 99th
    percentiles, followed by a separator line.

    Args:
        dataframe: Source DataFrame.
        numerical_col: Name of the column to describe.
        plot: When true, additionally save a 50-bin histogram to
            ``<numerical_col>_histogram.png`` in the working directory.

    Returns:
        None.
    """
    percentiles = [0.05, 0.10, 0.50, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    description = column.describe(percentiles=percentiles)
    print(description.T)

    if plot:
        column.hist(bins=50)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.savefig(f'{numerical_col}_histogram.png')
        plt.close()  # release the figure so later plots start from a clean canvas

    print("########10#############################")

def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for every level of ``categorical_col``.

    Args:
        dataframe: Source DataFrame.
        target: Name of the column to average.
        categorical_col: Name of the column to group by.

    Returns:
        None. The table is printed, followed by three newlines.
    """
    group_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": group_means})
    print(summary, end="\n\n\n")

def outlier_thresholds(dataframe, variable, low_quantile=LOW_QUANTILE, up_quantile=UP_QUANTILE):
    """Compute lower/upper outlier bounds for one column.

    The bounds follow the usual ``quantile -/+ 1.5 * range`` rule, where the
    range is the distance between the two chosen quantiles (these need not be
    the quartiles; the defaults come from the module configuration).

    Args:
        dataframe: Source DataFrame.
        variable: Name of the column to analyse.
        low_quantile: Lower quantile, as a fraction in [0, 1].
        up_quantile: Upper quantile, as a fraction in [0, 1].

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    column = dataframe[variable]
    lower_q = column.quantile(low_quantile)
    upper_q = column.quantile(up_quantile)
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

def category_outlier_thresholds(df, col, category_col='ürün kategorisi'):
    """Compute outlier bounds for ``col`` separately within each category.

    Args:
        df: Source DataFrame.
        col: Name of the column whose bounds are computed.
        category_col: Name of the column that defines the groups.

    Returns:
        Dict mapping each unique value of ``category_col`` to the
        ``(lower_bound, upper_bound)`` tuple returned by
        ``outlier_thresholds`` for the rows of that category.
    """
    def _bounds_for(category):
        rows_in_category = df[df[category_col] == category]
        lower, upper = outlier_thresholds(rows_in_category, col)
        return (lower, upper)

    return {category: _bounds_for(category) for category in df[category_col].unique()}

def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` contains any value outside its outlier bounds.

    Bounds come from ``outlier_thresholds`` with its default quantiles.
    Datetime columns (timezone-aware ones included) are never considered to
    have outliers.

    Args:
        dataframe: Source DataFrame.
        col_name: Name of the column to check.

    Returns:
        ``True`` if at least one value lies strictly below the lower bound or
        strictly above the upper bound, otherwise ``False``.
    """
    values = dataframe[col_name]
    # dtype kind "M" is datetime64, with or without a timezone.
    if getattr(values.dtype, "kind", None) == "M":
        return False

    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    is_outlier = (values > up_limit) | (values < low_limit)
    # Ask whether any row is flagged. Testing the truthiness of the cells in
    # the flagged rows instead would miss, e.g., an outlier whose value is 0.
    return bool(is_outlier.any())

def replace_with_thresholds(dataframe, variable, category_col='ürün kategorisi'):
    """Clip ``variable`` to per-category outlier bounds, in place.

    For every category, values below the category's lower bound are set to
    that bound and values above the upper bound are set to the upper bound.
    Bounds come from ``category_outlier_thresholds``.

    Args:
        dataframe: DataFrame to modify. It is mutated in place.
        variable: Name of the column to clip.
        category_col: Name of the column that defines the groups.

    Returns:
        The same ``dataframe`` object, after modification.
    """
    def _rows_of(category):
        return dataframe[category_col] == category

    bounds_by_category = category_outlier_thresholds(dataframe, variable, category_col)
    for category in bounds_by_category:
        low, up = bounds_by_category[category]
        below_low = _rows_of(category) & (dataframe[variable] < low)
        dataframe.loc[below_low, variable] = low
        above_up = _rows_of(category) & (dataframe[variable] > up)
        dataframe.loc[above_up, variable] = up
    return dataframe

def missing_values_table(dataframe, na_name=False):
    """Print a table of missing-value counts and percentages per column.

    Only columns that contain at least one null are listed, sorted by
    descending count.

    Args:
        dataframe: Source DataFrame.
        na_name: When true, return the names of the columns with nulls.

    Returns:
        List of column names with at least one null when ``na_name`` is true,
        otherwise ``None``.
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    percent_missing = dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100
    ratio = percent_missing.sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

def quick_missing_imp(data, num_method=NUM_METHOD, cat_length=CAT_LENGTH, target="ürün fiyatı"):
    """Fill missing values column by column, leaving the target untouched.

    Object columns with at most ``cat_length`` unique values are filled with
    their mode; non-object columns are filled with their mean or median,
    depending on ``num_method``. Null counts are printed before and after.

    Args:
        data: DataFrame to impute. A new DataFrame is returned.
        num_method: ``"mean"`` or ``"median"``; any other value skips the
            numeric fill.
        cat_length: Maximum number of unique values for an object column to
            be mode-filled.
        target: Name of the target column. If present, its original values
            (nulls included) are restored after imputation.

    Returns:
        The imputed DataFrame.
    """
    def _fill_small_categorical(column):
        qualifies = column.dtype == "O" and len(column.unique()) <= cat_length
        return column.fillna(column.mode()[0]) if qualifies else column

    def _numeric_filler(statistic):
        def _fill(column):
            if column.dtype != "O":
                return column.fillna(getattr(column, statistic)())
            return column
        return _fill

    variables_with_na = [col for col in data.columns if data[col].isnull().sum() > 0]
    # Keep the untouched target so that its nulls survive the imputation.
    temp_target = data[target] if target in data.columns else None

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n")

    data = data.apply(_fill_small_categorical, axis=0)
    if num_method in ("mean", "median"):
        data = data.apply(_numeric_filler(num_method), axis=0)

    if temp_target is not None:
        data[target] = temp_target

    print("# AFTER")
    print("Categorical variables filled with mode")
    print(f"Numerical variables filled with {num_method}")
    print(data[variables_with_na].isnull().sum(), "\n")
    return data

def label_encoder(dataframe, binary_col):
    """Replace ``binary_col`` with integer codes from a fresh ``LabelEncoder``.

    Args:
        dataframe: DataFrame to modify. It is mutated in place.
        binary_col: Name of the column to encode.

    Returns:
        The same ``dataframe`` object, after modification.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """One-hot encode the given columns with ``pandas.get_dummies``.

    Args:
        dataframe: Source DataFrame. It is not modified.
        categorical_cols: Names of the columns to expand into indicators.
        drop_first: When true, drop the first indicator of every column.

    Returns:
        A new DataFrame in which the listed columns are replaced by their
        indicator columns.
    """
    encoded = pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
    )
    return encoded

# DataLoader Class
class DataLoader:
    """Read the training and testing CSV files and stack them into one frame."""

    def __init__(self, training_data_path, testing_data_path):
        """Remember the two CSV locations; nothing is read yet.

        Args:
            training_data_path: Path of the training CSV.
            testing_data_path: Path of the testing CSV.
        """
        self.training_data_path = training_data_path
        self.testing_data_path = testing_data_path
        print("Initializing DataLoader...")

    def get_data(self):
        """Load both CSV files and return them concatenated row-wise.

        Training rows come first, followed by testing rows; the result has a
        fresh 0..n-1 index. Columns missing from one file are filled with
        nulls by the concatenation.

        Returns:
            The combined DataFrame.
        """
        print("Loading data...")
        frames = [
            pd.read_csv(path)
            for path in (self.training_data_path, self.testing_data_path)
        ]
        combined = pd.concat(frames, ignore_index=True).reset_index(drop=True)
        print("Data loaded successfully.")
        return combined

# DataPreprocessing Class
class DataPreprocessing:
    """Clean, enrich and encode the combined train/test frame.

    The instance works on a private copy of the input frame. Rows whose
    target (``'ürün fiyatı'``) is null are treated as test rows; all others
    are training rows.

    The transformation pipeline mutates ``self.df`` and is therefore executed
    only once per instance, no matter how often ``preprocess`` is called.
    Running it a second time would re-clip, re-encode and re-derive features
    from already transformed columns (and would no longer find the date
    column, which is dropped on the first pass), so train and test features
    would be built inconsistently.
    """

    _TARGET_COL = 'ürün fiyatı'
    # Column names that are recognised as the date column, in priority order.
    _DATE_COLUMN_CANDIDATES = ('tarih', 'Tarih', 'date', 'Date', 'tarih_yil', 'timestamp')

    def __init__(self, dataframe):
        """Store a copy of ``dataframe`` so the caller's frame is not mutated."""
        self.df = dataframe.copy()
        self._is_transformed = False

    def preprocess(self, is_test_only=False):
        """Run the pipeline (once) and return model-ready data.

        Args:
            is_test_only: When false, return the training split; when true,
                return the test rows.

        Returns:
            If ``is_test_only`` is false:
                ``(X_train, X_val, y_train, y_val, y)`` where the split is
                90/10 with ``random_state=42``, ``y_train``/``y_val`` are
                ``log1p`` of the target and ``y`` is the untransformed target
                of all training rows (indexed like the original rows).
            If ``is_test_only`` is true:
                ``(test_data, test_ids)`` - the feature frame of the test
                rows and their ``id`` column.

        Raises:
            ValueError: If a feature column has a dtype other than int64,
                float64 or int32, or if object columns survive encoding.
        """
        self._ensure_transformed()
        target = self._TARGET_COL

        if is_test_only:
            test_data = self.df[self.df[target].isnull()].drop(target, axis=1)
            test_ids = test_data["id"].copy()
            test_data = test_data.drop(columns=['id'])
            self._require_numeric(test_data, "X_test")
            return test_data, test_ids

        train_data = self.df[self.df[target].notnull()].drop(columns=['id'])
        X = train_data.drop(target, axis=1)
        y = train_data[target]
        y_log = np.log1p(y)
        X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.10, random_state=42)
        self._require_numeric(X_train, "X_train")
        return X_train, X_val, y_train, y_val, y

    def _ensure_transformed(self):
        """Run the mutating pipeline steps exactly once for this instance."""
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, "_is_transformed", False):
            return
        self.handle_outliers()
        self.handle_missing_values()
        self.feature_engineering()
        self.drop_unnecessary_columns()
        self.encode_features()
        self._is_transformed = True

    @staticmethod
    def _require_numeric(frame, label):
        """Raise ValueError if ``frame`` has a column that is not int64/float64/int32."""
        non_numeric_cols = frame.select_dtypes(exclude=['int64', 'float64', 'int32']).columns
        # Check the length explicitly: the truthiness of the labels themselves
        # (what ``Index.any()`` evaluates) says nothing about emptiness.
        if len(non_numeric_cols) > 0:
            raise ValueError(f"Non-numeric columns found in {label}: {non_numeric_cols}")

    def _numeric_feature_columns(self):
        """Return the numeric column names, excluding the target."""
        numeric = self.df.select_dtypes(include=np.number).columns
        return [col for col in numeric if col != self._TARGET_COL]

    def handle_outliers(self):
        """Clip every numeric non-target column to per-category outlier bounds."""
        # NOTE(review): 'id' is numeric as well and is therefore also a
        # candidate for clipping here - confirm that this is intended.
        for col in self._numeric_feature_columns():
            if check_outlier(self.df, col):
                self.df = replace_with_thresholds(self.df, col)

    def handle_missing_values(self):
        """Fill numeric nulls with the per-category median and object nulls with the mode."""
        num_cols = self._numeric_feature_columns()
        self.df[num_cols] = self.df.groupby('ürün kategorisi')[num_cols].transform(lambda x: x.fillna(x.median()))
        cat_cols = self.df.select_dtypes(include='object').columns
        for col in cat_cols:
            self.df[col] = self.df[col].fillna(self.df[col].mode()[0])

    def feature_engineering(self):
        """Add calendar, nutrition, price-aggregate and frequency features."""
        date_col = next(
            (col for col in self._DATE_COLUMN_CANDIDATES if col in self.df.columns),
            None,
        )

        if date_col:
            print(f"Found date column: {date_col}")
            self.df[date_col] = pd.to_datetime(self.df[date_col])
            self.df['ay'] = self.df[date_col].dt.month
            self.df['çeyrek'] = self.df[date_col].dt.quarter
            self.df['haftanın_günü'] = self.df[date_col].dt.weekday
        else:
            # Keep the feature set stable even without a date column.
            print("No date column found. Skipping time-based features.")
            self.df['ay'] = 0
            self.df['çeyrek'] = 0
            self.df['haftanın_günü'] = 0

        self.df['besin_değeri_log'] = np.log1p(self.df['ürün besin değeri'])
        self.df['kategori_ortalama_besin'] = self.df.groupby('ürün kategorisi')['ürün besin değeri'].transform('mean')
        # NOTE(review): the two price aggregates below are computed from the
        # target over all labelled rows, i.e. before the train/validation
        # split in preprocess(), so validation rows contribute their own
        # prices to these features.
        self.df['ürün_ortalama_fiyat'] = self.df.groupby('ürün')[self._TARGET_COL].transform('mean')
        self.df['kategori_fiyat_std'] = self.df.groupby('ürün kategorisi')[self._TARGET_COL].transform('std')
        self.df['besin_değeri_kategori'] = pd.qcut(self.df['ürün besin değeri'], q=3, labels=['düşük', 'orta', 'yüksek'])
        self.df['ürün_freq'] = self.df['ürün'].map(self.df['ürün'].value_counts() / len(self.df))

    def drop_unnecessary_columns(self):
        """Drop location/market columns and any recognised date column, if present."""
        columns_to_drop = ['ürün üretim yeri', 'market', 'şehir', *self._DATE_COLUMN_CANDIDATES]
        self.df.drop(columns=[col for col in columns_to_drop if col in self.df.columns], inplace=True)

    def encode_features(self):
        """Turn every categorical column into numbers.

        Columns classified as categorical with at most three unique values
        are label-encoded. All other categorical columns, including the
        high-cardinality ones, are replaced by the mean target of their level,
        computed on the labelled rows; levels unseen there get the overall
        mean target. Since every non-binary categorical column is
        target-encoded, nothing is left over for one-hot encoding.

        Raises:
            ValueError: If object-dtype columns remain afterwards.
        """
        target = self._TARGET_COL
        cat_cols, cat_but_car, _ = grab_col_names(self.df)

        # The target must never be encoded: its nulls mark the test rows.
        binary_cols = [col for col in cat_cols if col != target and self.df[col].nunique() <= 3]
        for col in binary_cols:
            self.df = label_encoder(self.df, col)

        high_cardinality_cols = cat_but_car + [
            col for col in cat_cols if col != target and col not in binary_cols
        ]
        # The labelled rows and their overall mean do not change inside the
        # loop (each iteration only rewrites its own column), so compute once.
        labelled_rows = self.df[self.df[target].notnull()]
        overall_mean = labelled_rows[target].mean()
        for col in high_cardinality_cols:
            if col in self.df.columns:
                target_means = labelled_rows.groupby(col)[target].mean()
                self.df[col] = self.df[col].map(target_means).fillna(overall_mean)

        remaining_object_cols = self.df.select_dtypes(include='object').columns.tolist()
        if remaining_object_cols:
            raise ValueError(f"Categorical columns not fully encoded: {remaining_object_cols}")

# HyperTuner Class
class HyperTuner:
    """Randomized hyperparameter search for a ``RandomForestRegressor``."""

    def __init__(self):
        """Define the search space and the base estimator."""
        self.param_grid = dict(
            # Wider range for more robust ensembles
            n_estimators=randint(100, 600),
            # Expanded depth options for flexibility
            max_depth=[None, 10, 20, 30, 40],
            # Broader range to control overfitting
            min_samples_split=[2, 5, 10, 15],
            # More options to regularize leaf size
            min_samples_leaf=[1, 2, 4, 8],
            # Diverse feature sampling strategies
            max_features=['sqrt', 'log2', 0.2, 0.3, 0.4, 0.5],
            # Test both bootstrapping and non-bootstrapping
            bootstrap=[True, False],
            # Control tree complexity
            max_leaf_nodes=[None, 500, 1000, 2000],
            # Regularize splits by impurity reduction
            min_impurity_decrease=uniform(0.0, 0.01),
            # Cost-complexity pruning for simpler trees
            ccp_alpha=uniform(0.0, 0.02),
        )
        self.model = RandomForestRegressor(random_state=17)

    def tune_model(self, X, y):
        """Search the parameter space and return the best model.

        Runs 50 sampled configurations with 8-fold cross-validation, scored
        by negative mean absolute error, on all available cores.

        Args:
            X: Feature matrix.
            y: Target values.

        Returns:
            Tuple ``(best_estimator, best_params)`` from the finished search.
        """
        search_options = {
            "estimator": self.model,
            "param_distributions": self.param_grid,
            "n_iter": 50,  # Increased for better exploration
            "cv": 8,
            "scoring": 'neg_mean_absolute_error',
            "n_jobs": -1,
            "random_state": 42,
        }
        search = RandomizedSearchCV(**search_options)
        search.fit(X, y)
        return search.best_estimator_, search.best_params_

# ModelEvaluator Class
class ModelEvaluator:
    """Tune, evaluate and apply the random-forest model.

    Attributes are filled in as the methods run: ``rmse_score`` and
    ``mae_score`` by ``evaluate_model``; ``best_params`` and
    ``trained_model`` by both ``evaluate_model`` and ``train_and_predict``
    (the latest call wins).
    """

    def __init__(self, output_dir="predictions/RF/local"):
        """Create the tuner and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory that will receive the submission file. It
                is created if missing.
        """
        self.model_name = "RandomForest"
        self.rmse_score = None
        self.mae_score = None
        self.best_params = None
        self.tuner = HyperTuner()
        self.trained_model = None
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def evaluate_model(self, X, y, X_val, y_val, y_val_original):
        """Tune the model and report cross-validated and hold-out metrics.

        ``X``/``y`` are split 90/10 (``random_state=17``); the tuner runs on
        the 90% part. The tuned model is then cross-validated (5 folds) on
        all of ``X``/``y``. The reported RMSE/MAE are in the units of ``y``.
        The hold-out set is used for a per-category MAE, for which the
        predictions are passed through ``expm1`` before being compared with
        ``y_val_original``.

        Args:
            X: Training features.
            y: Training target, in the scale the model is trained on.
            X_val: Hold-out features. Must contain ``'ürün kategorisi'``.
            y_val: Hold-out target in model scale (currently unused).
            y_val_original: Target on the original scale; it is aligned to
                ``X_val`` by index, so it may cover more rows than ``X_val``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` - the internal 90/10 split.
        """
        # Local import: a single cross_validate run yields both metrics, so
        # the model is fitted once per fold instead of twice.
        from sklearn.model_selection import cross_validate

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=17)
        print("Evaluating Random Forest with hyperparameter tuning...")
        best_model, best_params = self.tuner.tune_model(X_train, y_train)
        self.trained_model = best_model
        self.best_params = best_params

        cv_scores = cross_validate(
            best_model, X, y, cv=5,
            scoring={"mse": "neg_mean_squared_error", "mae": "neg_mean_absolute_error"},
        )
        rmse = np.mean(np.sqrt(-cv_scores["test_mse"]))
        self.rmse_score = rmse
        mae = np.mean(-cv_scores["test_mae"])
        self.mae_score = mae
        print(f"RMSE: {round(rmse, 4)} | MAE: {round(mae, 4)} ({self.model_name})")
        print(f"Best parameters: {best_params}")

        y_pred_val = np.expm1(best_model.predict(X_val))
        val_data = X_val.copy()
        val_data['y_true'] = y_val_original  # aligned on the row index
        val_data['y_pred'] = y_pred_val
        print(f"\nCategory-based MAE for {self.model_name}:")
        # Mean absolute error per category, computed without groupby.apply.
        abs_error = (val_data['y_true'] - val_data['y_pred']).abs()
        print(abs_error.groupby(val_data['ürün kategorisi']).mean())

        importances = best_model.feature_importances_
        feature_importance = pd.DataFrame({'feature': X.columns, 'importance': importances}).sort_values('importance', ascending=False)
        print("\nFeature Importance:")
        print(feature_importance)
        return X_train, X_test, y_train, y_test

    def train_and_predict(self, X_train, y_train, X_test, test_ids, output_file="submission.csv"):
        """Tune on the given data, predict the test set and write a CSV.

        Args:
            X_train: Training features.
            y_train: Training target; predictions are passed through
                ``expm1`` before being written.
            X_test: Features to predict.
            test_ids: Identifiers written to the ``id`` column.
            output_file: File name inside ``self.output_dir``.

        Returns:
            The predictions after ``expm1``, as returned by the model.
        """
        print(f"\n⏳ Training {self.model_name} model...")
        import time
        start_time = time.time()
        # The tuner hands back the search's best estimator, which the search
        # has already refitted on all of X_train, so no further fit is needed.
        best_model, best_params = self.tuner.tune_model(X_train, y_train)
        self.best_params = best_params
        # Keep the attribute in sync with the model that produced the output.
        self.trained_model = best_model
        training_time = time.time() - start_time
        print(f"✅ Training completed in {training_time:.2f} seconds")
        print(f"🏆 Best parameters: {best_params}")

        predictions = np.expm1(best_model.predict(X_test))
        submission_df = pd.DataFrame({
            "id": test_ids.astype(int),
            "ürün fiyatı": predictions.astype(float)
        })
        output_path = os.path.join(self.output_dir, output_file)
        submission_df.to_csv(output_path, index=False, float_format='%.4f')
        print(f"\n📁 Predictions saved to '{output_path}'")
        print(f"Sample predictions:\n{submission_df.head()}")
        return predictions

    def get_rmse_score(self):
        """Return the cross-validated RMSE, or ``None`` before evaluation."""
        return self.rmse_score

    def get_mae_score(self):
        """Return the cross-validated MAE, or ``None`` before evaluation."""
        return self.mae_score

    def get_best_params(self):
        """Return the best parameters found by the most recent tuning run."""
        return self.best_params

# Main Function
def main():
    """Run the full pipeline: load, preprocess, evaluate, predict.

    Reads the CSV files configured at module level, prepares the training
    and test features, prints evaluation metrics for a tuned random forest
    and writes ``submission.csv`` below ``predictions/RF/local``.
    """
    import time
    start_time = time.time()
    print("🚀 Starting product price prediction pipeline with Random Forest...")

    print("\n📂 Loading data...")
    data_loader = DataLoader(TRAINING_DATA_PATH, TESTING_DATA_PATH)
    combined_df = data_loader.get_data()
    print(f"✅ Data loaded. Shape: {combined_df.shape}")

    print("\n🔧 Preprocessing data...")
    preprocessor = DataPreprocessing(combined_df)
    X_train, X_val, y_train, y_val, y_val_original = preprocessor.preprocess()
    print(f"✅ Training data prepared. Features: {X_train.shape[1]}, Samples: {X_train.shape[0]}")
    X_test_submission, test_ids = preprocessor.preprocess(is_test_only=True)
    print(f"✅ Test data prepared. Samples: {X_test_submission.shape[0]}")

    print("\n🧪 Evaluating Random Forest model...")
    evaluator = ModelEvaluator(output_dir="predictions/RF/local")
    # evaluate_model returns its own internal 90/10 split. Do not rebind
    # X_train / y_train to it: the final model below should be trained on the
    # full training split, not on the smaller evaluation subset.
    evaluator.evaluate_model(X_train, y_train, X_val, y_val, y_val_original)

    print("\n🔮 Making final predictions...")
    evaluator.train_and_predict(
        X_train,
        y_train,
        X_test_submission,
        test_ids,
        output_file="submission.csv"
    )
    total_time = time.time() - start_time
    print(f"\n🎉 Pipeline completed in {total_time:.2f} seconds!")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

🚀 Starting product price prediction pipeline with Random Forest...

📂 Loading data...
Initializing DataLoader...
Loading data...
Data loaded successfully.
✅ Data loaded. Shape: (273024, 9)

🔧 Preprocessing data...
Found date column: tarih
Observations: 273024
Variables: 14
cat_cols: 7
num_cols: 6
cat_but_car: 1
num_but_cat: 6
✅ Training data prepared. Features: 12, Samples: 204768
No date column found. Skipping time-based features.
Observations: 273024
Variables: 14
cat_cols: 9
num_cols: 5
cat_but_car: 0
num_but_cat: 9
✅ Test data prepared. Samples: 45504

🧪 Evaluating Random Forest model...
Evaluating Random Forest with hyperparameter tuning...
