In [None]:
# %% [markdown]
"""
# Product Price Prediction Project - Jupyter Notebook Version

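This notebook combines the project's Python modules (`config.py`, `helpers.py`, `dataset.py`, `data_preprocessing.py`, `HyperTuner.py`, `models.py`, and `main.py`) into a single interactive notebook with the same functionality.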
"""

# %%
# Import all required libraries
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from scipy.stats import uniform, randint, loguniform
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Seed NumPy's legacy global random generator so that code drawing from it
# produces the same numbers on every Restart & Run All.
np.random.seed(42)

In [None]:
# %% [markdown]
"""
## Configuration
"""
# %%
# Configuration (equivalent to config.py)

# Input files: both CSVs live directly under DATASET_PATH.
DATASET_PATH = 'dataset/'
TRAINING_DATA_PATH = f"{DATASET_PATH}train.csv"
TESTING_DATA_PATH = f"{DATASET_PATH}testFeatures.csv"

# Default quantile anchors for outlier_thresholds().
LOW_QUANTILE, UP_QUANTILE = 0.05, 0.95

# Default cardinality cut-offs for grab_col_names(): numeric columns with fewer
# than CAT_THRESHOLD distinct values count as categorical, object columns with
# more than CAR_THRESHOLD distinct values count as high-cardinality.
CAT_THRESHOLD, CAR_THRESHOLD = 10, 20

# Default absolute-correlation cut-off for high_correlated_cols().
CORRELATION_THRESHOLD = 0.60

# Defaults for quick_missing_imp(): maximum distinct values for mode-filling an
# object column, and the statistic used to fill the remaining columns.
CAT_LENGTH = 10
NUM_METHOD = "median"

In [None]:
# %% [markdown]
"""
## Helper Functions
"""
# %%
# Helper functions (equivalent to helpers.py)
def check_df(dataframe):
    """
    Print a quick structural overview of a DataFrame.

    Reports, in order: shape, column dtypes, the first three rows, the last
    three rows, per-column null counts, and a quantile table.

    Args:
        dataframe (pd.DataFrame): Frame to inspect. It is not modified.

    Returns:
        None. Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(3))
    print("##################### Tail #####################")
    print(dataframe.tail(3))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles only exist for numeric data. Without numeric_only=True,
    # pandas >= 2.0 raises TypeError as soon as the frame holds an object column.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

def grab_col_names(dataframe, cat_th=CAT_THRESHOLD, car_th=CAR_THRESHOLD):
    """
    Split the columns of a frame into categorical, cardinal and numerical groups.

    Args:
        dataframe (pd.DataFrame): Frame whose columns are classified.
        cat_th (int): Non-object columns with fewer distinct values than this
            are treated as categorical.
        car_th (int): Object columns with more distinct values than this are
            treated as high-cardinality ("categorical but cardinal").

    Returns:
        tuple[list, list, list]: ``(cat_cols, cat_but_car, num_cols)`` where
        ``cat_cols`` holds object columns (minus the cardinal ones) followed by
        the low-cardinality non-object columns, and ``num_cols`` holds the
        remaining non-object columns.
    """
    # First pass: separate object-typed columns from everything else.
    object_cols, other_cols = [], []
    for name in dataframe.columns:
        bucket = object_cols if dataframe[name].dtype == "O" else other_cols
        bucket.append(name)

    num_but_cat = [name for name in other_cols if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in other_cols if name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols

def cat_summary(dataframe, col_name, plot=False):
    """
    Print the value counts and percentage share of each level of a column.

    Args:
        dataframe (pd.DataFrame): Source frame.
        col_name (str): Column to summarise.
        plot (bool): When True, also save a count plot to
            ``<col_name>_countplot.png`` in the working directory.

    Returns:
        None.
    """
    level_counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: level_counts,
                            "Ratio": 100 * level_counts / len(dataframe)})
    print(summary)

    if not plot:
        return

    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.xticks(rotation=45)
    plt.savefig(f'{col_name}_countplot.png')
    # Close the figure so repeated calls do not draw on top of each other.
    plt.close()

def num_summary(dataframe, numerical_col, plot=False):
    """
    Print descriptive statistics of a numerical column at fixed percentiles.

    Args:
        dataframe (pd.DataFrame): Source frame.
        numerical_col (str): Column to describe.
        plot (bool): When True, also save a 50-bin histogram to
            ``<numerical_col>_histogram.png`` in the working directory.

    Returns:
        None.
    """
    column = dataframe[numerical_col]
    percentiles = [0.05, 0.10, 0.50, 0.90, 0.95, 0.99]
    print(column.describe(percentiles).T)

    if plot:
        column.hist(bins=50)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.savefig(f'{numerical_col}_histogram.png')
        plt.close()

    print("#####################################")

def target_summary_with_cat(dataframe, target, categorical_col):
    """
    Print the mean of ``target`` for every level of ``categorical_col``.

    Args:
        dataframe (pd.DataFrame): Source frame.
        target (str): Column whose mean is reported.
        categorical_col (str): Column used for grouping.

    Returns:
        None.
    """
    level_means = dataframe.groupby(categorical_col)[target].mean()
    report = pd.DataFrame({"TARGET_MEAN": level_means})
    print(report, end="\n\n\n")

def high_correlated_cols(dataframe, plot=False, corr_th=CORRELATION_THRESHOLD):
    """
    List columns whose absolute correlation with an earlier column exceeds a threshold.

    Args:
        dataframe (pd.DataFrame): Source frame; only numeric columns are correlated.
        plot (bool): When True, also save an annotated heatmap of the signed
            correlation matrix to ``correlation_heatmap.png``.
        corr_th (float): Absolute-correlation cut-off.

    Returns:
        list: Column names that are candidates for dropping.
    """
    corr = dataframe.corr(numeric_only=True)
    abs_corr = corr.abs()

    # Keep only the cells strictly above the diagonal so each pair is seen once
    # and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    drop_list = []
    for name in upper.columns:
        if (upper[name] > corr_th).any():
            drop_list.append(name)

    if plot:
        sns.heatmap(corr, cmap="RdBu", annot=True)
        plt.savefig('correlation_heatmap.png')
        plt.close()
    return drop_list

def outlier_thresholds(dataframe, variable, low_quantile=LOW_QUANTILE, up_quantile=UP_QUANTILE):
    """
    Compute lower and upper outlier bounds for a column.

    The bounds follow the interquartile-range recipe, with the two anchor
    quantiles supplied by the caller:
    ``low - 1.5 * (up - low)`` and ``up + 1.5 * (up - low)``.

    Args:
        dataframe (pd.DataFrame): Source frame.
        variable (str): Column to analyse.
        low_quantile (float): Quantile used as the lower anchor.
        up_quantile (float): Quantile used as the upper anchor.

    Returns:
        tuple: ``(lower_bound, upper_bound)``.
    """
    column = dataframe[variable]
    lower_anchor = column.quantile(low_quantile)
    upper_anchor = column.quantile(up_quantile)
    spread = upper_anchor - lower_anchor
    return lower_anchor - 1.5 * spread, upper_anchor + 1.5 * spread

def check_outlier(dataframe, col_name):
    """
    Report whether a column contains at least one outlier.

    A value is an outlier when it lies strictly outside the bounds returned by
    ``outlier_thresholds`` for that column.

    Args:
        dataframe (pd.DataFrame): Source frame.
        col_name (str): Column to test.

    Returns:
        bool: True if any value of ``col_name`` is below the lower bound or
        above the upper bound, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Decide from the comparison mask itself. Reducing the filtered rows with
    # .any() would test the truthiness of every cell in those rows, so an
    # outlier row made only of 0 / False / NaN would be reported as "no outlier".
    return bool(((column > up_limit) | (column < low_limit)).any())

def replace_with_thresholds(dataframe, variable):
    """
    Cap the values of a column at its outlier bounds, in place.

    Values below the lower bound are set to the lower bound and values above
    the upper bound are set to the upper bound; bounds come from
    ``outlier_thresholds``.

    Args:
        dataframe (pd.DataFrame): Frame to modify (mutated in place).
        variable (str): Column to cap.

    Returns:
        pd.DataFrame: The same ``dataframe`` object that was passed in.
    """
    floor, ceiling = outlier_thresholds(dataframe, variable)

    below_floor = dataframe[variable] < floor
    dataframe.loc[below_floor, variable] = floor

    above_ceiling = dataframe[variable] > ceiling
    dataframe.loc[above_ceiling, variable] = ceiling

    return dataframe

def missing_values_table(dataframe, na_name=False):
    """
    Print the count and percentage of missing values per affected column.

    Args:
        dataframe (pd.DataFrame): Source frame.
        na_name (bool): When True, return the names of the columns that
            contain missing values.

    Returns:
        list | None: Column names with at least one null (in frame order) if
        ``na_name`` is True, otherwise None.
    """
    na_columns = [name for name in dataframe.columns if dataframe[name].isnull().sum() > 0]

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

def remove_missing_values(dataframe):
    """
    Drop every row that contains a missing value, reporting nulls before and after.

    Args:
        dataframe (pd.DataFrame): Source frame; it is not modified.

    Returns:
        pd.DataFrame: A new frame without rows containing nulls.
    """
    def _report(title, frame):
        # Print a banner followed by the per-column null counts of ``frame``.
        print(title)
        print(frame.isnull().sum())

    _report("##################### Missing Values Before #####################", dataframe)
    dataframe_cleaned = dataframe.dropna()
    _report("##################### Missing Values After #####################", dataframe_cleaned)
    return dataframe_cleaned

def quick_missing_imp(data, num_method=NUM_METHOD, cat_length=CAT_LENGTH, target="Age"):
    """
    Fill missing values with simple per-column statistics.

    Object columns with at most ``cat_length`` distinct values are filled with
    their mode; non-object columns are filled with their mean or median. If a
    column named ``target`` exists, it is restored to its original values
    afterwards so that it is never imputed.

    Args:
        data (pd.DataFrame): Source frame.
        num_method (str): ``"mean"`` or ``"median"``; any other value leaves
            non-object columns untouched.
        cat_length (int): Maximum number of distinct values for an object
            column to be mode-filled.
        target (str): Name of the column to keep un-imputed.

    Returns:
        pd.DataFrame: The imputed frame.
    """
    variables_with_na = [name for name in data.columns if data[name].isnull().sum() > 0]
    temp_target = data[target] if target in data.columns else None

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n")

    def _fill_categorical(column):
        # Mode-fill only low-cardinality object columns; pass others through.
        if column.dtype == "O" and len(column.unique()) <= cat_length:
            return column.fillna(column.mode()[0])
        return column

    def _fill_with_mean(column):
        return column if column.dtype == "O" else column.fillna(column.mean())

    def _fill_with_median(column):
        return column if column.dtype == "O" else column.fillna(column.median())

    data = data.apply(_fill_categorical, axis=0)
    if num_method == "mean":
        data = data.apply(_fill_with_mean, axis=0)
    elif num_method == "median":
        data = data.apply(_fill_with_median, axis=0)

    if temp_target is not None:
        data[target] = temp_target

    print("# AFTER")
    print("Categorical variables filled with mode")
    print(f"Numerical variables filled with {num_method}")
    print(data[variables_with_na].isnull().sum(), "\n")
    return data

def rare_analyser(dataframe, target, cat_cols):
    """
    Print, for each categorical column, level counts, shares and target means.

    Args:
        dataframe (pd.DataFrame): Source frame.
        target (str): Column whose per-level mean is reported.
        cat_cols (iterable): Names of the categorical columns to analyse.

    Returns:
        None.
    """
    n_rows = len(dataframe)
    for name in cat_cols:
        level_counts = dataframe[name].value_counts()
        print(name, ":", len(level_counts))
        report = pd.DataFrame({
            "COUNT": level_counts,
            "RATIO": level_counts / n_rows,
            "TARGET_MEAN": dataframe.groupby(name)[target].mean(),
        })
        print(report, end="\n\n\n")

def rare_encoder(dataframe, rare_perc):
    """
    Collapse infrequent levels of object columns into a single 'Rare' label.

    Args:
        dataframe (pd.DataFrame): Source frame; it is not modified.
        rare_perc (float): Levels whose share of rows is below this fraction
            are replaced.

    Returns:
        pd.DataFrame: A copy in which every object column with at least one
        rare level has those levels replaced by ``'Rare'``.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)

    for name in list(encoded.columns):
        if encoded[name].dtype != 'O':
            continue
        shares = encoded[name].value_counts() / n_rows
        rare_labels = shares[shares < rare_perc].index
        if len(rare_labels) == 0:
            # Nothing rare here: leave the column exactly as it is.
            continue
        encoded[name] = np.where(encoded[name].isin(rare_labels), 'Rare', encoded[name])

    return encoded

def label_encoder(dataframe, binary_col):
    """
    Replace a column with integer codes produced by a fresh ``LabelEncoder``.

    Args:
        dataframe (pd.DataFrame): Frame to modify (the column is overwritten).
        binary_col (str): Column to encode.

    Returns:
        pd.DataFrame: The same ``dataframe`` object that was passed in.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """
    One-hot encode the given columns with ``pd.get_dummies``.

    Args:
        dataframe (pd.DataFrame): Source frame.
        categorical_cols (list): Columns to expand into indicator columns.
        drop_first (bool): Drop the first level of each column when True.

    Returns:
        pd.DataFrame: A new frame in which the listed columns are replaced by
        their indicator columns.
    """
    encoded = pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
    )
    return encoded

In [None]:
# %% [markdown]
"""
## Data Loading
"""
# %%
# DataLoader class (equivalent to dataset.py)
class DataLoader:
    """Reads the training and testing CSV files and stacks them into one frame."""

    def __init__(self, training_data_path, testing_data_path):
        """
        Remember where the two CSV files live.

        Args:
            training_data_path (str): Path of the training CSV.
            testing_data_path (str): Path of the testing CSV.
        """
        self.training_data_path, self.testing_data_path = training_data_path, testing_data_path
        print("Initializing DataLoader...")

    def get_data(self):
        """
        Load both CSV files and concatenate them, training rows first.

        Returns:
            pd.DataFrame: Combined frame with a fresh 0..n-1 index. Columns
            present in only one file are filled with NaN for the other.
        """
        print("Loading data...")
        sources = (self.training_data_path, self.testing_data_path)
        frames = [pd.read_csv(path) for path in sources]
        combined = pd.concat(frames, ignore_index=True).reset_index(drop=True)
        print("Data loaded successfully.")
        return combined

In [None]:
# %% [markdown]
"""
## Data Preprocessing
"""
# %%
# DataPreprocessing class (equivalent to data_preprocessing.py)
class DataPreprocessing:
    """
    Cleans, enriches and encodes the combined train + test frame.

    Rows whose 'ürün fiyatı' value is null are treated as test rows; all other
    rows are training rows. The transformation pipeline runs at most once per
    instance, so ``preprocess`` can be called once for the training split and
    again for the test split without re-transforming already processed data.
    """

    _TARGET = 'ürün fiyatı'  # target column; null on test rows
    _ID = 'id'               # row identifier; never used as a feature

    def __init__(self, dataframe):
        """
        Initialize with a combined DataFrame (train + test).

        Args:
            dataframe (pd.DataFrame): Combined frame. It is copied, so the
                caller's object is never modified.
        """
        self.df = dataframe.copy()
        self._is_processed = False

    def preprocess(self, is_test_only=False):
        """
        Preprocess the data and return train/validation splits or test data.

        Args:
            is_test_only (bool): When True, return the test rows; otherwise
                return a 75/25 train/validation split of the training rows.

        Returns:
            tuple: ``(test_features, test_ids)`` when ``is_test_only`` is True,
            else ``(X_train, X_val, y_train, y_val)``.
        """
        self._run_pipeline_once()
        target = self._TARGET

        if is_test_only:
            # Keep the ids before dropping them from the feature matrix.
            test_data = self.df[self.df[target].isnull()].drop(target, axis=1)
            test_ids = test_data[self._ID].copy()
            test_data = test_data.drop(columns=[self._ID])
            return test_data, test_ids

        train_data = self.df[self.df[target].notnull()].drop(columns=[self._ID])
        X = train_data.drop(target, axis=1)
        y = train_data[target]
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
        return X_train, X_val, y_train, y_val

    def _run_pipeline_once(self):
        """Apply every transformation step to ``self.df``, but only on the first call."""
        # The steps are not idempotent: running them again would clip, re-derive
        # and re-encode columns that are already encoded, so the test split
        # would no longer match the features the model was trained on.
        if getattr(self, "_is_processed", False):
            return
        self.handle_outliers()
        self.handle_missing_values()
        self.feature_engineering()
        self.drop_unnecessary_columns()
        self.encode_features()
        self._is_processed = True

    def handle_outliers(self):
        """
        Cap outliers of every numerical column except the target.

        Bounds come from ``outlier_thresholds`` via ``check_outlier`` and
        ``replace_with_thresholds``.
        """
        num_cols = [col for col in self.df.select_dtypes(include=np.number).columns
                    if col != self._TARGET]
        for col in num_cols:
            if check_outlier(self.df, col):
                self.df = replace_with_thresholds(self.df, col)

    def handle_missing_values(self):
        """
        Fill missing values: column mean for numerical columns (target
        excluded), column mode for object columns.
        """
        num_cols = [col for col in self.df.select_dtypes(include=np.number).columns
                    if col != self._TARGET]
        self.df[num_cols] = self.df[num_cols].fillna(self.df[num_cols].mean())

        for col in self.df.select_dtypes(include='object').columns:
            modes = self.df[col].mode()
            # An all-NaN column has no mode; leave it untouched instead of
            # failing with an IndexError.
            if not modes.empty:
                self.df[col] = self.df[col].fillna(modes.iloc[0])

    def feature_engineering(self):
        """
        Create new features for the dataset.

        Adds ``besin_değeri_log`` (log1p of 'ürün besin değeri') and
        ``kategori_ortalama_besin`` (mean 'ürün besin değeri' within each
        'ürün kategorisi').
        """
        self.df['besin_değeri_log'] = np.log1p(self.df['ürün besin değeri'])
        self.df['kategori_ortalama_besin'] = (
            self.df.groupby('ürün kategorisi')['ürün besin değeri'].transform('mean')
        )

    def drop_unnecessary_columns(self):
        """
        Drop columns that are not used as features, if they are present.
        """
        # Original author's note: these columns hold a single value.
        columns_to_drop = ['ürün üretim yeri', 'market', 'şehir']
        present = [col for col in columns_to_drop if col in self.df.columns]
        self.df = self.df.drop(columns=present)

    def encode_features(self):
        """
        Encode categorical features.

        Columns with at most three distinct values are label-encoded. Every
        other categorical or high-cardinality column is target-encoded with
        the mean of the target over the training rows; values without a mean
        fall back to the overall training mean.

        NOTE(review): the target means are computed on all labelled rows before
        the train/validation split in ``preprocess``, so validation scores may
        be optimistic.

        Raises:
            ValueError: If any object-typed column is left after encoding.
        """
        target = self._TARGET
        cat_cols, cat_but_car, _ = grab_col_names(self.df)

        # The target and the id must never be encoded: doing so would fill the
        # null targets that mark test rows, or destroy the submission ids.
        protected = {target, self._ID}
        cat_cols = [col for col in cat_cols if col not in protected]
        cat_but_car = [col for col in cat_but_car if col not in protected]

        binary_cols = [col for col in cat_cols if self.df[col].nunique() <= 3]
        for col in binary_cols:
            self.df = label_encoder(self.df, col)

        # Every non-binary categorical column is target-encoded, so there is
        # nothing left over for one-hot encoding.
        target_encoded_cols = cat_but_car + [col for col in cat_cols if col not in binary_cols]
        train_rows = self.df[self.df[target].notnull()]
        overall_mean = train_rows[target].mean()
        for col in target_encoded_cols:
            if col in self.df.columns:
                target_means = train_rows.groupby(col)[target].mean()
                self.df[col] = self.df[col].map(target_means).fillna(overall_mean)

        remaining_object_cols = self.df.select_dtypes(include='object').columns.tolist()
        if remaining_object_cols:
            raise ValueError(f"Categorical columns not fully encoded: {remaining_object_cols}")

In [None]:
# %% [markdown]
"""
## Hyperparameter Tuning
"""
# %%
# HyperTuner class (equivalent to HyperTuner.py)
class HyperTuner:
    """
    Holds a catalogue of regression estimators with their hyperparameter
    search spaces and tunes any of them by name with a randomized search.
    """

    def __init__(self, n_iter=2, cv=5, scoring='neg_mean_absolute_error', random_state=42):
        """
        Build the estimator catalogue and the matching search spaces.

        Args:
            n_iter (int): Number of parameter settings sampled per search.
            cv (int): Number of cross-validation folds used by the search.
            scoring (str): scikit-learn scorer name optimised by the search.
            random_state (int): Seed for the parameter sampling.
        """
        self.n_iter = n_iter
        self.cv = cv
        self.scoring = scoring
        self.random_state = random_state

        # scipy's uniform(loc, scale) samples from [loc, loc + scale].
        self.param_grids = {
            "LinearRegression": {},
            "Ridge": {'alpha': [0.1, 1.0, 10.0, 100.0]},
            "Lasso": {'alpha': [0.01, 0.1, 1.0, 10.0]},
            "ElasticNet": {'alpha': [0.01, 0.1, 1.0], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]},
            "KNN": {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
            "DecisionTree": {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10]},
            "RandomForest": {
                'n_estimators': randint(60, 160),
                'max_depth': [30, None],
                'min_samples_split': [5, 10, 15],
                'min_samples_leaf': [1, 2, 4, 6]
            },
            "GradientBoosting": {
                'n_estimators': randint(50, 200),
                'learning_rate': loguniform(0.005, 0.2),
                'max_depth': [3, 5, 7],
                'subsample': uniform(0.6, 0.4),
                'min_samples_split': [2, 5, 10]
            },
            "XGBoost": {
                'n_estimators': randint(50, 200),
                'learning_rate': loguniform(0.005, 0.2),
                'max_depth': [3, 5, 7, 9],
                'subsample': uniform(0.6, 0.4),
                'colsample_bytree': uniform(0.6, 0.4)
            },
            "LightGBM": {
                'n_estimators': randint(50, 200),
                'learning_rate': loguniform(0.005, 0.2),
                'num_leaves': randint(20, 50),
                'max_depth': [3, 5, 7, -1],
                'subsample': uniform(0.6, 0.4),
                'colsample_bytree': uniform(0.6, 0.4)
            },
            "CatBoost": {
                'iterations': randint(100, 300),
                'learning_rate': loguniform(0.005, 0.2),
                'depth': [4, 6, 8, 10],
                'l2_leaf_reg': uniform(1, 10),
                'bagging_temperature': uniform(0, 1)
            },
            "SVR": {
                'C': loguniform(0.1, 10),
                'epsilon': uniform(0.05, 0.2),
                'kernel': ['rbf', 'linear'],
                'gamma': loguniform(1e-4, 1e-1)
            }
        }
        self.models = {
            "LinearRegression": LinearRegression(),
            "Ridge": Ridge(),
            "Lasso": Lasso(),
            "ElasticNet": ElasticNet(),
            "KNN": KNeighborsRegressor(),
            "DecisionTree": DecisionTreeRegressor(),
            "RandomForest": RandomForestRegressor(random_state=17),
            "GradientBoosting": GradientBoostingRegressor(random_state=17),
            "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=17),
            "LightGBM": LGBMRegressor(random_state=17),
            "CatBoost": CatBoostRegressor(silent=True, random_state=17),
            # SVR has a search space above, so it needs an estimator here too;
            # without it tune_model("SVR", ...) could only raise.
            "SVR": SVR(),
        }

    def tune_model(self, model_name, X, y):
        """
        Tune the specified model with RandomizedSearchCV.

        Models with an empty search space are simply fitted on ``X, y``.

        Args:
            model_name (str): Key of the estimator in ``self.models``.
            X: Training features.
            y: Training target values.

        Returns:
            tuple: ``(fitted_estimator, best_params)``. ``best_params`` is an
            empty dict when no search was run.

        Raises:
            ValueError: If ``model_name`` is not a known model.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found in models list.")

        model = self.models[model_name]
        param_grid = self.param_grids.get(model_name, {})

        if not param_grid:
            model.fit(X, y)
            return model, {}

        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            n_iter=self.n_iter,
            cv=self.cv,
            scoring=self.scoring,
            n_jobs=-1,
            random_state=self.random_state
        )
        search.fit(X, y)
        # With the default refit=True, best_estimator_ is already refitted on
        # the full X, y using the best parameters found.
        return search.best_estimator_, search.best_params_

In [None]:
# %% [markdown]
"""
## Model Evaluation
"""
# %%
# ModelEvaluator class (equivalent to models.py)
class ModelEvaluator:
    """
    Tunes and cross-validates a list of regression models, then trains the
    chosen best model and writes its test predictions to a CSV file.
    """

    def __init__(self, best_model_name="CatBoost", output_dir="predictions/notebook", model_names=None):
        """
        Initialize with the model used for final predictions and the output folder.

        Args:
            best_model_name (str): Model used by ``train_and_predict``.
            output_dir (str): Directory for prediction files; created if missing.
            model_names (list | None): Models compared by ``evaluate_models``.
                Defaults to ``["CatBoost"]``.

        Raises:
            ValueError: If ``model_names`` is given but empty.
        """
        self.model_names = ["CatBoost"] if model_names is None else list(model_names)
        if not self.model_names:
            raise ValueError("model_names must contain at least one model name.")
        self.best_model_name = best_model_name
        self.rmse_scores = {}
        self.mae_scores = {}
        self.best_params = {}
        self.tuner = HyperTuner()
        self.trained_models = {}
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def evaluate_models(self, X, y):
        """
        Tune every model in ``self.model_names`` and score it with 5-fold cross-validation.

        Tuning uses an 85% split of ``X, y``; the cross-validated RMSE and MAE
        are computed on the full ``X, y``.

        Args:
            X (pd.DataFrame): Features.
            y (pd.Series): Target values.

        Returns:
            tuple: ``(X_train, X_test, y_train, y_test)``, the 85/15 split
            used for tuning.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=17)

        print("Evaluating models with hyperparameter tuning...")
        for name in self.model_names:
            best_model, best_params = self.tuner.tune_model(name, X_train, y_train)
            self.trained_models[name] = best_model
            self.best_params[name] = best_params

            # One cross-validation pass yields both metrics, instead of
            # refitting all five folds once per metric.
            fold_scores = cross_validate(
                best_model, X, y, cv=5,
                scoring={"mse": "neg_mean_squared_error", "mae": "neg_mean_absolute_error"},
            )
            rmse = np.mean(np.sqrt(-fold_scores["test_mse"]))
            mae = np.mean(-fold_scores["test_mae"])
            self.rmse_scores[name] = rmse
            self.mae_scores[name] = mae

            print(f"RMSE: {round(rmse, 4)} | MAE: {round(mae, 4)} ({name})")
            if self.best_params[name]:
                print(f"Best parameters: {self.best_params[name]}")

        best_model_mae = min(self.mae_scores, key=self.mae_scores.get)
        print(f"\nBest model based on MAE: {best_model_mae} (MAE: {round(self.mae_scores[best_model_mae], 4)})")
        print(f"Best parameters for {best_model_mae}: {self.best_params[best_model_mae]}")

        # RMSE winner is printed for reference only.
        best_model_rmse = min(self.rmse_scores, key=self.rmse_scores.get)
        print(f"Best model based on RMSE: {best_model_rmse} (RMSE: {round(self.rmse_scores[best_model_rmse], 4)})")

        return X_train, X_test, y_train, y_test

    def train_and_predict(self, X_train, y_train, X_test, test_ids, output_file="submission.csv"):
        """
        Train the best model with optimized parameters and save predictions to CSV.

        Args:
            X_train (pd.DataFrame): Training features
            y_train (pd.Series): Training target values
            X_test (pd.DataFrame): Test features to predict on
            test_ids (pd.Series): IDs for test samples
            output_file (str): Output CSV filename

        Returns:
            np.ndarray: Model predictions
        """
        print(f"\n⏳ Training {self.best_model_name} model...")
        start_time = time.perf_counter()

        # tune_model returns an estimator that is already fitted on X_train /
        # y_train, so fitting it again here would only repeat the training.
        best_model, best_params = self.tuner.tune_model(self.best_model_name, X_train, y_train)
        self.best_params[self.best_model_name] = best_params

        training_time = time.perf_counter() - start_time
        print(f"✅ Training completed in {training_time:.2f} seconds")
        print(f"🏆 Best parameters: {best_params}")

        # Predictions stay floating point; no rounding is applied here.
        predictions = best_model.predict(X_test)

        submission_df = pd.DataFrame({
            "id": test_ids.astype(int),
            "ürün fiyatı": np.asarray(predictions, dtype=float)
        })

        output_path = os.path.join(self.output_dir, output_file)
        # Written without the index and with four decimal places.
        submission_df.to_csv(output_path, index=False, float_format='%.4f')

        print(f"\n📁 Predictions saved to '{output_path}'")
        print(f"Sample predictions:\n{submission_df.head()}")

        return predictions

    def get_rmse_scores(self):
        """
        Return the RMSE scores for all evaluated models.
        """
        return self.rmse_scores

    def get_mae_scores(self):
        """
        Return the MAE scores for all evaluated models.
        """
        return self.mae_scores

    def get_best_params(self):
        """
        Return the best parameters for all evaluated models.
        """
        return self.best_params

In [None]:
# %% [markdown]
"""
## Main Execution
"""
# %%
# Main function (equivalent to main.py)
def main():
    """
    Run the whole pipeline: load data, preprocess, evaluate models, and write
    the final predictions to ``predictions/submission.csv``.
    """
    start_time = time.time()

    print("🚀 Starting product price prediction pipeline...")

    # 1. Data Loading
    print("\n📂 Loading data...")
    data_loader = DataLoader(TRAINING_DATA_PATH, TESTING_DATA_PATH)
    combined_df = data_loader.get_data()
    print(f"✅ Data loaded. Shape: {combined_df.shape}")

    # 2. Data Preprocessing
    print("\n🔧 Preprocessing data...")

    # Each split gets its own preprocessor built from the untouched combined
    # frame. preprocess() applies the full transformation chain to the
    # instance's frame on every call, so reusing one instance would extract
    # the test rows from data that had been transformed twice.
    train_preprocessor = DataPreprocessing(combined_df)
    X_train, X_val, y_train, y_val = train_preprocessor.preprocess()
    print(f"✅ Training data prepared. Features: {X_train.shape[1]}, Samples: {X_train.shape[0]}")

    test_preprocessor = DataPreprocessing(combined_df)
    X_test_submission, test_ids = test_preprocessor.preprocess(is_test_only=True)
    print(f"✅ Test data prepared. Samples: {X_test_submission.shape[0]}")

    # 3. Model Evaluation
    print("\n🧪 Evaluating models...")
    evaluator = ModelEvaluator(output_dir="predictions")
    evaluator.evaluate_models(X_train, y_train)

    # 4. Final Prediction
    print("\n🔮 Making final predictions...")
    evaluator.train_and_predict(
        X_train,
        y_train,
        X_test_submission,
        test_ids,
        output_file="submission.csv"
    )

    # Pipeline completion
    total_time = time.time() - start_time
    print(f"\n🎉 Pipeline completed in {total_time:.2f} seconds!")

if __name__ == "__main__":
    main()