In [None]:
# %% [markdown]
"""
# Product Price Prediction Project - LazyPredict Version

This notebook uses LazyPredict for quick model evaluation before focusing on the best performing models.
"""

In [None]:
# %%
# Import all required libraries
import numpy as np
import pandas as pd
import os
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyRegressor

# Seed NumPy's legacy global random number generator so that anything drawing
# from np.random is reproducible from one run to the next.
np.random.seed(42)

In [None]:
# %% [markdown]
"""
## Configuration
"""
# %%
# Configuration
# Directory that holds the input CSV files (kept with its trailing slash), and
# the two files read from it.
DATASET_PATH = 'dataset/'
TRAINING_DATA_PATH = f"{DATASET_PATH}train.csv"
TESTING_DATA_PATH = f"{DATASET_PATH}testFeatures.csv"

# Default cut-offs for grab_col_names():
#   CAT_THRESHOLD - a non-object column with fewer distinct values than this is
#                   treated as categorical.
#   CAR_THRESHOLD - an object column with more distinct values than this is
#                   treated as high-cardinality ("cardinal").
CAT_THRESHOLD, CAR_THRESHOLD = 5, 20

# NOTE(review): NUM_METHOD is not referenced by the code visible in this
# notebook; DataPreprocessing.handle_missing_values calls .median() directly.
NUM_METHOD = "median"

In [None]:
# %% [markdown]
"""
## Helper Functions
"""
# %%
# Helper functions
def grab_col_names(dataframe, cat_th=CAT_THRESHOLD, car_th=CAR_THRESHOLD):
    """Sort the columns of ``dataframe`` into categorical, cardinal and numeric groups.

    Args:
        dataframe: Frame whose columns are classified. It is only read.
        cat_th: A non-object column with fewer than ``cat_th`` distinct values
            is counted as categorical rather than numeric.
        car_th: An object column with more than ``car_th`` distinct values is
            counted as high-cardinality ("cardinal") rather than categorical.

    Returns:
        A ``(cat_cols, cat_but_car, num_cols)`` tuple of column-name lists:
        ``cat_cols`` holds the object columns that are not high-cardinality,
        followed by the low-distinct-count non-object columns;
        ``cat_but_car`` holds the high-cardinality object columns;
        ``num_cols`` holds the remaining non-object columns.
    """
    columns = list(dataframe.columns)

    # Look at each column's dtype and distinct-value count once, then derive
    # every list from those two lookups.
    is_object = {name: dataframe[name].dtype == "O" for name in columns}
    distinct = {name: dataframe[name].nunique() for name in columns}

    object_cols = [name for name in columns if is_object[name]]
    non_object_cols = [name for name in columns if not is_object[name]]

    num_but_cat = [name for name in non_object_cols if distinct[name] < cat_th]
    cat_but_car = [name for name in object_cols if distinct[name] > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]
    return cat_cols, cat_but_car, num_cols

def label_encoder(dataframe, binary_col):
    """Overwrite ``dataframe[binary_col]`` with integer codes from a new LabelEncoder.

    The column is replaced on the frame that was passed in, and that same
    frame object is returned. The fitted encoder itself is discarded.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a new frame with ``categorical_cols`` expanded into dummy columns.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is forwarded as-is.
    The input frame is not modified.
    """
    dummy_options = {"columns": categorical_cols, "drop_first": drop_first}
    return pd.get_dummies(dataframe, **dummy_options)

In [None]:
# %% [markdown]
"""
## Data Loading
"""
# %%
# DataLoader class
class DataLoader:
    """Reads the training and test CSV files and stacks them into one frame."""

    def __init__(self, training_data_path, testing_data_path):
        """Remember both CSV locations; nothing is read until get_data()."""
        self.training_data_path, self.testing_data_path = training_data_path, testing_data_path
        print("Initializing DataLoader...")

    def get_data(self):
        """Load both CSVs and return them concatenated (training rows first).

        The result has a fresh 0..n-1 index. Columns present in only one of
        the files are kept and come out as missing for the other file's rows.
        """
        print("Loading data...")
        sources = (self.training_data_path, self.testing_data_path)
        frames = [pd.read_csv(path) for path in sources]
        combined = pd.concat(frames, ignore_index=True).reset_index(drop=True)
        print("Data loaded successfully.")
        return combined

In [None]:
# %% [markdown]
"""
## Data Preprocessing
"""
# %%
# DataPreprocessing class
class DataPreprocessing:
    """Cleans, engineers and encodes a combined train/test frame.

    Rows whose target column ('ürün fiyatı') is missing are treated as test
    rows; rows with a target are treated as training rows.

    NOTE(review): imputation statistics and the per-category feature are
    computed over every row of the frame, and target encoding uses all labelled
    rows before the train/validation split, so validation scores may be
    optimistic. Behaviour is left as-is here; confirm whether that is intended.
    """

    # Columns with a special role; they must never be label/target encoded.
    TARGET_COL = 'ürün fiyatı'
    ID_COL = 'id'

    def __init__(self, dataframe):
        """Work on a private copy so the caller's frame is never modified."""
        self.df = dataframe.copy()
        # Flipped to True once the transformation steps have been applied.
        self._is_preprocessed = False

    def preprocess(self, is_test_only=False):
        """Run the pipeline (once per instance) and return model-ready data.

        Args:
            is_test_only: When True, return the unlabelled rows; otherwise
                return a train/validation split of the labelled rows.

        Returns:
            ``(test_data, test_ids)`` when ``is_test_only`` is True, where
            ``test_data`` has neither the target nor the id column.
            Otherwise ``(X_train, X_val, y_train, y_val)`` from a 75/25 split
            with ``random_state=42``.

        Raises:
            ValueError: Propagated from encode_features() when object columns
                remain after encoding.
        """
        # Calling preprocess() a second time (e.g. once for the training split
        # and once with is_test_only=True) must not transform self.df again.
        self._run_pipeline_once()

        target, id_col = self.TARGET_COL, self.ID_COL
        if is_test_only:
            test_data = self.df[self.df[target].isnull()].drop(target, axis=1)
            test_ids = test_data[id_col].copy()
            test_data = test_data.drop(columns=[id_col])
            return test_data, test_ids

        train_data = self.df[self.df[target].notnull()]
        train_data = train_data.drop(columns=[id_col])

        X = train_data.drop(target, axis=1)
        y = train_data[target]
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
        return X_train, X_val, y_train, y_val

    def _run_pipeline_once(self):
        """Apply the four transformation steps to self.df unless already done."""
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, "_is_preprocessed", False):
            return
        self.handle_missing_values()
        self.feature_engineering()
        self.drop_unnecessary_columns()
        self.encode_features()
        self._is_preprocessed = True

    def handle_missing_values(self):
        """Fill numeric gaps with the column median and object gaps with the mode.

        The target column is skipped so that missing targets keep marking the
        test rows. An object column with no non-missing value at all has no
        mode and is left unchanged instead of raising.
        """
        num_cols = self.df.select_dtypes(include=np.number).columns
        num_cols = [col for col in num_cols if col != self.TARGET_COL]
        if num_cols:
            self.df[num_cols] = self.df[num_cols].fillna(self.df[num_cols].median())

        cat_cols = self.df.select_dtypes(include='object').columns
        for col in cat_cols:
            modes = self.df[col].mode()
            if modes.empty:
                # Entirely missing column: there is nothing to impute from.
                continue
            self.df[col] = self.df[col].fillna(modes.iloc[0])

    def feature_engineering(self):
        """Add two features derived from 'ürün besin değeri'.

        'besin_değeri_log' is log1p of the value; 'kategori_ortalama_besin' is
        the mean of the value within each 'ürün kategorisi' group, computed
        over all rows of the frame.
        """
        self.df['besin_değeri_log'] = np.log1p(self.df['ürün besin değeri'])
        self.df['kategori_ortalama_besin'] = self.df.groupby('ürün kategorisi')['ürün besin değeri'].transform('mean')

    def drop_unnecessary_columns(self):
        """Drop 'ürün üretim yeri', 'market' and 'şehir' where they exist."""
        columns_to_drop = ['ürün üretim yeri', 'market', 'şehir']
        present = [col for col in columns_to_drop if col in self.df.columns]
        self.df = self.df.drop(columns=present)

    def encode_features(self):
        """Turn every categorical feature column into numbers.

        Columns that grab_col_names() classifies as categorical and that have
        at most three distinct values are label-encoded. All other categorical
        and high-cardinality columns are replaced by the mean target of their
        category over the labelled rows; categories with no labelled row get
        the overall mean target. The target and id columns are never encoded.

        Raises:
            ValueError: If an object-typed column other than the id column is
                still present afterwards.
        """
        target = self.TARGET_COL
        protected = {self.TARGET_COL, self.ID_COL}

        cat_cols, cat_but_car, _ = grab_col_names(self.df)
        # A string-typed id, or a target/id with very few distinct values,
        # would otherwise be classified as categorical and overwritten.
        cat_cols = [col for col in cat_cols if col not in protected]
        cat_but_car = [col for col in cat_but_car if col not in protected]

        # Binary or low cardinality columns
        binary_cols = [col for col in cat_cols if self.df[col].nunique() <= 3]
        for col in binary_cols:
            self.df = label_encoder(self.df, col)

        # High cardinality columns (as written: every non-binary categorical)
        high_cardinality_cols = cat_but_car + [col for col in cat_cols if col not in binary_cols]
        # The labelled-row mask and overall mean do not change inside the loop.
        train_mask = self.df[target].notnull()
        global_mean = self.df.loc[train_mask, target].mean()
        for col in high_cardinality_cols:
            if col in self.df.columns:
                target_means = self.df.loc[train_mask].groupby(col)[target].mean()
                self.df[col] = self.df[col].map(target_means).fillna(global_mean)

        # One-hot encoding for remaining categorical columns
        # NOTE(review): every non-binary categorical column is already part of
        # high_cardinality_cols, so this list is always empty and the one-hot
        # branch never runs. Kept as-is; confirm the intended split.
        remaining_cat_cols = [col for col in cat_cols if col not in binary_cols and col not in high_cardinality_cols]
        if remaining_cat_cols:
            self.df = one_hot_encoder(self.df, remaining_cat_cols, drop_first=True)

        # Check for remaining categorical columns (the id column is dropped
        # before modelling, so it is allowed to stay non-numeric).
        remaining_object_cols = [
            col for col in self.df.select_dtypes(include='object').columns
            if col != self.ID_COL
        ]
        if remaining_object_cols:
            raise ValueError(f"Categorical columns not fully encoded: {remaining_object_cols}")

In [None]:
# %% [markdown]
"""
## Model Evaluation with LazyPredict
"""
# %%
def evaluate_with_lazypredict(X_train, y_train, X_test, y_test):
    """Fit LazyPredict's regressors and report how they compare.

    Prints the result table sorted by 'R-Squared' (best first), draws a bar
    chart of the ten best models, and returns the table exactly as
    LazyRegressor produced it (i.e. not re-sorted here).
    """
    print("\n🚀 Evaluating models with LazyPredict...")

    regressor = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)

    # LazyRegressor.fit takes both feature sets first, then both targets.
    # The second return value (per-model predictions) is not used.
    models, _predictions = regressor.fit(X_train, X_test, y_train, y_test)

    # Rank once and reuse the ranking for the printout and for the chart.
    ranked = models.sort_values('R-Squared', ascending=False)

    print("\n📊 Model Performance Summary:")
    print(ranked)

    best_ten = ranked.head(10)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='R-Squared', y=best_ten.index, data=best_ten)
    plt.title('Top 10 Models by R-Squared Score')
    plt.tight_layout()
    plt.show()

    return models

In [None]:
# %% [markdown]
"""
## Main Execution
"""
# %%
def main():
    """Run the whole pipeline: load, preprocess, compare models, report.

    Returns:
        The LazyPredict performance table returned by
        evaluate_with_lazypredict().
    """
    started_at = time.time()
    print("🚀 Starting product price prediction pipeline with LazyPredict...")

    # Step 1 - read train and test CSVs into one combined frame.
    print("\n📂 Loading data...")
    combined_df = DataLoader(TRAINING_DATA_PATH, TESTING_DATA_PATH).get_data()
    print(f"✅ Data loaded. Shape: {combined_df.shape}")

    # Step 2 - clean/encode and split the labelled rows into train/validation.
    print("\n🔧 Preprocessing data...")
    X_train, X_val, y_train, y_val = DataPreprocessing(combined_df).preprocess()
    print(f"✅ Training data prepared. Features: {X_train.shape[1]}, Samples: {X_train.shape[0]}")

    # Step 3 - score the candidate regressors on the validation split.
    model_performance = evaluate_with_lazypredict(X_train, y_train, X_val, y_val)

    # Names of the three models with the highest R-Squared.
    ranked = model_performance.sort_values('R-Squared', ascending=False)
    top_models = ranked.head(3).index.tolist()
    print(f"\n🏆 Top 3 models: {', '.join(top_models)}")

    total_time = time.time() - started_at
    print(f"\n🎉 Pipeline completed in {total_time:.2f} seconds!")

    return model_performance

# Entry point: run the full pipeline and keep the returned LazyPredict
# performance table in `model_results` for inspection afterwards.
if __name__ == "__main__":
    model_results = main()