# In [3]:
import pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

def clean_and_preprocess_data(df: pd.DataFrame, save_dir: str = "processed_data"):
    """
    Clean and preprocess the dataset without modifying the caller's DataFrame.

    Steps performed on an internal copy of ``df``:
    - Strips '₹' and ',' from 'actual_price' and 'discounted_price' and
      converts both to float.
    - Converts 'rating' to numeric (unparseable values become NaN) and fills
      NaN with the column mean.
    - Strips ',' from 'rating_count', converts it to numeric and fills NaN
      with the column mean.
    - Label-encodes 'category' as integer codes.
    - Writes raw_data.csv, train_data.csv and inference_data.csv to
      ``save_dir`` (created if it does not exist).

    Args:
    - df (pd.DataFrame): The raw input data. Must contain the columns
      'actual_price', 'discounted_price', 'rating', 'rating_count' and
      'category'. It is left unchanged.
    - save_dir (str): Directory where the processed files will be saved.

    Returns:
    - raw_data (pd.DataFrame): An unmodified copy of the input.
    - inference_data (pd.DataFrame): The preprocessed feature columns only
      ('actual_price', 'rating', 'rating_count', 'category').
    - train_data (pd.DataFrame): The feature columns plus the
      'discounted_price' target.
    """

    # Snapshot of the input exactly as received; returned and saved as-is.
    raw_data = df.copy()

    # All cleaning happens on a separate copy so the caller's DataFrame is
    # never mutated as a side effect of calling this function.
    work = df.copy()

    # Ensure save directory exists
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1. Clean the price columns: drop the currency symbol and thousands
    #    separators, then convert to float.
    for price_col in ('actual_price', 'discounted_price'):
        work[price_col] = (
            work[price_col].replace('[₹,]', '', regex=True).astype(float)
        )

    # 2. Convert non-numeric ratings to NaN and fill missing ratings with the
    #    column mean.
    work['rating'] = pd.to_numeric(work['rating'], errors='coerce')
    work['rating'] = SimpleImputer(strategy='mean').fit_transform(work[['rating']])

    # Same treatment for 'rating_count'. Thousands separators are removed
    # first; otherwise a value such as "24,269" would be coerced to NaN and
    # silently replaced by the mean.
    work['rating_count'] = pd.to_numeric(
        work['rating_count'].replace(',', '', regex=True), errors='coerce'
    )
    # A separate imputer per column: each one is fitted on its own column only.
    work['rating_count'] = SimpleImputer(strategy='mean').fit_transform(
        work[['rating_count']]
    )

    # 3. Encode the categorical 'category' column as integer labels.
    #    NOTE: the fitted encoder is not returned, so callers cannot map the
    #    codes back to the original category values.
    work['category'] = LabelEncoder().fit_transform(work['category'])

    # 4. Features used for training / inference and the training target.
    features = ['actual_price', 'rating', 'rating_count', 'category']
    inference_data = work[features]      # features only, no target column
    target = work['discounted_price']

    # Training data = features plus the target as the last column.
    train_data = pd.concat([inference_data, target], axis=1)

    # Save processed data to CSV files
    raw_data.to_csv(out_dir / "raw_data.csv", index=False)
    train_data.to_csv(out_dir / "train_data.csv", index=False)
    inference_data.to_csv(out_dir / "inference_data.csv", index=False)

    return raw_data, inference_data, train_data
