In [4]:
import pandas as pd
import numpy as np
# using it for path feature working by which user can input in .csv file
from pathlib import Path

# Core functions for data preprocessing inspired by Sciket-learn fit and transform

# for ordinal encoder guys...

def fit_ordinal_encoder(df, column, mapping):

    #preparing transform input for ordinal encoding here.
    if column not in df.columns:
        print(f"Warning: Column '{column}' not found in DataFrame.")
        return None

    # storing the column and mapping for later use.
    transform_input_ordinal_encoder = {
        'type': 'ordinal',
        'column_to_encode': column,
        'category_mapping': mapping
    }
    return transform_input_ordinal_encoder

def transform_ordinal_encoder(df, transform_input_ordinal_encoder):

    #converting categories to ordered numbers here.

    df_copy = df.copy()
    column = transform_input_ordinal_encoder['column_to_encode']
    mapping = transform_input_ordinal_encoder['category_mapping']
    new_column_name = f"{column}_encoded"

    # maping each value to its number after iterating, or NaN if not in the mapping.
    encoded_values = []
    for value in df_copy[column]:
        if value in mapping:
            encoded_values.append(mapping[value])
        else:
            encoded_values.append(np.nan)
    df_copy[new_column_name] = encoded_values

    return df_copy
#-------------------------------------------------------------------------------

#for one-hot encoding guys
def fit_one_hot_encoder(df, column):

    #finding all unique categories in a column here
    unique_categories_array = df[column].unique()

    # converting the numpy array of categories to a simple list here.
    unique_categories_list = []
    for category in unique_categories_array:
        unique_categories_list.append(category)

    # saving the categories for the transform step here.
    transform_input_one_hot_encoder = {
        'type': 'one_hot', # Added type for our logic
        'column_to_encode': column,
        'unique_categories': unique_categories_list
    }
    return transform_input_one_hot_encoder

def transform_one_hot_encoder(df, transform_input_one_hot_encoder):

    #creating a new column for each unique category here
    df_copy = df.copy() # best practices.
    column = transform_input_one_hot_encoder['column_to_encode']
    unique_categories = transform_input_one_hot_encoder['unique_categories']

    # creating a new binary (0 or 1) column for each category here.
    for category in unique_categories:
        new_column_name = f"{column}_{category}"
        #converting the boolean output to 0 or 1 and then assigning.
        df_copy[new_column_name] = (df_copy[column] == category).astype(int)

    # removing the original column.
    df_copy = df_copy.drop(columns=[column])
    return df_copy

#-------------------------------------------------------------------------------

#for imputation guys
def fit_imputer(df, column, strategy='mean'):
    """calculating the value to fill missing data based on stratergy input."""
    if strategy == 'mean':
        fill_value = df[column].mean()
    elif strategy == 'median':
        fill_value = df[column].median()
    elif strategy == 'mode':
        fill_value = df[column].mode()[0]
    # storing the calculated value here.
    transform_input_imputer = {
        'type': 'impute',
        'column_to_impute': column,
        'fill_value': fill_value
    }
    return transform_input_imputer

def transform_imputer(df, transform_input_imputer):
    """filling missing values in a column here."""
    df_copy = df.copy()
    column = transform_input_imputer['column_to_impute']
    fill_value = transform_input_imputer['fill_value']
    # most important step to impute the calculated value to fill NaNs.
    df_copy[column] = df_copy[column].fillna(fill_value)
    return df_copy
#-------------------------------------------------------------------------------
# INTERACTIVE WAY FOR DATA PREPROCESSING GUYS !!!

if __name__ == "__main__":


    # lets initialise with none
    my_df = None

    # asking the user for the file path.
    csv_path = input("Enter path to CSV file (or press Enter for sample data): ").strip()

    # checking if the user entered a path AND if that path end upto real file or not...
    if csv_path and Path(csv_path).is_file():
        # If both are true,lets load the data from the CSV file.
        my_df = pd.read_csv(csv_path)
        print(f"Successfully loaded data from '{csv_path}'.")
    else:
        # If the user pressed Enter or the path was bad, print a message.
        if csv_path: #if user actually typed a bad path.
            print(f"Error: File not found at '{csv_path}' !!!.")

    # If the above step failed, my_df will still be None.
    # In that case, we load the sample data as a fallback.
    if my_df is None:
        print("Loading sample data instead.")
        raw_data = {
            'tshirt_size': ['Medium', 'Small', 'Large', 'Medium', np.nan, 'Small'],
            'color': ['Blue', 'Red', 'Green', 'Blue', 'Red', 'Green'],
            'age': [25, 30, 45, np.nan, 22, 30]
        }
        my_df = pd.DataFrame(raw_data)

    print("\n Original DataFrame below")
    print(my_df)
    print("\n--- INTERACTIVE PREPROCESSING :-) ---")




    # basically I m building a sort of pipeline
    fitted_steps = []
    # This list will track columns which already set for ordinal encoding
    ordinal_cols_handled = []



    # Imputation Step
    if input("\nDo you want to run imputation? (y/n): ").lower() == 'y':
        numeric_cols = my_df.select_dtypes(include=np.number).columns
        cols_with_nan = my_df.columns[my_df.isnull().any()].tolist()

        if not cols_with_nan:
            print("  No columns with missing values found.")
        else:
            print(f"  Found missing values in: {cols_with_nan}")
            for col in cols_with_nan:
                if col in numeric_cols:
                    strategy = input(f"Strategy for numeric '{col}' (mean/median/mode) [default: mean]: ").lower()
                    if strategy not in ['median', 'mode']:
                        strategy = 'mean'
                else:
                    print(f"Strategy for categorical '{col}' will be 'mode' obviously.")
                    strategy = 'mode'

                # function called here
                step_config = fit_imputer(my_df, col, strategy)
                # I m adding it in my pipeline.
                fitted_steps.append(step_config)

                print(f"  -> Added '{strategy}' imputer for '{col}'.")




    # Ordinal Encoding Step
    if input("\nDo you want to run ordinal encoding? (y/n): ").lower() == 'y':
        object_cols = my_df.select_dtypes(include=['object']).columns
        print(f"  Available categorical columns: {object_cols.tolist()}")

        while True:
            col = input("  Enter a column name for ordinal encoding (or 'done'): ")
            if col.lower() == 'done':
                break
            if col not in object_cols:
                print(f"  ERROR: Column '{col}' is not a categorical column.")
                continue

            unique_vals = my_df[col].dropna().unique()
            print(f"    Categories in '{col}' are: {unique_vals}")
            mapping = {}
            for val in unique_vals:
                order = input(f"    Enter number (0, 1, 2...) for '{val}': ")
                mapping[val] = int(order)

            print(f"    Mapping created: {mapping}")

            # function called here
            step_config = fit_ordinal_encoder(my_df, col, mapping)
            # I m adding it in my pipeline
            fitted_steps.append(step_config)

            ordinal_cols_handled.append(col)

            print(f"  -> Added ordinal encoder for '{col}'.")




    # One-Hot Encoding Step
    if input("\nDo you want to run one-hot encoding? (y/n): ").lower() == 'y':
        object_cols = my_df.select_dtypes(include=['object']).columns
        potential_ohe_cols = [col for col in object_cols if col not in ordinal_cols_handled]

        if not potential_ohe_cols:
            print("  No suitable categorical columns left for OHE.")
        else:
            print(f"  Available columns for OHE: {potential_ohe_cols}")
            while True:
                col = input("  Enter column to one-hot encode (or 'all' or 'done'): ")
                if col.lower() == 'done':
                    break
                if col == 'all':
                    for c in potential_ohe_cols:
                        step_config = fit_one_hot_encoder(my_df, c)
                        fitted_steps.append(step_config)
                        print(f"  -> Added one-hot encoder for '{c}'.")
                    break
                if col in potential_ohe_cols:
                    step_config = fit_one_hot_encoder(my_df, col)
                    fitted_steps.append(step_config)
                    print(f"  -> Added one-hot encoder for '{col}'.")
                else:
                    print(f"  ERROR: Not a valid choice. Choose from: {potential_ohe_cols}")




    # Transformation Step
    print("\n--- APPLYING TRANSFORMATIONS ---")

    if not fitted_steps:
        print("No processing steps were configured. DataFrame is unchanged.")
        df_transformed = my_df.copy()
    else:
        df_transformed = my_df.copy()

        for step in fitted_steps:
            step_type = step['type']

            if step_type == 'impute':
                df_transformed = transform_imputer(df_transformed, step)
            elif step_type == 'ordinal':
                df_transformed = transform_ordinal_encoder(df_transformed, step)
            elif step_type == 'one_hot':
                df_transformed = transform_one_hot_encoder(df_transformed, step)

        print("\n--- Final Processed DataFrame ---")
        print(df_transformed)

    # Using the same instructions for new unseen data
    if fitted_steps:
        print("\n TRANSFORMING NEW DATA (EXAMPLE)")

        new_data = {
            'tshirt_size': ['Large', np.nan, 'Small'],
            'color': ['Red', 'Blue', 'Yellow'],
            'age': [50, np.nan, 33],
        }
        new_df = pd.DataFrame(new_data)

        print("\n New Unseen Data (Original)")
        print(new_df)

        new_df_transformed = new_df.copy()
        for step in fitted_steps:
            step_type = step['type']
            if step_type == 'impute':
                new_df_transformed = transform_imputer(new_df_transformed, step)
            elif step_type == 'ordinal':
                new_df_transformed = transform_ordinal_encoder(new_df_transformed, step)
            elif step_type == 'one_hot':
                new_df_transformed = transform_one_hot_encoder(new_df_transformed, step)

        print("\n New Unseen Data (Transformed) ")
        print(new_df_transformed)

Enter path to CSV file (or press Enter for sample data): 
Loading sample data instead.

 Original DataFrame below
  tshirt_size  color   age
0      Medium   Blue  25.0
1       Small    Red  30.0
2       Large  Green  45.0
3      Medium   Blue   NaN
4         NaN    Red  22.0
5       Small  Green  30.0

--- INTERACTIVE PREPROCESSING :-) ---

Do you want to run imputation? (y/n): y
  Found missing values in: ['tshirt_size', 'age']
Strategy for categorical 'tshirt_size' will be 'mode' obviously.
  -> Added 'mode' imputer for 'tshirt_size'.
Strategy for numeric 'age' (mean/median/mode) [default: mean]: MODE
  -> Added 'mode' imputer for 'age'.

Do you want to run ordinal encoding? (y/n): n

Do you want to run one-hot encoding? (y/n): Y
  Available columns for OHE: ['tshirt_size', 'color']
  Enter column to one-hot encode (or 'all' or 'done'): ALL
  ERROR: Not a valid choice. Choose from: ['tshirt_size', 'color']
  Enter column to one-hot encode (or 'all' or 'done'): DONE

--- APPLYING TRAN