In [4]:
import pandas as pd

In [5]:
from typing import List

def choose_stream(
    df,
    excluded: List[str] = ["created_at", "entry_id"],
    min_selection: int = 3
) -> List[str]:
    """
    Interactively ask the user to pick data streams (columns) from a dataframe.

    The selectable columns are printed first, then the user is prompted
    (via input()) until a valid answer is given. An answer is valid when:

    - it names at least `min_selection` distinct columns, and
    - every name matches a selectable (non-excluded) column exactly

    Names are comma-separated; surrounding whitespace and empty entries are
    dropped, and duplicates are removed while keeping first-seen order.

    Parameters:
        df (pd.DataFrame): The dataset; only `df.columns` is read
        excluded (List[str]): Columns to ignore for selection
            (the default list is only read, never modified)
        min_selection (int): Minimum required columns

    Returns:
        List[str]: Validated list of selected column names, in the order
        the user entered them

    Raises:
        ValueError: If fewer than `min_selection` selectable columns exist,
            because no input could ever pass validation in that case
    """

    # Filter available columns
    available_fields = [col for col in df.columns if col not in excluded]

    # Fail fast instead of prompting forever: with too few selectable
    # columns the validation loop below could never be satisfied.
    if len(available_fields) < min_selection:
        raise ValueError(
            f"Only {len(available_fields)} selectable field(s) available, "
            f"but min_selection is {min_selection}."
        )

    # Display fields (the header reflects the actual minimum, not a fixed 3)
    print(f"\nAvailable fields (select at least {min_selection}):")
    print("-" * 40)
    for col in available_fields:
        print(f"- {col}")
    print("-" * 40)

    # Interactive loop: re-prompt until the answer passes every check
    while True:
        user_input = input(f"Enter at least {min_selection} field names separated by commas: ")

        # Clean and normalize input: trim whitespace, drop empty entries
        selected = [col.strip() for col in user_input.split(",") if col.strip()]
        # dict.fromkeys removes duplicates while preserving first-seen order
        selected = list(dict.fromkeys(selected))

        # Validation checks
        if len(selected) < min_selection:
            print(f"Error: Please select at least {min_selection} fields.")
            continue

        invalid = [col for col in selected if col not in available_fields]
        if invalid:
            print(f"Error: Invalid field(s): {', '.join(invalid)}")
            print(f"Valid options are: {', '.join(available_fields)}")
            continue

        return selected


In [6]:
# Load the CSV export into a DataFrame (path is relative to the working directory).
df = pd.read_csv("./datasets/2881821.csv")
# Prompt interactively for the columns to use; with the defaults this skips
# 'created_at' / 'entry_id' and requires at least 3 valid column names.
selected_streams = choose_stream(df)
# Echo the validated selection.
print("Selected Streams : ", selected_streams)



Available fields (select at least 3):
----------------------------------------
- field1
- field2
- field3
- field4
- field5
- field6
- field7
- field8
----------------------------------------
Selected Streams :  ['field1', 'field2', 'field3']
