# In [None]:
"""
core/guardrail.py - The Guardrail Component.

This module implements the `Guardrail` class, which is responsible for moderating
user input and LLM responses to ensure they are safe and appropriate. It uses
a pre-trained text classification model for this purpose.
"""
from transformers import pipeline

class Guardrail:
    """
    Moderates text content for toxicity and other inappropriate language.

    The class uses a pre-trained `text-classification` model from the
    Hugging Face Transformers library to score text. Text is flagged as
    unsafe when the score of the "toxic" label exceeds a configurable
    threshold (0.7 by default).
    """

    # Default confidence above which the "toxic" label marks text as unsafe.
    # Kept fairly high to limit false positives.
    DEFAULT_THRESHOLD = 0.7

    # Classifier label that marks text as unsafe.
    TOXIC_LABEL = "toxic"

    # Class-level fallback so `check` still has a threshold on instances
    # that were created without running `__init__`.
    threshold = DEFAULT_THRESHOLD

    def __init__(self, threshold: float = DEFAULT_THRESHOLD):
        """
        Initializes the text classification pipeline.

        Args:
            threshold: Score in [0, 1] that the "toxic" label must exceed
                for text to be flagged. Defaults to 0.7.

        Raises:
            ValueError: If `threshold` is outside the range [0, 1].
        """
        # Validate first so a bad configuration fails before the model loads.
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")
        self.threshold = threshold
        self.moderator = pipeline("text-classification", model="unitary/toxic-bert")

    def check(self, text: str) -> bool:
        """
        Checks if the provided text is safe.

        Args:
            text: The string to be moderated. Empty, whitespace-only, or
                `None` input is treated as safe without calling the model.

        Returns:
            True if the text is considered safe, False otherwise.
        """
        # Nothing to moderate: skip the model call entirely.
        if text is None or not text.strip():
            return True

        # top_k=None asks for the score of every label, so the "toxic" score
        # is inspected even when the model ranks a different label first.
        # truncation=True asks the tokenizer to cut over-long input instead of
        # failing on it; NOTE: content past the model's length limit is then
        # not scored.
        raw = self.moderator(text, top_k=None, truncation=True)

        is_toxic = any(
            entry["label"] == self.TOXIC_LABEL and entry["score"] > self.threshold
            for entry in self._flatten_scores(raw)
        )
        return not is_toxic

    @staticmethod
    def _flatten_scores(raw) -> list:
        """
        Normalizes pipeline output into a flat list of score dictionaries.

        Accepts a single `{'label': ..., 'score': ...}` dictionary, a list of
        such dictionaries, or lists nested one or more levels deep, so the
        caller does not depend on the exact nesting the pipeline returns.
        """
        if isinstance(raw, dict):
            return [raw]
        flat = []
        for item in raw:
            flat.extend(Guardrail._flatten_scores(item))
        return flat

    def safe_response(self) -> str:
        """
        Returns a predefined safe response for when a query is flagged.
        """
        return "⚠️ Sorry, I cannot provide a response to that request."