In [2]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.tools import tool
from langchain.agents import create_agent
from langchain_core.messages import SystemMessage, HumanMessage
from pydantic import BaseModel, Field
import re

  from .autonotebook import tqdm as notebook_tqdm


## Model selection

Better to pick Instruct models and set the temperature to 0.01 - 0.05. In the other way the model wont be deterministic. <br>
Also carefully check of the model supports tool calling.

### Selected model

Qwen/Qwen2.5-7B-Instruct - one of the best models for this purpose. However all hugging face models restricts very large number of output tokens (32k max - input + output). <br>
Hence, all the files should be processed by chunks.

In [3]:
endpoint = HuggingFaceEndpoint(
    repo_id='Qwen/Qwen2.5-7B-Instruct',
    task='text-generation',
    do_sample=False,
    temperature=0.05,
    max_new_tokens=1800,
    streaming=False
)

llm = ChatHuggingFace(llm=endpoint)

In [4]:
llm.invoke('Hello! What is your name?')

AIMessage(content="Hello! I'm Qwen, an AI assistant created by Alibaba Cloud. You can call me Qwen. How can I assist you today?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 30, 'prompt_tokens': 36, 'total_tokens': 66}, 'model_name': 'Qwen/Qwen2.5-7B-Instruct', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--019c75e7-d7be-72f3-a741-e5a7fc7ab292-0', tool_calls=[], invalid_tool_calls=[], usage_metadata={'input_tokens': 36, 'output_tokens': 30, 'total_tokens': 66})

In [5]:
@tool
def remove_brackets_content(text: str) -> str:
    """
    Remove all content inside square brackets [],
    round brackets () and curly brackets {}.
    Useful for removing sound descriptions, speaker labels,
    stage directions like [applause], (laughs), {music}.
    """
    text = re.sub(r'\[.*?\]', '', text)   # [applause]
    text = re.sub(r'\(.*?\)', '', text)   # (laughs)
    text = re.sub(r'\{.*?\}', '', text)   # {music}
    return text.strip()


@tool
def remove_non_alphabetic(text: str) -> str:
    """
    Remove all non-alphabetic characters except spaces.
    Keeps only letters A-Z, a-z and whitespace.
    Useful for stripping punctuation, numbers, special symbols.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()


@tool
def remove_newlines(text: str) -> str:
    """
    Remove newline characters and replace them with spaces.
    Merges multi-line subtitle blocks into single lines.
    """
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = re.sub(r' +', ' ', text)  # collapse multiple spaces
    return text.strip()


@tool
def remove_dialog_punctuation(text: str) -> str:
    """
    Remove dialog-specific punctuation: dashes at line start (- text),
    ellipsis (...), double dashes (--), quotation marks,
    and excessive punctuation used in subtitles.
    """
    text = re.sub(r'^\s*-+\s*', '', text, flags=re.MULTILINE)  # leading dashes
    text = re.sub(r'\.{2,}', '', text)    # ellipsis ...
    text = re.sub(r'-{2,}', '', text)     # double dash --
    text = re.sub(r'["""\'\'\']+', '', text)  # quotes
    text = re.sub(r'[!?,;:]+', '', text)  # dialog punctuation
    return text.strip()


@tool
def remove_timestamps(text: str) -> str:
    """
    Remove SRT/VTT subtitle timestamps.
    Handles formats like:
    - 00:01:23,456 --> 00:01:25,789  (SRT)
    - 00:01:23.456 --> 00:01:25.789  (VTT)
    Also removes bare sequence numbers (1, 2, 3...) used in SRT files.
    """
    # SRT timestamps
    text = re.sub(
        r'\d{2}:\d{2}:\d{2}[.,]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[.,]\d{3}',
        '', text
    )
    # VTT cue identifiers
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # WEBVTT header
    text = re.sub(r'WEBVTT.*\n?', '', text)
    return text.strip()


@tool
def remove_speaker_labels(text: str) -> str:
    """
    Remove speaker labels commonly found in subtitles.
    Handles formats like:
    - JOHN: text
    - John: text
    - [JOHN]: text
    - <v John> text  (VTT format)
    """
    text = re.sub(r'^[A-Z][A-Z\s]{1,20}:\s*', '', text, flags=re.MULTILINE)  # JOHN:
    text = re.sub(r'^\w[\w\s]{1,20}:\s*', '', text, flags=re.MULTILINE)       # John:
    text = re.sub(r'<v\s+[^>]+>', '', text)                                    # <v John>
    return text.strip()


@tool
def remove_html_tags(text: str) -> str:
    """
    Remove HTML/XML tags commonly found in subtitles.
    Handles: <i>, <b>, <u>, <font color="">, <c.colorname> etc.
    Used in SRT and VTT files for styling.
    """
    text = re.sub(r'<[^>]+>', '', text)
    return text.strip()


@tool
def normalize_whitespace(text: str) -> str:
    """
    Normalize all whitespace: collapse multiple spaces into one,
    strip leading/trailing spaces from each line,
    remove empty lines.
    Final cleanup step — use after all other tools.
    """
    lines = text.split('\n')
    lines = [re.sub(r' +', ' ', line).strip() for line in lines]
    lines = [line for line in lines if line]  # remove empty
    return ' '.join(lines)


@tool
def lowercase_text(text: str) -> str:
    """
    Convert all text to lowercase.
    Recommended for sentiment analysis preprocessing
    to ensure uniform token representation.
    """
    return text.lower()


@tool
def remove_filler_words(text: str) -> str:
    """
    Remove common spoken filler words that add noise for sentiment analysis.
    Removes: um, uh, hmm, ah, oh, er, erm, hm, gonna, wanna, gotta etc.
    """
    fillers = r'\b(um+|uh+|hmm+|hm+|ah+|oh+|er+|erm+|gonna|wanna|gotta|kinda|sorta|like|okay|ok|yeah|yep|nope)\b'
    text = re.sub(fillers, '', text, flags=re.IGNORECASE)
    text = re.sub(r' +', ' ', text)
    return text.strip()

## Prompt eng. and Formatting

Prompt can (and must) be changed, but the agentic solutions lacks determinism. Hence, we can not grantee high quality and structured output even with special tools.

In [6]:
class FormattedResponse(BaseModel):
    """Cleaned text"""
    CLEANED_TEXT: str = Field(description="The CLEANED subtitle text")


tools = [
    remove_timestamps,
    remove_brackets_content,
    remove_html_tags,
    remove_speaker_labels,
    remove_dialog_punctuation,
    remove_newlines,
    remove_non_alphabetic,
    remove_filler_words,
    lowercase_text,
    normalize_whitespace,
]

prompt ="""Your goal is to clean raw subtitle text step by step using available tools.

## Input text format:
The user will provide the subtitle text marked as SUBTITLE_TEXT.

## Recommended cleaning pipeline (follow this order):
1. remove_timestamps         — strip SRT/VTT timing info
2. remove_brackets_content   — remove [sound], (laughter), {{music}}
3. remove_html_tags          — strip <i>, <b>, <font> tags
4. remove_speaker_labels     — remove JOHN:, John:, <v John>
5. remove_dialog_punctuation — remove ---, ..., quotes, !?;:,
6. remove_newlines           — merge lines into single text
7. remove_non_alphabetic     — keep only letters and spaces
8. remove_filler_words       — remove um, uh, gonna, wanna...
9. lowercase_text            — convert to lowercase
10. normalize_whitespace     — final cleanup of spaces

Apply ALL steps unless the user specifies otherwise.
After cleaning, return the final cleaned text clearly labeled as:

CLEANED_TEXT: <result>
"""

structured_llm = llm.with_structured_output(FormattedResponse, method='json_mode')
agent = create_agent(llm, tools, system_prompt=SystemMessage(prompt))

In [7]:
text = """[ Women, A Cappella ] ♪ I've got no secrets kickin' up from behind ♪
♪ We keep no secrets We play right in time ♪
♪ Your time is gonna come
- ♪ I think I hear it now - [ Bell Dings ]
♪♪ [ Band ]
[ Woman Narrating ] In the late 1960s, a few women artists formed a coalition...
and named it WAR-- Women Artists in Revolution.
♪ We come together in this garden for a day ♪
♪ Your time is gonna come
♪ I think I hear it now
[ Narrator ] And you have to ask yourself...
why it was necessary for them to do this in the first place.
The books that you read
in those days...
were written in a way
that denigrated
women artists
"""

In [15]:
result = agent.invoke({'messages': [HumanMessage(f'Clean the following text:\n\nSUBTITLE_TEXT:\n{text}')]}
)

In [16]:
structured_result = structured_llm.invoke(
    f"Return this cleaned text in the required format:\n\n{result['messages'][-1].content}"
)
print(structured_result)
output = structured_result.get('CLEANED_TEXT', None)
if output is None:
    output = structured_result.get('cleaned_text', None)
print(output)

{'text': "I've got no secrets kicking up from behind, we keep no secrets, we play right in time. Your time is gonna come, I think I hear it now. Bell dings, band. In the late 1960s, a few women artists formed a coalition and named it WAR—Women Artists in Revolution. We come together in this garden for a day. Your time is gonna come, I think I hear it now. And you have to ask yourself why it was necessary for them to do this in the first place. The books that you read in those days were written in a way that denigrated women artists."}
None


## Beautiful pipeline

In [10]:
from langchain_core.runnables import RunnableLambda

pipeline = (
    RunnableLambda(lambda text: {
        'messages': [HumanMessage(f'Clean the following text:\n\nSUBTITLE_TEXT:\n{text}')]
    })
    | agent
    | RunnableLambda(lambda result: f"Return this cleaned text in the required format:\n\n{result['messages'][-1].content}")
    | structured_llm
)

result = pipeline.invoke(text)
print(result)

{'cleaned_text': "I've got no secrets kickin' up from behind. We keep no secrets. We play right in time. Your time is gonna come. I think I hear it now. Bell dings. Band. In the late 1960s, a few women artists formed a coalition and named it WAR—Women Artists in Revolution. We come together in this garden for a day. Your time is gonna come. I think I hear it now. And you have to ask yourself why it was necessary for them to do this in the first place. The books that you read in those days were written in a way that denigrated women artists."}


## Bonus

Compare the time and quality :)

In [None]:
from langchain_core.runnables import chain

@chain
def clean_subtitle(text: str) -> FormattedResponse:
    """Deterministic cleaning pipeline — no LLM needed."""
    text = remove_timestamps.invoke(text)
    text = remove_brackets_content.invoke(text)
    text = remove_html_tags.invoke(text)
    text = remove_speaker_labels.invoke(text)
    text = remove_dialog_punctuation.invoke(text)
    text = remove_newlines.invoke(text)
    text = remove_non_alphabetic.invoke(text)
    text = remove_filler_words.invoke(text)
    text = lowercase_text.invoke(text)
    text = normalize_whitespace.invoke(text)
    return FormattedResponse(CLEANED_TEXT=text)


result = clean_subtitle.invoke(text)
print(result.CLEANED_TEXT)

ive got no secrets kickin up from behind we keep no secrets we play right in time your time is come i think i hear it now in the late s a few women artists formed a coalition and named it war women artists in revolution we come together in this garden for a day your time is come i think i hear it now and you have to ask yourself why it was necessary for them to do this in the first place the books that you read in those days were written in a way that denigrated women artists
