## Importing Dependencies

In [8]:
import re
import json
import os
import yaml

from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

from typing import List, Dict, Set, Any


from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv()  # Load environment variables from .env file

True

## Utility Module

In [9]:
def load_conversations(directory_path):
    """
    Load all conversation files from the specified directory. Supported formats are JSON and YAML.

    Files are visited in sorted filename order so that repeated runs return the
    conversations in the same sequence (``os.listdir`` alone gives no ordering
    guarantee). Extension matching is case-insensitive. A file that cannot be
    read or parsed is reported on stdout and skipped instead of aborting the
    whole load.

    Args:
        directory_path (str): Path to the directory containing conversation files.
    Returns:
        list of dict: List of conversations with 'call_id' (the filename without
        its extension) and 'conversation' (the parsed file content) keys.
    """
    all_convos = []
    for filename in sorted(os.listdir(directory_path)):
        lowered = filename.lower()
        if not lowered.endswith(('.json', '.yml', '.yaml')):
            continue
        try:
            with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as f:
                if lowered.endswith('.json'):
                    convo = json.load(f)
                else:
                    # safe_load only builds plain Python objects from the YAML input.
                    convo = yaml.safe_load(f)
            all_convos.append({'call_id': os.path.splitext(filename)[0], 'conversation': convo})
        except Exception as e:
            # Best effort: one unreadable file should not discard the rest of the corpus.
            print(f"Failed loading {filename}: {e}")
    return all_convos




## Regex Search

### Modules For Regex Search

In [10]:
def load_profanity_patterns(profanity_file):
    """
    Load profanity words from a file and compile regex patterns for each word.

    The file is read as UTF-8 with one entry per line; surrounding whitespace is
    stripped and blank lines are ignored. Repeated entries are compiled only
    once (first occurrence wins, file order is preserved).

    Each entry is matched case-insensitively as a whole token. The token
    boundaries are expressed as "not preceded / not followed by a word
    character" rather than ``\\b``: for ordinary words the two are equivalent,
    but ``\\b`` can never match next to an entry that itself starts or ends
    with a non-word character (e.g. ``f***``), so such entries would silently
    never be detected.

    Args:
        profanity_file (str): Path to the file containing profanity words.
    Returns:
        list: List of tuples (word, compiled_regex).
    """
    with open(profanity_file, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # dict.fromkeys drops exact duplicates while keeping file order.
        words = list(dict.fromkeys(word for word in stripped if word))
    patterns = [
        (word, re.compile(rf'(?<!\w){re.escape(word)}(?!\w)', re.IGNORECASE))
        for word in words
    ]
    return patterns



def detect_profanity_in_conversations_regex(conversations, profanity_patterns):
    """
    Detects profanity in conversations.

    Every utterance's text is searched with every compiled pattern; a hit adds
    the pattern's word to the set for that utterance's speaker. Speakers are
    compared case-insensitively: 'agent' feeds the 'Agent' set, 'customer' and
    'borrower' both feed the 'Customer' set, and any other speaker is ignored.

    Malformed input is tolerated rather than raising: a missing/None
    conversation counts as having no utterances, utterances that are not
    mappings are skipped, and a None speaker or text is treated as empty.

    Args:
        conversations (list): List of conversation dicts with 'call_id' and 'conversation'.
        profanity_patterns (list): List of tuples (word, compiled_regex).

    Returns:
    dict with call_id mapping to detected profanity info:
    {
        'call_id': {
            'Agent': set([...profane words...]),
            'Customer': set([...profane words...])
        }
    }
    """
    profanity_results = {}

    for call in tqdm(conversations, desc="Processing calls"):
        call_id = call['call_id']
        # A file that parsed to nothing yields None here; treat it as an empty call.
        utterances = call['conversation'] or []
        agent_profanities = set()
        customer_profanities = set()

        for utterance in utterances:
            if not isinstance(utterance, dict):
                continue

            speaker = str(utterance.get('speaker') or '').lower()
            if speaker == 'agent':
                bucket = agent_profanities
            elif speaker in ('customer', 'borrower'):
                bucket = customer_profanities
            else:
                # Unknown speakers are never recorded, so skip the pattern scan entirely.
                continue

            raw_text = utterance.get('text')
            text = '' if raw_text is None else str(raw_text)
            if not text:
                continue

            bucket.update(word for word, pattern in profanity_patterns if pattern.search(text))

        profanity_results[call_id] = {
            'Agent': agent_profanities,
            'Customer': customer_profanities
        }

    return profanity_results


def search_profanity_calls_regex(conversations_dir, profanity_file):
    """
    Run the regex-based profanity scan end to end and keep only the flagged calls.

    Loads the conversations from ``conversations_dir``, compiles the word list
    from ``profanity_file``, scans every call, and drops calls where neither
    the agent nor the customer used a listed word.

    Args:
        conversations_dir (str): Directory containing the conversation files.
        profanity_file (str): Path to the file containing profanity words.
    Returns:
        dict: call_id -> {'Agent': set, 'Customer': set} for each call in which
        at least one of the two sets is non-empty.
    """
    loaded_calls = load_conversations(conversations_dir)
    compiled_patterns = load_profanity_patterns(profanity_file)
    per_call = detect_profanity_in_conversations_regex(loaded_calls, compiled_patterns)

    flagged = {}
    for call_id, by_speaker in per_call.items():
        # Skip calls where both speakers came back clean.
        if not (by_speaker['Agent'] or by_speaker['Customer']):
            continue
        flagged[call_id] = by_speaker

    return flagged


### Usage

In [11]:
# Input locations, relative to the notebook's working directory.
conversations_directory = '../data/All_Conversations/'
profanity_wordlist = '../data/bad-words.txt'

# Scan every call and keep only those where at least one listed word was found.
calls_with_profanity = search_profanity_calls_regex(conversations_directory, profanity_wordlist)

# One report per flagged call, each followed by a blank separator line.
for call_id, profanities in tqdm(calls_with_profanity.items()):
    print(
        f"Call ID: {call_id}",
        f"  Agent profane words: {sorted(profanities['Agent'])}",
        f"  Customer profane words: {sorted(profanities['Customer'])}",
        "",
        sep="\n",
    )

Processing calls: 100%|██████████| 250/250 [00:02<00:00, 116.24it/s]
100%|██████████| 42/42 [00:00<00:00, 43875.66it/s]

Call ID: 87e28dae-2c70-4122-9452-6ec82164fab2
  Agent profane words: ['damn']
  Customer profane words: []

Call ID: 80e5fe02-ec03-4ca5-902a-fdf38e6b7b8a
  Agent profane words: ['american']
  Customer profane words: []

Call ID: 3e6dde01-1a46-42b4-92dd-0a211185e660
  Agent profane words: []
  Customer profane words: ['damn', 'hell']

Call ID: 89d28c5a-f0da-41e9-a93e-0a749c20189c
  Agent profane words: ['crap', 'damn', 'hell', 'shit']
  Customer profane words: []

Call ID: d7bbea61-d739-43fb-a198-ced1b59f9491
  Agent profane words: []
  Customer profane words: ['assholes', 'fuck', 'hell', 'shit']

Call ID: 9505f0e7-5404-4f50-a497-35a9e899197c
  Agent profane words: ['crap', 'damn', 'hell', 'sick']
  Customer profane words: []

Call ID: d071bb49-40f8-4bae-8c6d-cfc0d4a011d5
  Agent profane words: []
  Customer profane words: ['bullshit', 'crap', 'damn', 'hell', 'stupid']

Call ID: 8598c6d9-a767-4120-af0a-5490357c72f3
  Agent profane words: []
  Customer profane words: ['crap', 'damn', 'he




## LLM Prompt System

In [12]:
# Groq API key taken from the environment (None when the variable is unset).
# NOTE(review): nothing else in the visible notebook reads `groq_api_key`, and
# detect_profanity_llm_batch_calls looks up GROQ_API_KEY3 rather than
# GROQ_API_KEY2 -- confirm which variable is the intended one.
groq_api_key = os.environ.get('GROQ_API_KEY2')

In [13]:


def _no_profanity_fallback(calls_batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Build a 'nothing detected' result for every utterance of every call in the batch."""
    return [
        {
            "call_id": call['call_id'],
            "utterances": [
                {"utterance_number": i, "has_profanity": False, "profane_words": []}
                for i in range(1, len(call.get('conversation') or []) + 1)
            ]
        }
        for call in calls_batch
    ]


def detect_profanity_llm_batch_calls(calls_batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Detect profanity in a batch of calls using a single LLM request.

    The instructions and every call's numbered utterances are sent as one
    message; the model is asked to answer with a JSON array holding one object
    per call. Utterance numbers in the prompt are 1-based and follow the order
    of the call's 'conversation' list.

    Failure handling is best effort: if the request fails or the reply is not a
    JSON array (a surrounding Markdown code fence is tolerated), the failure is
    printed and a result marking every utterance as clean is returned, so the
    caller still gets one entry per call. Only ``Exception`` subclasses are
    handled this way; KeyboardInterrupt and SystemExit propagate. Errors raised
    while constructing the client (e.g. bad configuration) are not swallowed.

    Args:
        calls_batch: List of call dictionaries with 'call_id' and 'conversation' keys

    Returns:
        List of profanity detection results for each call in the batch
        (empty list for an empty batch, without contacting the model).
    """
    if not calls_batch:
        return []

    # Setup LLM
    llm = ChatGroq(
        model="openai/gpt-oss-20b",
        temperature=0.0,
        max_retries=3,
        api_key=os.environ.get("GROQ_API_KEY3"),
    )

    # Build batch prompt
    prompt_text = (
        "# Role: Expert content moderator for profanity detection\n"
        "# Task: Detect profanity in debt collection call utterances\n\n"
        "# Rules:\n"
        "- Flag direct profanity and offensive language (f*ck, sh*t, damn, hell, b*tch, a**hole, etc.)\n"
        "- Include mild profanity (damn, hell, crap) and strong profanity (f-word, s-word)\n"
        "- Consider words like 'screw you', 'pissed off', 'bullsh*t' as profanity\n"
        "- Do NOT flag normal business language, frustration without swearing, or polite speech\n"
        "- List the exact profane words detected\n\n"
        "# Examples:\n"
        "- 'This is f*cking ridiculous' -> has_profanity: true, profane_words: ['fucking']\n"
        "- 'I'm really frustrated' -> has_profanity: false, profane_words: []\n"
        "- 'What the hell is going on?' -> has_profanity: true, profane_words: ['hell']\n"
        "- 'You're being unreasonable' -> has_profanity: false, profane_words: []\n\n"
        "# Output Format: Return ONLY valid JSON array:\n"
        "[\n"
        "  {\n"
        "    \"call_id\": \"string\",\n"
        "    \"utterances\": [\n"
        "      {\"utterance_number\": int, \"has_profanity\": bool, \"profane_words\": [\"word1\"]}\n"
        "    ]\n"
        "  }\n"
        "]\n\n"
        "Calls to analyze:\n"
    )

    # Collect the per-call sections and join once instead of repeated string +=.
    sections = []
    for idx, call in enumerate(calls_batch, 1):
        sections.append(f"\nCall {idx} ID: {call['call_id']}\n")
        for i, utt in enumerate(call.get('conversation') or [], 1):
            # Every utterance keeps its slot so the numbering stays aligned with
            # the conversation list, even when an entry is malformed.
            fields = utt if isinstance(utt, dict) else {}
            speaker = str(fields.get('speaker') or 'unknown').capitalize()
            text = str(fields.get('text') or '').replace('\n', ' ')
            sections.append(f"{i}. [{speaker}] {text}\n")
    prompt_text += "".join(sections)

    try:
        response = llm.invoke([("system", prompt_text)])
        content = response.content.strip()
        # Models often wrap JSON in a Markdown fence (```json ... ```); unwrap it.
        fenced = re.match(r"^```[a-zA-Z]*\s*(.*?)\s*```$", content, re.DOTALL)
        if fenced:
            content = fenced.group(1)
        result = json.loads(content)
    except Exception as exc:
        # Fallback: no profanity detected -- but say so, otherwise an outage is
        # indistinguishable from a genuinely clean batch.
        print(f"LLM profanity detection failed for a batch of {len(calls_batch)} call(s): {exc!r}")
        return _no_profanity_fallback(calls_batch)

    if not isinstance(result, list):
        print(
            f"LLM returned {type(result).__name__} instead of a JSON array for a batch of "
            f"{len(calls_batch)} call(s); treating the batch as clean"
        )
        return _no_profanity_fallback(calls_batch)

    return result


def process_conversations_llm_batch_calls(conversations: List[Dict[str, Any]], 
                                        batch_size: int = 25) -> Dict[str, Dict[str, Set[str]]]:
    """
    Process all conversations in batches to detect profanity.

    Each batch is sent to ``detect_profanity_llm_batch_calls``. For every
    utterance the model flags, the 1-based ``utterance_number`` is mapped back
    to the original conversation to find the speaker, and the reported words
    are added to that speaker's set ('agent' -> "Agent"; 'customer' or
    'borrower' -> "Customer"; other speakers are ignored).

    Model output is treated as untrusted: entries with an unknown call_id, a
    missing/non-numeric/out-of-range utterance number, or non-string words are
    skipped rather than raising. Numbers below 1 are rejected explicitly, since
    they would otherwise index the conversation from its end. Every call in
    ``conversations`` gets an entry in the result, even when the model's reply
    omits it.

    Args:
        conversations: List of conversation dictionaries with call_id and conversation data
        batch_size: Number of calls to process in each batch (must be >= 1)

    Returns:
        Dictionary mapping call_id to profanity results by speaker:
        {"call_id": {"Agent": set(), "Customer": set()}}

    Raises:
        ValueError: If batch_size is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    profanity_results = {}

    for i in tqdm(range(0, len(conversations), batch_size), desc="Processing batches"):
        batch_calls = conversations[i:i + batch_size]
        batch_detections = detect_profanity_llm_batch_calls(batch_calls)

        # Index the batch by call_id once (first occurrence wins) and make sure
        # every call has a result entry regardless of what the model returns.
        calls_by_id = {}
        for call in batch_calls:
            calls_by_id.setdefault(call['call_id'], call)
            profanity_results.setdefault(call['call_id'], {"Agent": set(), "Customer": set()})

        for call_res in batch_detections:
            if not isinstance(call_res, dict):
                continue

            call_id = call_res.get("call_id")
            try:
                call_conversation = calls_by_id.get(call_id)
            except TypeError:
                # Unhashable call_id (e.g. a list) cannot refer to a known call.
                continue
            if call_conversation is None:
                continue

            utterances = call_conversation.get('conversation') or []
            speaker_sets = profanity_results[call_id]

            # Process utterance results
            for utt_res in call_res.get("utterances") or []:
                if not isinstance(utt_res, dict) or not utt_res.get("has_profanity", False):
                    continue

                try:
                    utt_idx = int(utt_res["utterance_number"]) - 1
                except (KeyError, TypeError, ValueError):
                    continue
                # Lower bound matters: a negative index would silently pick an
                # utterance counted from the end of the conversation.
                if not 0 <= utt_idx < len(utterances):
                    continue

                utterance = utterances[utt_idx]
                if not isinstance(utterance, dict):
                    continue
                speaker = str(utterance.get('speaker') or '').lower()

                words = utt_res.get("profane_words") or []
                if isinstance(words, str):
                    # A bare string would otherwise be added character by character.
                    words = [words]
                words = [w.strip() for w in words if isinstance(w, str) and w.strip()]

                if speaker == "agent":
                    speaker_sets["Agent"].update(words)
                elif speaker in ("customer", "borrower"):
                    speaker_sets["Customer"].update(words)

    return profanity_results


def display_results(results: Dict[str, Dict[str, Set[str]]]) -> None:
    """
    Display profanity detection results with summary statistics.

    Prints one entry per call that has at least one profane word (words sorted,
    speakers with no words omitted), followed by a summary: total calls, calls
    with profanity, and the number and share of calls with agent / customer
    profanity. With an empty ``results`` the shares are reported as 0.0%
    instead of dividing by zero.

    Args:
        results: Dictionary mapping call_id to profanity results by speaker
    """
    divider = "=" * 50

    # Print individual results
    print("\n" + divider)
    print("PROFANITY DETECTION RESULTS")
    print(divider)

    calls_with_profanity = 0
    for call_id, profs in results.items():
        agent_words = sorted(profs['Agent'])
        customer_words = sorted(profs['Customer'])

        if agent_words or customer_words:
            calls_with_profanity += 1
            print(f"\nCall ID: {call_id}")
            if agent_words:
                print(f"  Agent: {agent_words}")
            if customer_words:
                print(f"  Customer: {customer_words}")

    # Print summary statistics
    total_calls = len(results)
    agent_calls = sum(1 for data in results.values() if data['Agent'])
    customer_calls = sum(1 for data in results.values() if data['Customer'])
    # No calls at all means both shares are zero; avoid ZeroDivisionError.
    agent_share = agent_calls / total_calls if total_calls else 0.0
    customer_share = customer_calls / total_calls if total_calls else 0.0

    print("\n" + divider)
    print("SUMMARY")
    print(divider)
    print(f"Total calls: {total_calls}")
    print(f"Calls with profanity: {calls_with_profanity}")
    print(f"Agent profanity in {agent_calls} calls ({agent_share:.1%})")
    print(f"Customer profanity in {customer_calls} calls ({customer_share:.1%})")



In [14]:
# Directory holding the conversation files, relative to the notebook.
DATA_DIR = "../data/All_Conversations"

# Read every conversation file found in DATA_DIR.
conversations = load_conversations(DATA_DIR)

# Send the calls to the LLM, 25 calls per request.
results = process_conversations_llm_batch_calls(
    conversations,
    batch_size=25,
)

# Per-call findings followed by the summary statistics.
display_results(results)

Processing batches: 100%|██████████| 10/10 [09:44<00:00, 58.45s/it]


PROFANITY DETECTION RESULTS

Call ID: 3e6dde01-1a46-42b4-92dd-0a211185e660
  Customer: ['damn', 'f***', 'hell']

Call ID: d7bbea61-d739-43fb-a198-ced1b59f9491
  Customer: ['fuck', 'hell', 'shit']

Call ID: 9505f0e7-5404-4f50-a497-35a9e899197c
  Agent: ['crap', 'damn', 'hell']
  Customer: ['freaking']

Call ID: d071bb49-40f8-4bae-8c6d-cfc0d4a011d5
  Customer: ['bullshit', 'crap', 'damn', 'hell']

Call ID: 8598c6d9-a767-4120-af0a-5490357c72f3
  Customer: ['a**', 'crap', 'damn', 'f***']

Call ID: b70866a2-2f46-4784-992b-74d6dc60806e
  Customer: ['damn', 'f***', 'hell', 'pissing', 'screw', 'shit']

Call ID: 216fd3c8-1a80-484d-8792-464771794d9e
  Customer: ['f***', 'hell']

Call ID: 4f05aaff-585f-485f-90a1-60d027ec6e46
  Agent: ['damn', 'hell', 'shit']

Call ID: ab6ec93c-09e1-4b88-8777-574ceb28cd05
  Customer: ['crap', 'damn', 'hell', 'screw']

Call ID: 52ddb4a0-0599-4e18-a961-ac4b67da8d5e
  Agent: ['hell']

Call ID: 50abe040-847c-47e1-962f-a00d7ff2cfe4
  Agent: ['crap', 'damn']

Call ID: 


