In [4]:
import json
import gc
import re
import unicodedata
from langdetect import detect_langs
import ollama


class NLPOrchestrator:
    """Run a text through language detection, translation and LLM analysis.

    The pipeline implemented by :meth:`process` is:

    1. normalise the raw text (:meth:`clean_text`);
    2. guess its language (:meth:`detect_language`);
    3. translate it to English unless it is already English
       (:meth:`step_1_translate`);
    4. ask the model for a structured analysis of the English text
       (:meth:`step_2_deep_analysis`).

    Both LLM steps go through :meth:`_generate`, which calls ``ollama.chat``
    with the model id given at construction time.
    """

    # Places where a JSON payload may sit inside a model reply, tried in
    # order: a ```json fence, any ``` fence, then the widest {...} span.
    # Every pattern exposes the candidate text as group 1.
    _JSON_PATTERNS = (
        re.compile(r"```json\s*([\s\S]*?)\s*```", re.IGNORECASE),
        re.compile(r"```\s*([\s\S]*?)\s*```"),
        re.compile(r"(\{[\s\S]*\})"),
    )

    # Tokens whose presence suggests Hindi written in Latin script.
    _HINGLISH_MARKERS = (
        "hai", "haan", "han", "nahi", "nahin", "kyu", "kyun", "kya", "kaise",
        "mera", "meri", "mere", "tum", "aap", "ap", "hum", "ham",
        "mat", "kar", "karo", "kariye", "krdo", "krna", "kr diya",
        "wala", "wali", "wale", "se", "ko", "me", "mein", "par",
        "bahut", "bohot", "thoda", "jaldi", "abhi", "kal", "aaj",
    )

    def __init__(self, model_id="qwen3-vl:8b-instruct"):
        """Remember which Ollama model to use and set up the language names.

        Args:
            model_id: Model name passed to ``ollama.chat`` on every request.
        """
        print(f"Using Ollama model: {model_id}")
        self.model_id = model_id

        # Language code (as reported by langdetect) -> display name.
        self.lang_map = {
            "en": "English",
            "hi": "Hindi",
            "te": "Telugu",
            "ta": "Tamil",
            "bn": "Bengali",
            "mr": "Marathi",
            "gu": "Gujarati",
            "kn": "Kannada",
            "ml": "Malayalam",
            "pa": "Punjabi",
            "ur": "Urdu",
            "or": "Odia",
            "as": "Assamese",
            "sa": "Sanskrit",
        }

    # -----------------------------
    # Core generation (deterministic)
    # -----------------------------
    def _generate(self, system_prompt, user_input, max_tokens=512):
        """Send one system+user exchange to the model and return its reply.

        Args:
            system_prompt: Instructions sent with the ``system`` role.
            user_input: Text sent with the ``user`` role.
            max_tokens: Value passed as the ``num_predict`` option.

        Returns:
            The reply's message content with surrounding whitespace stripped.
        """
        response = ollama.chat(
            model=self.model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
            ],
            options={
                "temperature": 0.0,     # critical for JSON stability
                "num_predict": max_tokens,
            },
        )

        text = response["message"]["content"].strip()

        # NOTE(review): presumably here to limit memory growth across many
        # calls in a long-running session -- confirm it is still needed.
        gc.collect()
        return text

    # -----------------------------
    # Text cleaning
    # -----------------------------
    def clean_text(self, text: str) -> str:
        """Return ``text`` normalised for downstream processing.

        Applies NFKC normalisation, removes U+200C, U+200D and U+FEFF,
        collapses runs of spaces/tabs to one space, limits consecutive
        newlines to two, and strips leading/trailing whitespace.
        """
        text = unicodedata.normalize("NFKC", text)
        text = text.replace("\u200c", "").replace("\u200d", "")
        text = text.replace("\ufeff", "")
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    # -----------------------------
    # Romanized Hindi detector
    # -----------------------------
    def is_romanized_hindi(self, text: str) -> bool:
        """Heuristically decide whether ``text`` is Hindi in Latin script.

        Returns ``True`` only when the text has at least 20 characters, at
        least 85% of them are ASCII, and at least two distinct entries of
        the marker list occur as whole, space-delimited tokens.
        """
        if not text or len(text) < 20:
            return False

        ascii_ratio = sum(1 for c in text if ord(c) < 128) / max(len(text), 1)
        if ascii_ratio < 0.85:
            return False

        # Replace everything except ASCII letters/spaces with a space and pad
        # both ends so each marker can be matched as " marker ".
        text_l = " " + re.sub(r"[^a-zA-Z ]", " ", text.lower()) + " "
        hits = sum(1 for w in self._HINGLISH_MARKERS if f" {w} " in text_l)

        return hits >= 2

    # -----------------------------
    # Language detection
    # -----------------------------
    def detect_language(self, text):
        """Guess the language(s) of ``text``.

        Returns:
            A dict with ``lang_list`` (up to three display names) and
            ``primary_lang``. Romanized Hindi is checked first; otherwise
            ``detect_langs`` is used and codes missing from ``lang_map`` are
            reported upper-cased. Any failure yields ``"Unknown"``.
        """
        try:
            if self.is_romanized_hindi(text):
                return {
                    "lang_list": ["Hindi (Romanized)"],
                    "primary_lang": "Hindi (Romanized)",
                }

            langs = detect_langs(text)
            primary = langs[0].lang
            lang_list = [
                self.lang_map.get(cand.lang, cand.lang.upper())
                for cand in langs[:3]
            ]
            primary_lang = self.lang_map.get(primary, primary.upper())

            return {"lang_list": lang_list, "primary_lang": primary_lang}

        except Exception:
            # Best-effort: detection problems must not abort the pipeline.
            return {"lang_list": ["Unknown"], "primary_lang": "Unknown"}

    # -----------------------------
    # JSON parsing helpers
    # -----------------------------
    def _try_load_json(self, json_str: str):
        """Parse ``json_str`` as JSON, repairing common LLM slips if needed.

        A strict parse is attempted first so that well-formed payloads are
        never modified (the repairs below are textual and would otherwise
        also rewrite matching characters inside string values). Only when
        that fails are trailing commas before ``}``/``]`` removed and
        typographic quotes converted to ASCII quotes before retrying.

        Returns:
            The decoded value, or ``None`` when the input is empty or still
            not valid JSON after the repairs.
        """
        if not json_str:
            return None

        json_str = json_str.strip()

        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass  # fall through to the repair attempt

        repaired = re.sub(r",\s*}", "}", json_str)
        repaired = re.sub(r",\s*]", "]", repaired)
        # Written as escapes so the literals cannot be damaged by a file
        # re-encoding: U+201C / U+201D -> '"', U+2019 -> "'".
        repaired = (
            repaired.replace("\u201c", "\"")
            .replace("\u201d", "\"")
            .replace("\u2019", "'")
        )

        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            return None

    def _parse_json(self, response: str):
        """Extract the first usable JSON value from a model reply.

        Candidates are taken, in order, from a ```json fence, any ``` fence,
        and the widest ``{...}`` span. The first candidate that decodes to a
        truthy value is returned; empty/falsy payloads count as failures.

        Returns:
            The decoded value, or ``None`` (after printing a warning) when
            no candidate could be decoded.
        """
        if not response:
            return None

        for pattern in self._JSON_PATTERNS:
            match = pattern.search(response)
            if match is None:
                continue
            parsed = self._try_load_json(match.group(1))
            if parsed:
                return parsed

        # U+26A0 U+FE0F is the warning-sign emoji.
        print("\u26a0\ufe0f JSON parsing failed")
        return None

    # -----------------------------
    # Step 1: Translation
    # -----------------------------
    def step_1_translate(self, text, lang_info):
        """Translate ``text`` to English unless it is already English.

        Args:
            text: Cleaned input text.
            lang_info: Dict containing ``primary_lang`` (see
                :meth:`detect_language`).

        Returns:
            A dict that always contains a string ``translated_english_text``.
            English input is returned unchanged with confidence 1.0. If the
            model reply cannot be decoded into a dict carrying that key, the
            raw reply is used as the translation with confidence 0.6.
        """
        primary = lang_info["primary_lang"].lower()

        if primary == "english":
            return {"translated_english_text": text, "translation_confidence": 1.0}

        system_prompt = """You are a professional translator.
Translate the given text into fluent English.

Rules:
- Translate completely
- Preserve meaning and proper nouns
- Return ONLY valid JSON

Output JSON format:
{
  "translated_english_text": "...",
  "translation_confidence": 0.0
}
"""

        response = self._generate(system_prompt, text, max_tokens=800)
        result = self._parse_json(response)

        # The caller indexes result["translated_english_text"], so a reply
        # that decoded to a list/string, or to a dict without that key, must
        # be treated like a parse failure instead of crashing later.
        usable = isinstance(result, dict) and isinstance(
            result.get("translated_english_text"), str
        )
        if not usable:
            return {"translated_english_text": response, "translation_confidence": 0.6}

        return result

    # -----------------------------
    # Step 2: Deep analysis
    # -----------------------------
    def step_2_deep_analysis(self, english_text):
        """Ask the model for a structured analysis of ``english_text``.

        Returns:
            The decoded JSON object from the model. If the reply cannot be
            decoded into a dict, a neutral placeholder analysis is returned
            so that :meth:`process` can still merge it into its output.
        """
        system_prompt = """You are a security-focused NLP analyzer.
Perform comprehensive analysis on the given text.

CRITICAL RULES:
1. Country_iden: Select ONLY ONE value based on content context
2. Domains: Use exact capitalization from the list
3. Location NER: Only specific geographic places
4. Event dates: dd/mm/yyyy ONLY if explicitly mentioned
5. Sentiment "Anti-National": Only for direct threats to India

DOMAIN OPTIONS:
Politics, Crime, Military, Terrorism, Radicalisation, Extremism in J&K,
Law and Order, Narcotics, Left Wing Extremism, General

Return ONLY valid JSON in this format:
{
  "domain_ident": [],
  "sentiment": "",
  "NER": {
    "Person": [],
    "Location": [],
    "Organisation": [],
    "Event": [],
    "Product": []
  },
  "Event_calendar": [],
  "Country_iden": "",
  "Fact_checker": {
    "relevant_topics": [],
    "confidence_level": 0.0,
    "relevance_rating": ""
  },
  "Summary": ""
}
"""

        response = self._generate(system_prompt, english_text, max_tokens=1000)
        result = self._parse_json(response)

        # process() spreads the result with **, which requires a mapping.
        if not isinstance(result, dict):
            return {
                "domain_ident": ["General"],
                "sentiment": "Neutral",
                "NER": {
                    "Person": [],
                    "Location": [],
                    "Organisation": [],
                    "Event": [],
                    "Product": [],
                },
                "Event_calendar": [],
                "Country_iden": "Abroad",
                "Fact_checker": {
                    "relevant_topics": [],
                    "confidence_level": 0.0,
                    "relevance_rating": "Low",
                },
                "Summary": "Analysis failed.",
            }

        return result

    # -----------------------------
    # Orchestration
    # -----------------------------
    def process(self, text):
        """Run the full pipeline on ``text`` and return one flat result dict.

        The result holds the cleaned text, the detected languages, the
        English translation, and every top-level key of the analysis (merged
        last, so an analysis key with the same name takes precedence).
        """
        cleaned = self.clean_text(text)

        # Status glyphs are written as escapes so they survive re-encoding:
        # magnifying glass, memo, microscope.
        print("\U0001F50D Detecting language...")
        lang_info = self.detect_language(cleaned)

        print("\U0001F4DD Translation...")
        translation = self.step_1_translate(cleaned, lang_info)

        print("\U0001F52C Deep analysis...")
        analysis = self.step_2_deep_analysis(translation["translated_english_text"])

        return {
            "Cleaned_content": cleaned,
            "lang_list": lang_info["lang_list"],
            "primary_lang": lang_info["primary_lang"],
            "translated_english_text": translation["translated_english_text"],
            **analysis,
        }


In [5]:
content='''‡§Ö‡§Æ‡•á‡§∞‡§ø‡§ï‡•Ä ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§™‡§§‡§ø ‡§°‡•ã‡§®‡§æ‡§≤‡•ç‡§° ‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§®‡•á ‡§ó‡•Å‡§∞‡•Å‡§µ‡§æ‡§∞ ‡§ï‡•ã ‡§ï‡§•‡§ø‡§§ '‡§¨‡•ã‡§∞‡•ç‡§° ‡§ë‡§´ ‡§™‡•Ä‡§∏' ‡§™‡§π‡§≤ ‡§ï‡•Ä ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§ï‡§∞ ‡§¶‡•Ä.

‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§ï‡§æ ‡§ï‡§π‡§®‡§æ ‡§π‡•à ‡§ï‡§ø ‡§á‡§∏‡§ï‡§æ ‡§Æ‡§ï‡§º‡§∏‡§¶ ‡§ó‡§º‡§ú‡§º‡§æ ‡§Æ‡•á‡§Ç ‡§á‡§∏‡§∞‡§æ‡§á‡§≤ ‡§î‡§∞ ‡§π‡§Æ‡§æ‡§∏ ‡§ï‡•á ‡§¨‡•Ä‡§ö ‡§∏‡§Ç‡§ò‡§∞‡•ç‡§∑‡§µ‡§ø‡§∞‡§æ‡§Æ ‡§ï‡•ã ‡§∏‡•ç‡§•‡§æ‡§Ø‡•Ä ‡§¨‡§®‡§æ‡§®‡§æ ‡§î‡§∞ ‡§´‡§º‡§≤‡§∏‡•ç‡§§‡•Ä‡§®‡•Ä ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞ ‡§Æ‡•á‡§Ç ‡§è‡§ï ‡§Ö‡§Ç‡§§‡§∞‡§ø‡§Æ ‡§∏‡§∞‡§ï‡§æ‡§∞ ‡§ï‡•Ä ‡§®‡§ø‡§ó‡§∞‡§æ‡§®‡•Ä ‡§ï‡§∞‡§®‡§æ ‡§π‡•à.

‡§≠‡§æ‡§∞‡§§ ‡§â‡§® ‡§¶‡§∞‡•ç‡§ú‡§®‡•ã‡§Ç ‡§¶‡•á‡§∂‡•ã‡§Ç ‡§Æ‡•á‡§Ç ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§π‡•à, ‡§ú‡§ø‡§®‡•ç‡§π‡•á‡§Ç ‡§á‡§∏ ‡§¨‡•ã‡§∞‡•ç‡§° ‡§Æ‡•á‡§Ç ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§π‡•ã‡§®‡•á ‡§ï‡§æ ‡§®‡§ø‡§Æ‡§Ç‡§§‡•ç‡§∞‡§£ ‡§Æ‡§ø‡§≤‡§æ ‡§π‡•à.

‡§π‡§æ‡§≤‡§æ‡§Ç‡§ï‡§ø ‡§Ö‡§≠‡•Ä ‡§§‡§ï ‡§Ø‡§π ‡§∏‡•ç‡§™‡§∑‡•ç‡§ü ‡§®‡§π‡•Ä‡§Ç ‡§π‡•à ‡§ï‡§ø ‡§≠‡§æ‡§∞‡§§ ‡§á‡§∏‡•á ‡§∏‡•ç‡§µ‡•Ä‡§ï‡§æ‡§∞ ‡§ï‡§∞‡•á‡§ó‡§æ ‡§Ø‡§æ ‡§®‡§π‡•Ä‡§Ç. ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§Æ‡•á‡§Ç ‡§á‡§∏‡§∞‡§æ‡§á‡§≤ ‡§®‡•á ‡§≠‡•Ä ‡§á‡§∏‡•á ‡§≤‡•á‡§ï‡§∞ ‡§Ö‡§∏‡§π‡§Æ‡§§‡§ø ‡§ú‡§§‡§æ‡§à ‡§•‡•Ä ‡§≤‡•á‡§ï‡§ø‡§® ‡§¨‡§æ‡§¶ ‡§Æ‡•á‡§Ç ‡§∏‡•ç‡§µ‡•Ä‡§ï‡§æ‡§∞ ‡§ï‡§∞ ‡§≤‡§ø‡§Ø‡§æ ‡§•‡§æ.

‡§Ö‡§Æ‡•á‡§∞‡§ø‡§ï‡•Ä ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§™‡§§‡§ø ‡§°‡•ã‡§®‡§æ‡§≤‡•ç‡§° ‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§®‡•á ‡§ó‡•Å‡§∞‡•Å‡§µ‡§æ‡§∞ ‡§ï‡•ã ‡§Ö‡§™‡§®‡•á ‡§¨‡•ã‡§∞‡•ç‡§° ‡§ë‡§´ ‡§™‡•Ä‡§∏ ‡§ï‡•Ä ‡§î‡§™‡§ö‡§æ‡§∞‡§ø‡§ï ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§ï‡•Ä ‡§§‡•ã ‡§≠‡§æ‡§∞‡§§ ‡§â‡§® ‡§¶‡•á‡§∂‡•ã‡§Ç ‡§Æ‡•á‡§Ç ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§•‡§æ ‡§ú‡•ã ‡§á‡§∏ ‡§∏‡§Æ‡§æ‡§∞‡•ã‡§π ‡§Æ‡•á‡§Ç ‡§Æ‡•å‡§ú‡•Ç‡§¶ ‡§®‡§π‡•Ä‡§Ç ‡§•‡•á.

‡§™‡•ç‡§∞‡§ß‡§æ‡§®‡§Æ‡§Ç‡§§‡•ç‡§∞‡•Ä ‡§®‡§∞‡•á‡§Ç‡§¶‡•ç‡§∞ ‡§Æ‡•ã‡§¶‡•Ä ‡§â‡§® ‡§ï‡§à ‡§µ‡•à‡§∂‡•ç‡§µ‡§ø‡§ï ‡§®‡•á‡§§‡§æ‡§ì‡§Ç ‡§Æ‡•á‡§Ç ‡§•‡•á, ‡§ú‡§ø‡§®‡•ç‡§π‡•á‡§Ç ‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§®‡•á ‡§¨‡•ã‡§∞‡•ç‡§° ‡§Æ‡•á‡§Ç ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§π‡•ã‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§Ü‡§Æ‡§Ç‡§§‡•ç‡§∞‡§ø‡§§ ‡§ï‡§ø‡§Ø‡§æ ‡§π‡•à.

‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§ï‡•á ‡§®‡§ø‡§Æ‡§Ç‡§§‡•ç‡§∞‡§£ ‡§ï‡•ã ‡§∏‡•ç‡§µ‡•Ä‡§ï‡§æ‡§∞ ‡§ï‡§∞‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§¶‡•á‡§∂‡•ã‡§Ç ‡§Æ‡•á‡§Ç ‡§™‡§æ‡§ï‡§ø‡§∏‡•ç‡§§‡§æ‡§®, ‡§§‡•Å‡§∞‡•ç‡§ï‡•Ä, ‡§∏‡§ä‡§¶‡•Ä ‡§Ö‡§∞‡§¨ ‡§î‡§∞ ‡§∏‡§Ç‡§Ø‡•Å‡§ï‡•ç‡§§ ‡§Ö‡§∞‡§¨ ‡§Ö‡§Æ‡•Ä‡§∞‡§æ‡§§ ‡§≠‡•Ä ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§π‡•à‡§Ç.

‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§®‡•á ‡§ï‡§π‡§æ ‡§ï‡§ø 59 ‡§¶‡•á‡§∂‡•ã‡§Ç ‡§®‡•á ‡§¨‡•ã‡§∞‡•ç‡§° ‡§Æ‡•á‡§Ç ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§π‡•ã‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§π‡§∏‡•ç‡§§‡§æ‡§ï‡•ç‡§∑‡§∞ ‡§ï‡§ø‡§è ‡§π‡•à‡§Ç ‡§≤‡•á‡§ï‡§ø‡§® ‡§¶‡§æ‡§µ‡•ã‡§∏ ‡§Æ‡•á‡§Ç ‡§µ‡§ø‡§∂‡•ç‡§µ ‡§Ü‡§∞‡•ç‡§•‡§ø‡§ï ‡§Æ‡§Ç‡§ö ‡§ï‡•á ‡§¶‡•å‡§∞‡§æ‡§® ‡§Ü‡§Ø‡•ã‡§ú‡§ø‡§§ ‡§á‡§∏ ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§ï‡•ç‡§∞‡§Æ ‡§Æ‡•á‡§Ç ‡§ï‡•á‡§µ‡§≤ 19 ‡§¶‡•á‡§∂‡•ã‡§Ç ‡§ï‡•á ‡§™‡•ç‡§∞‡§§‡§ø‡§®‡§ø‡§ß‡§ø ‡§π‡•Ä ‡§Æ‡•å‡§ú‡•Ç‡§¶ ‡§•‡•á.'''

In [6]:
# Demo driver: build the orchestrator with its default model, run the sample
# article through the full pipeline and pretty-print the merged result.
if __name__ == "__main__":
    extractor = NLPOrchestrator()

    # Sample input: the `content` string defined in the previous cell.
    # NOTE(review): this comment used to say "Bengali", but the sample looks
    # like Devanagari (Hindi) text -- confirm.
    text = content
    
    print("=" * 60)
    print("NLP ORCHESTRATION PIPELINE")
    print("=" * 60)

    result = extractor.process(text)

    print("\n" + "=" * 60)
    print("FINAL OUTPUT")
    print("=" * 60)
    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    print(json.dumps(result, indent=2, ensure_ascii=False))

Using Ollama model: qwen3-vl:8b-instruct
NLP ORCHESTRATION PIPELINE
üîç Detecting language...
üìù Translation...
üî¨ Deep analysis...

FINAL OUTPUT
{
  "Cleaned_content": "‡§Ö‡§Æ‡•á‡§∞‡§ø‡§ï‡•Ä ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§™‡§§‡§ø ‡§°‡•ã‡§®‡§æ‡§≤‡•ç‡§° ‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§®‡•á ‡§ó‡•Å‡§∞‡•Å‡§µ‡§æ‡§∞ ‡§ï‡•ã ‡§ï‡§•‡§ø‡§§ '‡§¨‡•ã‡§∞‡•ç‡§° ‡§ë‡§´ ‡§™‡•Ä‡§∏' ‡§™‡§π‡§≤ ‡§ï‡•Ä ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§ï‡§∞ ‡§¶‡•Ä.\n\n‡§ü‡•ç‡§∞‡§Ç‡§™ ‡§ï‡§æ ‡§ï‡§π‡§®‡§æ ‡§π‡•à ‡§ï‡§ø ‡§á‡§∏‡§ï‡§æ ‡§Æ‡§ï‡§º‡§∏‡§¶ ‡§ó‡§º‡§ú‡§º‡§æ ‡§Æ‡•á‡§Ç ‡§á‡§∏‡§∞‡§æ‡§á‡§≤ ‡§î‡§∞ ‡§π‡§Æ‡§æ‡§∏ ‡§ï‡•á ‡§¨‡•Ä‡§ö ‡§∏‡§Ç‡§ò‡§∞‡•ç‡§∑‡§µ‡§ø‡§∞‡§æ‡§Æ ‡§ï‡•ã ‡§∏‡•ç‡§•‡§æ‡§Ø‡•Ä ‡§¨‡§®‡§æ‡§®‡§æ ‡§î‡§∞ ‡§´‡§º‡§≤‡§∏‡•ç‡§§‡•Ä‡§®‡•Ä ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞ ‡§Æ‡•á‡§Ç ‡§è‡§ï ‡§Ö‡§Ç‡§§‡§∞‡§ø‡§Æ ‡§∏‡§∞‡§ï‡§æ‡§∞ ‡§ï‡•Ä ‡§®‡§ø‡§ó‡§∞‡§æ‡§®‡•Ä ‡§ï‡§∞‡§®‡§æ ‡§π‡•à.\n\n‡§≠‡§æ‡§∞‡§§ ‡§â‡§® ‡§¶‡§∞‡•ç‡§ú‡§®‡•ã‡§Ç ‡§¶‡•á‡§∂‡•ã‡§Ç ‡§Æ‡•á‡§Ç ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§π‡•à, ‡§ú‡§ø‡§®‡•ç‡§π‡•á‡§Ç ‡§á‡§∏ ‡§¨‡•ã‡§∞‡•ç‡§° ‡§Æ‡•á‡§Ç ‡§∂‡§æ‡§Æ‡§ø‡§≤ ‡§π‡•ã‡§®‡•á ‡§ï‡§æ ‡§®‡§ø‡§Æ‡§Ç‡§