In [None]:
import json
import gc
import re
import unicodedata
import hashlib
from functools import lru_cache
from typing import Dict, List, Tuple, Optional
from langdetect import detect_langs
import ollama


class MultiLanguageRomanizationDetector:
    """Heuristic detector for Indian languages written in Latin script.

    Two signals are combined per language:

    * marker words: common romanized words, matched as whole words;
    * character bigrams: used only to reinforce a language that already
      has at least one marker hit. Bigrams such as "th", "ch" or "sh" are
      ubiquitous in English, so on their own they are not evidence.

    All markers must be lower-case ASCII, because the input is lower-cased
    and reduced to ``[a-z ]`` before matching.
    """

    def __init__(self):
        # Language-specific marker words (most common romanized words)
        self.markers = {
            "hindi": {
                "hai", "haan", "nahi", "nahin", "kyu", "kya", "kaise",
                "mera", "meri", "mere", "tera", "teri", "tum", "aap",
                "kar", "karo", "karna", "kiya", "hum", "main",
                "bahut", "thoda", "jaldi", "abhi", "kal", "aaj",
                "wala", "wali", "wale", "se", "ko", "me", "mein",
                "achha", "bura", "bhi", "baat", "sahi", "galat"
            },
            "tamil": {
                "antha", "ithu", "athu", "enna", "epdi", "eppadi",
                "naan", "nee", "avan", "aval", "namma", "ungal",
                "illa", "irukku", "irukkadhu", "panna", "pannu",
                "nalla", "ketta", "romba", "konjam", "ippo", "naalaiku",
                "da", "di", "nga", "ma", "pa", "sollu", "solla",
                "vaa", "po", "poda", "vara", "poga", "vandha"
            },
            "telugu": {
                "andi", "enti", "ela", "ela unnaru", "chala",
                "nenu", "nuvvu", "vaadu", "aame", "manam", "meeru",
                "ledu", "undi", "undadu", "cheyyi", "chey",
                "manchidi", "cheddadi", "konchem", "ippudu", "repu",
                "ra", "ri", "ru", "anna", "akka", "cheppu", "cheppandi",
                "raa", "vellu", "vellandi", "vacchi", "velli"
            },
            "bengali": {
                "aache", "nai", "nei", "keno", "ki", "kivabe",
                "amar", "tomar", "tumi", "apni", "amra", "ora",
                "koro", "korben", "korechi", "hobe", "hoyeche",
                "bhalo", "kharap", "onek", "ektu", "ekhon", "kal",
                "bolo", "bolben", "jao", "jan", "esho", "elen"
            },
            "marathi": {
                "aahe", "nahi", "kay", "kasa", "kuthe",
                "maza", "tuja", "tu", "tumhi", "aamhi", "tyanche",
                "kar", "kara", "kela", "hota", "honar",
                "changle", "vait", "khup", "thode", "aata", "udya",
                "sang", "sanga", "ja", "ya", "ye", "aala"
            },
            "gujarati": {
                "chhe", "nathi", "kai", "kem", "kya",
                "maru", "taru", "tame", "aapne", "ame", "teo",
                "karo", "karjo", "karyu", "hashe", "thayu",
                "saru", "kharaab", "ghanu", "thodu", "aaje", "kale",
                "bolo", "kahjo", "jao", "aavo", "avyo"
            },
            "kannada": {
                "ide", "illa", "enu", "hege", "elli",
                "nanna", "ninna", "neevu", "avanu", "avalu", "naavu",
                "maadi", "maadu", "maadiruva", "agutte", "aayitu",
                # "swalpa" previously ended in a non-Latin look-alike letter
                # and a second spelling of "heli" carried a diacritic; neither
                # could ever match the ASCII-only text.
                "chennagide", "ketta", "thumba", "swalpa", "eeega", "naale",
                "heli", "hogi", "baa", "bandaru"
            },
            "malayalam": {
                "aanu", "alla", "enthu", "engane", "evide",
                "ente", "ninte", "ningal", "avan", "aval", "nammal",
                "cheyyuka", "cheyyu", "cheythu", "aakum", "aayi",
                "nalla", "cheeththa", "valare", "koracchu", "ippo", "naale",
                # "para" was spelled with an upper-case R, which never matches
                # lower-cased input.
                "parayuka", "para", "poda", "vaa", "vannu"
            },
            "punjabi": {
                "hai", "nahi", "ki", "kive", "kithe",
                "mera", "tera", "tussi", "assi", "ohna",
                "karo", "karna", "kita", "hona", "hoya",
                "changa", "mara", "bahut", "thoda", "hun", "kal",
                "dassi", "dasso", "jao", "aao", "aaya"
            },
            "urdu": {
                "hai", "nahi", "kya", "kaise", "kahan",
                "mera", "tera", "aap", "tum", "hum", "unka",
                "karo", "karna", "kiya", "hoga", "hua",
                "acha", "bura", "bahut", "thora", "abhi", "kal",
                "bolo", "kahiye", "jao", "aayiye", "aaya"
            }
        }

        # Character n-grams specific to languages
        self.bigrams = {
            "tamil": {"ll", "nn", "tt", "pp", "kk", "th", "ng", "la"},
            "telugu": {"nd", "nt", "mp", "mb", "ll", "nn", "tt"},
            "bengali": {"ch", "sh", "kh", "gh", "bh", "dh"},
            "kannada": {"tt", "dd", "nn", "ll", "ge", "de"},
            "malayalam": {"kk", "tt", "pp", "ll", "nn", "th"},
        }

        self.alpha_only_pattern = re.compile(r"[^a-zA-Z ]")

    def detect(self, text: str) -> Tuple[Optional[str], float]:
        """Detect a romanized language and score the confidence.

        Args:
            text: Raw input text.

        Returns:
            ``("<Language> (Romanized)", confidence)`` with confidence in
            ``[0.3, 0.95]``, or ``(None, 0.0)`` when the text is shorter than
            20 characters, is less than 85% ASCII, contains no marker word,
            or scores below the 0.3 threshold.
        """
        if not text or len(text) < 20:
            return None, 0.0

        # Quick ASCII check: native-script text is left to other detectors.
        ascii_ratio = sum(1 for c in text if ord(c) < 128) / len(text)
        if ascii_ratio < 0.85:
            return None, 0.0

        lowered = text.lower()
        # Keep letters only and collapse whitespace so that both single and
        # multi-word markers can be matched as " marker " substrings.
        words_only = self.alpha_only_pattern.sub(" ", lowered)
        padded = " " + " ".join(words_only.split()) + " "
        density_base = max(len(text.split()) * 0.15, 1)

        # Confidence from marker density, capped at 1.0 per language.
        scores = {}
        for lang, markers in self.markers.items():
            hits = sum(1 for marker in markers if f" {marker} " in padded)
            if hits:
                scores[lang] = min(hits / density_base, 1.0)

        if not scores:
            return None, 0.0

        # Bigram analysis for South Indian languages: supporting evidence
        # only, worth at most 0.4 and never enough to introduce a language.
        text_bigrams = {
            a + b for a, b in zip(lowered, lowered[1:]) if (a + b).isalpha()
        }
        for lang, bigrams in self.bigrams.items():
            if lang not in scores:
                continue
            matches = len(text_bigrams & bigrams)
            if matches:
                scores[lang] += min(matches / 5, 0.4)

        # Ties resolve to the language listed first in self.markers.
        best_lang, best_score = max(scores.items(), key=lambda item: item[1])

        if best_score >= 0.3:  # Minimum threshold
            return f"{best_lang.capitalize()} (Romanized)", min(best_score, 0.95)

        return None, 0.0


class NLPOrchestrator:
    """Detect language, translate to English and analyse a text via Ollama.

    The pipeline (see ``process``) is: clean text, detect language,
    translate when the text is not already English, then request a
    structured analysis from the model. Results of language detection,
    translation and analysis are memoised per instance in ``_cache``, keyed
    by operation and an MD5 digest of the input text.
    """

    # Pre-compiled helpers. They are immutable, so they are shared at class
    # level; they remain reachable as instance attributes.
    unicode_cleanup = str.maketrans({
        "\u200c": "", "\u200d": "", "\ufeff": ""
    })
    space_pattern = re.compile(r"[ \t]+")
    newline_pattern = re.compile(r"\n{3,}")
    trailing_comma_pattern = re.compile(r",\s*([}\]])")
    json_block_pattern = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*```", re.IGNORECASE)
    json_object_pattern = re.compile(r"\{[\s\S]*\}")
    # Typographic quotes mapped to their ASCII equivalents. Written as
    # escapes so the mapping survives any re-encoding of this file.
    smart_quote_cleanup = str.maketrans({
        "\u201c": '"', "\u201d": '"', "\u2018": "'", "\u2019": "'"
    })

    # Lower bound for the translation token budget, so that short inputs
    # still leave room for the JSON wrapper around the translated text.
    MIN_TRANSLATION_TOKENS = 256

    def __init__(self, model_id="llama3.1:8b-instruct-q4_K_M"):
        """Create an orchestrator bound to the Ollama model ``model_id``."""
        print(f"Using Ollama model: {model_id}")
        self.model_id = model_id

        # Initialize romanization detector
        self.romanization_detector = MultiLanguageRomanizationDetector()

        # Language mapping (langdetect code -> display name)
        self.lang_map = {
            "en": "English", "hi": "Hindi", "te": "Telugu", "ta": "Tamil",
            "bn": "Bengali", "mr": "Marathi", "gu": "Gujarati", "kn": "Kannada",
            "ml": "Malayalam", "pa": "Punjabi", "ur": "Urdu", "or": "Odia",
            "as": "Assamese", "sa": "Sanskrit",
        }

        # Cache for results
        self._cache = {}

    def _get_cache_key(self, text: str, operation: str) -> str:
        """Generate cache key for text + operation"""
        return f"{operation}:{hashlib.md5(text.encode()).hexdigest()}"

    def _generate(self, system_prompt: str, user_input: str, max_tokens: int = 512) -> str:
        """Send one system + user exchange to the model and return its reply.

        Generation uses temperature 0.0 and is limited to ``max_tokens``
        predicted tokens. The reply text is returned stripped.
        """
        response = ollama.chat(
            model=self.model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
            ],
            options={
                "temperature": 0.0,
                "num_predict": max_tokens,
            },
        )

        text = response["message"]["content"].strip()
        gc.collect()
        return text

    def clean_text(self, text: str) -> str:
        """Normalise ``text`` for downstream processing.

        Applies NFKC normalisation, removes zero-width joiners and the BOM,
        collapses runs of spaces/tabs to one space and runs of three or more
        newlines to two, then strips the ends.
        """
        text = unicodedata.normalize("NFKC", text)
        text = text.translate(self.unicode_cleanup)
        text = self.space_pattern.sub(" ", text)
        text = self.newline_pattern.sub("\n\n", text)
        return text.strip()

    def _clean_json_string(self, json_str: str) -> str:
        """Repair common defects in model-produced JSON.

        Strips whitespace, drops trailing commas before ``}`` / ``]`` and
        replaces typographic quotes with ASCII quotes.
        """
        json_str = json_str.strip()
        json_str = self.trailing_comma_pattern.sub(r"\1", json_str)
        return json_str.translate(self.smart_quote_cleanup)

    def _try_load_json(self, json_str: str) -> Optional[Dict]:
        """Parse ``json_str`` into a dict, or return ``None``.

        The text is parsed as-is first; the repaired form is tried only when
        that fails, so valid JSON whose string values contain curly quotes or
        ", }" sequences is never altered. Non-object JSON (lists, scalars)
        yields ``None`` because callers rely on dict access.
        """
        if not json_str:
            return None

        for candidate in (json_str.strip(), self._clean_json_string(json_str)):
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            return parsed if isinstance(parsed, dict) else None
        return None

    def _parse_json(self, response: str) -> Optional[Dict]:
        """Extract a non-empty JSON object from a model response.

        A fenced code block is tried first, then the widest ``{...}`` span in
        the response. Returns ``None`` when neither yields a non-empty dict.
        """
        if not response:
            return None

        # Try code block
        match = self.json_block_pattern.search(response)
        if match:
            parsed = self._try_load_json(match.group(1))
            if parsed:
                return parsed

        # Try direct object
        match = self.json_object_pattern.search(response)
        if match:
            parsed = self._try_load_json(match.group(0))
            if parsed:
                return parsed

        return None

    def detect_language(self, text: str) -> Dict:
        """Detect the language of ``text``, preferring romanization markers.

        Returns:
            A dict with ``lang_list``, ``primary_lang``, ``confidence`` and
            ``detection_method`` ("romanization_markers", "langdetect" or
            "fallback"). Successful detections are cached; the "fallback"
            result produced when detection raises is not.
        """
        cache_key = self._get_cache_key(text, "lang_detect")
        if cache_key in self._cache:
            return self._cache[cache_key]

        try:
            # Check for romanized languages first
            romanized_lang, rom_confidence = self.romanization_detector.detect(text)

            if romanized_lang and rom_confidence >= 0.3:
                result = {
                    "lang_list": [romanized_lang],
                    "primary_lang": romanized_lang,
                    "confidence": round(rom_confidence, 3),
                    "detection_method": "romanization_markers"
                }
                self._cache[cache_key] = result
                return result

            # Fallback to langdetect
            langs = detect_langs(text)
            primary = langs[0].lang
            lang_list = [self.lang_map.get(l.lang, l.lang.upper()) for l in langs[:3]]
            primary_lang = self.lang_map.get(primary, primary.upper())

            result = {
                "lang_list": lang_list,
                "primary_lang": primary_lang,
                "confidence": round(langs[0].prob, 3),
                "detection_method": "langdetect"
            }
            self._cache[cache_key] = result
            return result

        except Exception:
            # Best effort: detection problems must not abort the pipeline.
            return {
                "lang_list": ["Unknown"],
                "primary_lang": "Unknown",
                "confidence": 0.0,
                "detection_method": "fallback"
            }

    def _calculate_english_ratio(self, text: str) -> float:
        """Return the share of sampled sentences detected as English.

        The first 10 period-separated fragments are sampled; fragments
        shorter than 10 characters are ignored and do not count towards the
        denominator. A fragment is English when langdetect's top guess is
        "en" with probability above 0.8; a fragment on which detection fails
        counts as non-English. Returns 0.0 when nothing could be sampled.
        """
        sampled = 0
        english_count = 0

        for sent in text.split('.')[:10]:  # Sample first 10 sentences
            sent = sent.strip()
            if len(sent) < 10:
                continue
            sampled += 1
            try:
                langs = detect_langs(sent)
            except Exception:
                continue
            if langs and langs[0].lang == 'en' and langs[0].prob > 0.8:
                english_count += 1

        return english_count / sampled if sampled else 0.0

    def step_1_translate(self, text: str, lang_info: Dict) -> Dict:
        """Translate ``text`` to English unless it already is English.

        Args:
            text: Cleaned input text.
            lang_info: Result of ``detect_language``; ``primary_lang`` is read.

        Returns:
            A dict that always contains ``translated_text``,
            ``translation_confidence`` and ``translation_method``. When the
            model reply holds no usable ``translated_text`` the raw reply is
            returned with method "fallback_raw_response".
        """
        primary = lang_info["primary_lang"].lower()

        # Skip translation if already English
        if "english" in primary:
            return {
                "translated_text": text,
                "translation_confidence": 1.0,
                "translation_method": "skipped_native_english"
            }

        cache_key = self._get_cache_key(text, "translate")
        if cache_key in self._cache:
            return self._cache[cache_key]

        # Check if mostly English
        english_ratio = self._calculate_english_ratio(text)
        if english_ratio > 0.8:
            return {
                "translated_text": text,
                "translation_confidence": 0.95,
                "translation_method": "skipped_high_english_ratio"
            }

        # Compact prompt with "what not to do" approach
        system_prompt = """Translate to English. Return JSON only.

DON'T:
- Add explanations or preambles
- Use markdown formatting
- Translate proper nouns (names, places)
- Add content not in original
- Change meaning or tone

Format:
{"translated_text": "...", "confidence": 0.0}"""

        max_tokens = max(self.MIN_TRANSLATION_TOKENS, len(text.split()) * 3)
        response = self._generate(system_prompt, text, max_tokens=max_tokens)
        result = self._parse_json(response)

        if result is not None:
            # Handle nested JSON: the model sometimes wraps the whole answer
            # inside the "translated_text" string.
            trans_value = result.get("translated_text")
            if isinstance(trans_value, str) and trans_value.strip().startswith("{"):
                nested = self._try_load_json(trans_value)
                if nested and "translated_text" in nested:
                    result = nested

        translated = result.get("translated_text") if result else None
        if not isinstance(translated, str) or not translated.strip():
            result = {
                "translated_text": response,
                "translation_confidence": 0.5,
                "translation_method": "fallback_raw_response"
            }
        else:
            result["translation_method"] = "llm_translation"
            result["translation_confidence"] = result.pop("confidence", 0.85)

        self._cache[cache_key] = result
        return result

    @staticmethod
    def _fallback_analysis() -> Dict:
        """Return the analysis reported when the model reply is unparseable."""
        return {
            "domain_ident": ["General"],
            "domain_confidence": 0.3,
            "sentiment": "Neutral",
            "sentiment_confidence": 0.3,
            "NER": {
                "Person": [], "Location": [], "Organisation": [],
                "Event": [], "Product": [],
            },
            "ner_confidence": 0.0,
            "Event_calendar": [],
            "Country_iden": "Unknown",
            "country_confidence": 0.0,
            "Fact_checker": {
                "relevant_topics": [],
                "confidence_level": 0.0,
                "relevance_rating": "Low",
            },
            "Summary": "Analysis failed - JSON parsing error.",
        }

    def _fill_analysis_defaults(self, analysis: Dict) -> Dict:
        """Fill keys missing from a parsed analysis so ``process`` can read it.

        Values the model supplied are kept. Missing (or null) keys receive
        neutral defaults with zero confidence and an empty summary; the
        nested ``NER`` and ``Fact_checker`` objects are completed key by key.
        ``analysis`` is updated in place and returned.
        """
        defaults = self._fallback_analysis()
        defaults.update(domain_confidence=0.0, sentiment_confidence=0.0, Summary="")

        for key, default in defaults.items():
            value = analysis.get(key)
            if isinstance(default, dict):
                if isinstance(value, dict):
                    for sub_key, sub_default in default.items():
                        value.setdefault(sub_key, sub_default)
                else:
                    analysis[key] = default
            elif value is None:
                analysis[key] = default
        return analysis

    def step_2_deep_analysis(self, english_text: str) -> Dict:
        """Request a structured analysis of ``english_text`` from the model.

        Returns:
            A dict following the schema in the prompt. Every key read by
            ``process`` is present: an unparseable reply yields the fallback
            analysis, and a partial reply is completed with defaults.
        """
        # Compact prompt with "what not to do" approach
        system_prompt = """Analyze text. Return JSON only.

DON'T:
- Invent facts not in text
- Use multiple values for Country_iden (pick ONE)
- Add Event_calendar dates without explicit mentions (dd/mm/yyyy format required)
- Mark sentiment as "Anti-National" unless direct threats to India
- Use domain names not in list below
- Include markdown, preambles, or explanations
- Leave confidence scores at 0.0 (estimate realistically)

Domains: Politics, Crime, Military, Terrorism, Radicalisation, Extremism in J&K, Law and Order, Narcotics, Left Wing Extremism, General

Format:
{
  "domain_ident": [],
  "domain_confidence": 0.0,
  "sentiment": "",
  "sentiment_confidence": 0.0,
  "NER": {
    "Person": [],
    "Location": [],
    "Organisation": [],
    "Event": [],
    "Product": []
  },
  "ner_confidence": 0.0,
  "Event_calendar": [],
  "Country_iden": "",
  "country_confidence": 0.0,
  "Fact_checker": {
    "relevant_topics": [],
    "confidence_level": 0.0,
    "relevance_rating": ""
  },
  "Summary": ""
}"""

        cache_key = self._get_cache_key(english_text, "analysis")
        if cache_key in self._cache:
            return self._cache[cache_key]

        # Dynamic token allocation based on text length
        text_tokens = len(english_text.split())
        max_tokens = min(400 + (text_tokens // 2), 800)

        response = self._generate(system_prompt, english_text, max_tokens=max_tokens)
        parsed = self._parse_json(response)

        if parsed is None:
            result = self._fallback_analysis()
        else:
            result = self._fill_analysis_defaults(parsed)

        self._cache[cache_key] = result
        return result

    def process(self, text: str) -> Dict:
        """Run the full pipeline on ``text`` and return the combined report.

        Returns:
            A dict with ``metadata`` (language detection and translation
            details), ``content`` (cleaned original and translated text) and
            ``analysis`` (domain, sentiment, entities, geography, events,
            fact check and summary).
        """
        cleaned = self.clean_text(text)

        print("🔍 Detecting language...")
        lang_info = self.detect_language(cleaned)

        print("📝 Translation...")
        translation = self.step_1_translate(cleaned, lang_info)

        print("🔬 Deep analysis...")
        analysis = self.step_2_deep_analysis(translation["translated_text"])

        # Compile final output
        return {
            "metadata": {
                "language_detection": {
                    "primary_lang": lang_info["primary_lang"],
                    "lang_list": lang_info["lang_list"],
                    "confidence": lang_info["confidence"],
                    "method": lang_info["detection_method"]
                },
                "translation": {
                    "confidence": translation["translation_confidence"],
                    "method": translation["translation_method"]
                }
            },
            "content": {
                "original": cleaned,
                "translated": translation["translated_text"]
            },
            "analysis": {
                "domain": {
                    "categories": analysis["domain_ident"],
                    "confidence": analysis.get("domain_confidence", 0.0)
                },
                "sentiment": {
                    "label": analysis["sentiment"],
                    "confidence": analysis.get("sentiment_confidence", 0.0)
                },
                "entities": {
                    "NER": analysis["NER"],
                    "confidence": analysis.get("ner_confidence", 0.0)
                },
                "geography": {
                    "country": analysis["Country_iden"],
                    "confidence": analysis.get("country_confidence", 0.0)
                },
                "events": analysis["Event_calendar"],
                "fact_check": analysis["Fact_checker"],
                "summary": analysis["Summary"]
            }
        }


# Usage
if __name__ == "__main__":
    # Gujarati sample input (a cricket match report). The literal had been
    # stored mis-encoded (UTF-8 bytes shown as CP866 characters); it is
    # restored to Gujarati script so the demo exercises a real non-English
    # text.
    content = '''ભારતીય ટીમે ન્યૂઝીલેન્ડ સામે સતત બીજી T20I જીતીને શ્રેણીમાં 2-0ની લીડ મેળવી છે. રાયપુરમાં રમાયેલી બીજી મેચમાં, ટીમ ઈન્ડિયાએ માત્ર 92 બોલમાં 209 રનનો લક્ષ્યાંક હાંસલ કર્યો. આ જીતમાં ભારતીય ટીમ માટે કેપ્ટન સૂર્યકુમાર યાદવે સૌથી વધુ રન બનાવ્યા, 37 બોલમાં 82 રન બનાવ્યા. જોકે, જીતનો વાસ્તવિક પાયો ઈશાન કિશન દ્વારા નાખવામાં આવ્યો હતો, તેણે મુશ્કેલ પરિસ્થિતિઓમાં જે રીતે પરફોર્મ કરી અડધી સદી બનાવી તે જોઈને બધા જ ખુશ થઈ ગયા. જોકે, આ ઇનિંગ દરમિયાન, સૂર્યકુમાર ઈશાન પર ગુસ્સે થઈ ગયા હતા, અને તેમણે મેચ પછી આ વાતનો ખુલાસો કર્યો.'''

    extractor = NLPOrchestrator()

    print("=" * 60)
    print("NLP ORCHESTRATION PIPELINE")
    print("=" * 60)

    result = extractor.process(content)

    print("\n" + "=" * 60)
    print("FINAL OUTPUT")
    print("=" * 60)
    # ensure_ascii=False keeps the Gujarati text readable in the report.
    print(json.dumps(result, indent=2, ensure_ascii=False))