In [None]:
!pip install transformers torch langdetect googletrans==4.0.0-rc1 indic-transliteration

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting indic-transliteration
  Downloading indic_transliteration-2.3.69-py3-none-any.whl.metadata (1.4 kB)
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.me

In [None]:
import re

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from langdetect import detect, DetectorFactory
from googletrans import Translator
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate


In [None]:
# Pin langdetect's seed so that detect() gives consistent results for the same
# input. This must run before the first detect() call in the notebook.
DetectorFactory.seed = 0


In [None]:
# Pretrained multi-label toxicity classifier, fetched from the Hugging Face Hub.
MODEL_NAME = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Display names for the model's output logits, listed in logit order.
# The checkpoint's heads are ordered: toxic, severe_toxic, obscene, threat,
# insult, identity_hate (check with `model.config.id2label`), so "threat" must
# sit at index 3 and "identity_attack" at index 5; swapping them would report
# each score under the other's name.
TOXIC_LABELS = ["toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack"]
assert len(TOXIC_LABELS) == model.config.num_labels, "TOXIC_LABELS does not match the model's output size"

# Shared googletrans client used by translate_to_english().
translator = Translator()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
def is_probably_tamil_roman(text):
    """Heuristically detect romanised Tamil ("Tunglish") text.

    The text is lower-cased and split into runs of ASCII letters; each token is
    then compared against a small list of common romanised Tamil words.
    Whole-word matching is used instead of substring matching so that English
    words which merely contain a keyword (e.g. "village" -> "illa",
    "deity" -> "dei", "antenna" -> "enna") are not misclassified.

    Args:
        text: The raw comment to inspect.

    Returns:
        True if at least one whole word of ``text`` is a known keyword,
        False otherwise (including for empty input).
    """
    tamil_roman_keywords = {
        'enna', 'illa', 'podu', 'poda', 'dei', 'machan',
        'madiri', 'romba', 'unga', 'evlo', 'aama',
    }
    # Tokenise on anything that is not a-z so punctuation does not hide a word.
    tokens = re.findall(r"[a-z]+", text.lower())
    return any(token in tamil_roman_keywords for token in tokens)


In [None]:
def translate_to_english(text):
    """Return an English rendering of ``text`` and a tag for its source language.

    Routing, in order:
      1. Text flagged by ``is_probably_tamil_roman`` is transliterated from
         ITRANS to Tamil script, then translated from Tamil to English.
      2. Inputs of at most four words consisting only of ASCII letters and
         whitespace are passed through as English even when the detector
         reports another language (or fails).
      3. Otherwise the detected language decides: English passes through,
         any other detected language is translated to English, and
         undetectable input is returned unchanged.

    Args:
        text: Raw user comment.

    Returns:
        A ``(english_text, language_tag)`` tuple where ``language_tag`` is
        ``'tunglish'``, ``'en'``, the detected language code, or ``'unknown'``.

    Raises:
        Whatever ``translator.translate`` raises; translation errors are not
        suppressed here.
    """
    # Step 1: Check for Tunglish
    if is_probably_tamil_roman(text):
        print("🔤 Detected as Tunglish")
        tamil_script = transliterate(text, sanscript.ITRANS, sanscript.TAMIL)
        print(f"📝 Transliterated to Tamil: {tamil_script}")
        translated = translator.translate(tamil_script, src='ta', dest='en')
        return translated.text, 'tunglish'

    # Language detection is only needed for the remaining routes, so it runs
    # after the Tunglish branch. Catch Exception (not a bare except) so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        detected_lang = detect(text)
    except Exception:
        detected_lang = "unknown"

    # Step 2: Force English if short and likely valid.
    # str.isalpha() is true for any Unicode letter, so the check is restricted
    # to ASCII letters; otherwise short non-Latin or accented text would be
    # tagged as English and never translated.
    is_short = len(text.split()) <= 4
    is_plain_ascii = all(
        (char.isascii() and char.isalpha()) or char.isspace() for char in text
    )
    if detected_lang != "en" and is_short and is_plain_ascii:
        print("🔤 Detected as English (forced due to short clean sentence)")
        return text, 'en'

    # Step 3: Regular route
    if detected_lang == "en":
        return text, 'en'
    elif detected_lang != "unknown":
        translated = translator.translate(text, src=detected_lang, dest='en')
        return translated.text, detected_lang
    else:
        return text, 'unknown'



In [None]:
def predict_toxicity(text, threshold=0.5):
    """Score a comment with the toxicity model and decide whether to block it.

    The comment is first passed through ``translate_to_english``; the returned
    text is tokenised, run through ``model`` without gradient tracking, and the
    logits of the first (only) sequence are squashed with a sigmoid.

    Args:
        text: Raw user comment.
        threshold: A comment is blocked when any single score is greater than
            or equal to this value.

    Returns:
        A dict with ``"original_language"``, ``"translated_text"``, one entry
        per name in ``TOXIC_LABELS`` (score rounded to 4 decimals) and
        ``"final_decision"`` set to ``"Blocked"`` or ``"Allowed"``.
    """
    eng_text, lang = translate_to_english(text)

    encoded = tokenizer(eng_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**encoded).logits
        scores = torch.sigmoid(logits)[0].tolist()

    labelled_scores = list(zip(TOXIC_LABELS, scores))

    result = {"original_language": lang, "translated_text": eng_text}
    result.update({name: round(value, 4) for name, value in labelled_scores})

    # The decision uses the unrounded scores.
    flagged = any(value >= threshold for _, value in labelled_scores)
    result["final_decision"] = "Blocked" if flagged else "Allowed"
    return result


In [None]:
# Read one comment interactively and score it.
user_comment = input("🗣️ Enter your comment (English, Tamil or Tunglish): ")
output = predict_toxicity(user_comment)

# Build the full report first, then emit it with a single print call.
report = [
    f"\n🌐 Original Language: {output['original_language']}",
    f"🔁 Translated to English: {output['translated_text']}",
    "📊 Toxicity Scores:",
]
for label in TOXIC_LABELS:
    report.append(f"  {label}: {output[label]}")
report.append(f"\n🚦 Final Result: {output['final_decision']}")

print("\n".join(report))


🗣️ Enter your comment (English, Tamil or Tunglish): i hate it
🔤 Detected as English (forced due to short clean sentence)

🌐 Original Language: en
🔁 Translated to English: i hate it
📊 Toxicity Scores:
  toxicity: 0.4038
  severe_toxicity: 0.0007
  obscene: 0.0046
  identity_attack: 0.0029
  insult: 0.0029
  threat: 0.0024

🚦 Final Result: Allowed
