In [None]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Pin langdetect's seed so the same text yields the same detected language on
# every run (the library's detection is otherwise non-deterministic).
DetectorFactory.seed = 0

# Maps the language codes reported by langdetect to the NLLB language tags
# used by the translation model.
lang_dect = {
    "ar": "arb_Arab",
    "en": "eng_Latn",
    "ru": "rus_Cyrl",
    # langdetect reports Chinese as "zh-cn" / "zh-tw".
    "zh-cn": "zho_Hans",
    "zh-tw": "zho_Hant",
    "fr": "fra_Latn",
}


def detect_language(text):
    """Detect the language of ``text`` and return its NLLB language tag.

    Args:
        text: The string whose language should be identified.

    Returns:
        The NLLB tag from ``lang_dect`` for the detected language, or the
        sentinel string ``"Could not detect language"`` when langdetect raises
        ``LangDetectException`` for the input.

    Raises:
        KeyError: If a language was detected but has no entry in ``lang_dect``.
            The message names the detected code and the supported codes.
    """
    try:
        language = detect(text)
    except LangDetectException:
        return "Could not detect language"

    try:
        return lang_dect[language]
    except KeyError:
        supported = ", ".join(sorted(lang_dect))
        # Keep the KeyError type, but say which code was detected.
        raise KeyError(
            f"Detected language '{language}' is not supported (supported codes: {supported})"
        ) from None
    

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

# Load the NLLB translation checkpoint used to bring the post into English.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Language names offered to the user, mapped to NLLB language tags.
lang_tgt = {
    "arabic": "arb_Arab",
    "english": "eng_Latn",
    "russian": "rus_Cyrl",
    "mandarin": "zho_Hans",
    "french": "fra_Latn"
}

post = input("Enter the text to be summarized:")
src_lang = detect_language(post)
# detect_language returns this sentinel instead of raising when detection
# fails; stop here rather than feeding it to the translator as a language tag.
if src_lang == "Could not detect language":
    raise ValueError("Could not detect the language of the input text, so it cannot be translated or summarized.")

tgt_lang = input("Enter a language to have the summary in (english, arabic, russian, mandarin, french):")
tgt_choice = tgt_lang.strip().lower()
if tgt_choice not in lang_tgt:
    raise ValueError(
        f"Unsupported target language '{tgt_lang}'. Choose from: {', '.join(lang_tgt)}."
    )
tgt_lang = lang_tgt[tgt_choice]

if src_lang != 'eng_Latn':
    # The source language is configured on the tokenizer itself; it is not a
    # documented argument of the tokenizer call.
    tokenizer.src_lang = src_lang
    inputs = tokenizer(post, return_tensors="pt")
    # Look up the target language token id through the vocabulary.
    generated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids('eng_Latn'))
    post = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    # `post` is English from here on; leave the tokenizer tagged accordingly so
    # later use of it does not inherit the original source language.
    tokenizer.src_lang = 'eng_Latn'

In [None]:
# Load the fine-tuned summarization checkpoint from the local directory.
# It gets its own names so the translation `tokenizer` / `model` globals are
# not overwritten and stay usable by later cells.
model_path = "./bart-large-final"
summarizer_tokenizer = AutoTokenizer.from_pretrained(model_path)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Truncate the input to 512 tokens and cap the generated summary at 64 tokens,
# decoding with a 4-beam search.
inputs = summarizer_tokenizer(post, return_tensors="pt", max_length=512, truncation=True)
summary_ids = summarizer_model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)
summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
from better_profanity import profanity
import re

# Load the word list into the shared `profanity` object; custom_censor reads it
# back through profanity.CENSOR_WORDSET. No custom list is passed, so this is
# presumably the library's default list.
profanity.load_censor_words()

def custom_censor(text, level="full"):
    """Mask profane words in ``text`` according to ``level``.

    Words are taken from ``profanity.CENSOR_WORDSET`` and matched
    case-insensitively on word boundaries.

    Args:
        text: The string to censor.
        level: Case-insensitive censorship level:
            ``"none"`` returns the text unchanged,
            ``"partial"`` keeps the first and last character of each match and
            stars the rest (matches of one or two characters are fully starred),
            ``"full"`` replaces every character of each match with ``*``.

    Returns:
        The censored text. Each occurrence is masked from its own characters,
        so its original casing and length are preserved.

    Raises:
        ValueError: If ``level`` is not one of the three supported values. This
            is checked up front, whether or not the text contains a match.
    """
    mode = level.lower()
    if mode not in ("none", "partial", "full"):
        raise ValueError("Invalid censorship level. Choose from 'none', 'partial', or 'full'.")
    # Nothing to do: skip scanning the word list entirely.
    if mode == "none" or not text:
        return text

    def _mask(match):
        # Build the replacement from this specific occurrence.
        found = match.group(0)
        if mode == "partial" and len(found) > 2:
            return found[0] + "*" * (len(found) - 2) + found[-1]
        return "*" * len(found)

    censored_text = text
    for word in profanity.CENSOR_WORDSET:
        # str() is a no-op for plain strings; entries may be wrapper objects in
        # some better_profanity versions (TODO confirm), which re.escape rejects.
        pattern = re.compile(rf"\b{re.escape(str(word))}\b", re.IGNORECASE)
        # A callable replacement masks every occurrence in one pass and is
        # never interpreted as a regex replacement template.
        censored_text = pattern.sub(_mask, censored_text)

    return censored_text

# Strip stray whitespace from the answer and fall back to the strictest level
# when nothing is entered, so a padded or empty reply does not make
# custom_censor raise on an otherwise valid choice.
level = input("Enter your level of censorship (full, partial, none):").strip() or "full"

summary = custom_censor(summary, level)

In [None]:
if tgt_lang != 'eng_Latn':
    # Use a dedicated NLLB tokenizer/model here rather than whatever the
    # `tokenizer` / `model` globals currently point to, since those may have
    # been rebound by the summarization step. The summary is English, so the
    # tokenizer is created with that source language.
    translator_tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang='eng_Latn')
    translator_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    inputs = translator_tokenizer(summary, return_tensors="pt")
    # Force generation to start with the target language token, looked up
    # through the vocabulary.
    generated_tokens = translator_model.generate(
        **inputs,
        forced_bos_token_id=translator_tokenizer.convert_tokens_to_ids(tgt_lang),
    )
    summary = translator_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [None]:
# Show the final summary: censored at the chosen level and, when a non-English
# target language was selected, translated into it.
print("Summary: ", summary)