# Luxembourgish language translation

Author: Lujun LI

This is a demo project for Luxembourgish language translation.

We mainly use LangChain and open-source Hugging Face models.

1. Language classification, especially detecting Luxembourgish
2. Translate Luxembourgish to English
3. Translate non-Luxembourgish sentences to English
4. Sentiment analysis for the pipeline


Objective: low cost and high accuracy

**Start from loading data**

In [27]:
import pandas as pd

# Input location, the column holding the sentences, and the batch size that
# the later inference cells reuse.
folder_name = "data/"
file_name = "Machine Translation - Luxembourguish_list - Machine Translation - Luxembourguish_list.csv"
sheet_name = ""
column_name = "Sentence"
batch_size = 1000

data = pd.read_csv(f"{folder_name}{file_name}")

# Language code -> language name, plus the inverse lookup (name -> code).
language_map = dict(
    de="German",
    nl="Dutch",
    it="Italian",
    pt="Portuguese",
    ur="Urdu",
    hi="Hindi",
    fr="French",
    lb="Luxembourgish",
    en="English",
)
language_map_reverse = dict(zip(language_map.values(), language_map.keys()))

# Store the code of each row's "Language" name in "true_flag"; names missing
# from the mapping become NaN (pandas `Series.map` semantics).
data["true_flag"] = data["Language"].map(language_map_reverse)

## Classification

Choose the model and test it.

In [28]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Sentences to classify, coerced to plain Python strings.
texts = data[column_name].astype(str).to_list()

# Language-detection checkpoint: tokenizer and classifier come from the same id.
model_ckpt = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

# Run the classifier batch by batch and keep one softmax distribution per
# sentence; gradients are never needed for inference.
per_batch_probs = []
with torch.no_grad():
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start : start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        batch_logits = model(**encoded).logits
        per_batch_probs.append(torch.softmax(batch_logits, dim=-1))

# Stack the per-batch results along the sentence axis.
all_preds = torch.cat(per_batch_probs, dim=0)

In [29]:
# For every sentence keep the highest-scoring class and its probability.
id2lang = model.config.id2label
vals, idxs = all_preds.max(dim=1)
labels = [id2lang[class_id] for class_id in idxs.tolist()]
pbs = vals.tolist()

# Append the classifier's label and probability as two new columns of `data`.
label_series = pd.Series(labels, name="label_roberta")
pb_series = pd.Series(pbs, name="probability_roberta")
data = pd.concat([data, label_series, pb_series], axis=1)

In [30]:
# Import langid's bundled model string under an alias: importing it as plain
# `model` would overwrite the notebook-level `model` variable that the
# classification cells read, breaking them if they are re-run afterwards.
from langid.langid import LanguageIdentifier, model as langid_model

# Second language-identification pass with langid, restricted to the five
# candidate languages below.
identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True)
identifier.set_languages(["lb", "en", "nl", "fr", "de"])

# classify() yields a (language code, probability) pair for each sentence.
labels = []
pbs = []
for text in texts:
    lang, prob = identifier.classify(text)
    labels.append(lang)
    pbs.append(prob)

# Append langid's label and probability as two new columns of `data`.
label_series = pd.Series(labels, name="label_langid")
pb_series = pd.Series(pbs, name="probability_langid")
data = pd.concat([data, label_series, pb_series], axis=1)

## Translation

We use the pretrained model directly, without any additional training, to translate from German and from Luxembourgish to English.

In [31]:
from transformers import MarianMTModel, MarianTokenizer

# Translation checkpoint. Loading it rebinds `model` and `tokenizer`, which
# until now referred to the language-detection classifier.
model_name = "etamin/Letz-Translate-OPUS-LB-EN"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)



In [32]:
# Translate every sentence the RoBERTa classifier did not label as English or
# French. The row filter is evaluated once and shared by the text list and
# the index, so the two are guaranteed to line up.
needs_translation = ~data["label_roberta"].isin(["en", "fr"])
indices = data[needs_translation].index
input_texts = data.loc[needs_translation, column_name].astype(str).to_list()

translated_texts = []

for i in range(0, len(input_texts), batch_size):
    batch_texts = input_texts[i : i + batch_size]

    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
    # Beam search with 4 beams; each output sequence is capped at max_length=50.
    outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)

    # Decode the whole batch in one call instead of one sequence at a time.
    batch_translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    translated_texts.extend(batch_translations)


# Write the translations back onto their original rows; rows that were not
# translated keep None in "translated_sentence".
translated_series = pd.Series(translated_texts, index=indices)
data["translated_sentence"] = None
data.loc[translated_series.index, "translated_sentence"] = translated_series

Non-commercial license

In [33]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# # Mistral Llama3
# model_name = "facebook/nllb-200-distilled-1.3B"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# def batch_translate(
#     texts, source_lang="fra_Latn", target_lang="eng_Latn", batch_size=500
# ):
#     translations = []
#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i : i + batch_size]
#         inputs = tokenizer(
#             batch_texts, return_tensors="pt", padding=True, truncation=True
#         )
#         translated_tokens = model.generate(
#             **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
#         )
#         translated_texts = tokenizer.batch_decode(
#             translated_tokens, skip_special_tokens=True
#         )
#         translations.extend(translated_texts)
#     return translations


# french_texts = (
#     data[data["translated_sentence"].isna()][column_name].astype(str).to_list()
# )
# indices = data[data["translated_sentence"].isna()].index
# english_translations = batch_translate(french_texts, batch_size=batch_size)
# translated_series = pd.Series(english_translations, index=indices)
# data.loc[translated_series.index, "translated_sentence"] = translated_series

In [37]:
# Export the enriched table (without the index column) to the working directory.
sheet_name = ""
# NOTE(review): file_name already ends in ".csv", so the resulting name has the
# form "output_<...>.csv_.csv" — confirm whether that is intended.
output_path = f"output_{file_name}_{sheet_name}.csv"
data.to_csv(output_path, index=False)

# Sentiment Analysis

In [50]:
from langchain import PromptTemplate
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Prompt pieces: a system instruction, a user question, and a chat-style
# template (written out with explicit header / end-of-turn markers) that
# joins them.
instruction = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."""
question = "hello, I am lujun"
template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{instruction}

<|eot_id|><|start_header_id|>user<|end_header_id|>
{question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Fill the template placeholders to obtain the final prompt string.
prompt_template = PromptTemplate.from_template(template)
prompt = prompt_template.format(question=question, instruction=instruction)

# Text-generation pipeline for the chat model, loaded in bfloat16 with
# automatic device placement.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device_map="auto",
)

# NOTE(review): in Hugging Face generation `max_length` is usually a bound on
# prompt + generated tokens; `max_new_tokens` may be what is wanted — confirm.
generation_outputs = pipeline(prompt, max_length=250)
generated_text = generation_outputs[0]["generated_text"]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
