# Luxembourgish language translation

Author: Lujun LI

This is a demo project for Luxembourgish language translation.

We mainly use LangChain and open-source Hugging Face models.

1. Classify the language of each sentence, with a focus on Luxembourgish
2. Translate Luxembourgish to English
3. Translate non-Luxembourgish sentences to English
4. Sentiment analysis for the pipeline


Objective: low cost and high accuracy

**Start from loading data**

In [43]:
import pandas as pd

# Load the labelled sentences; the "Language" column holds full language names.
data = pd.read_csv(
    "data/Machine Translation - Luxembourguish_list - Machine Translation - Luxembourguish_list.csv"
)

# Language code -> language name, as used by the classifiers' label sets.
language_map = {
    "de": "German",
    "nl": "Dutch",
    "it": "Italian",
    "pt": "Portuguese",
    "ur": "Urdu",
    "hi": "Hindi",
    "fr": "French",
    "lb": "Luxembourgish",
    "en": "English",
}

# Language name -> language code, used to turn the ground-truth names into codes
# that can be compared directly with the predicted labels.
language_map_reverse = dict(zip(language_map.values(), language_map.keys()))
data["true_flag"] = data["Language"].map(language_map_reverse)

## Classification

Choose the model and test it

In [44]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# All sentences are classified together in one padded batch.
texts = data["Sentence"].tolist()

model_ckpt = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Inference only: skip gradient tracking for the forward pass, then turn the
# logits into per-class probabilities along the last dimension.
with torch.no_grad():
    logits = model(**inputs).logits
    preds = logits.softmax(dim=-1)

# Map raw predictions to languages

In [45]:
# Keep the highest-probability class per sentence and translate its numeric id
# into the label stored in the model configuration.
id2lang = model.config.id2label
vals, idxs = preds.max(dim=1)
labels = [id2lang[class_id] for class_id in idxs.tolist()]
pbs = vals.tolist()

# Append the predicted label and its probability as two new columns.
label_series = pd.Series(labels, name="label_roberta")
pb_series = pd.Series(pbs, name="probability_roberta")
data = pd.concat([data, label_series, pb_series], axis=1)

In [46]:
# Import langid's bundled model string under an alias: importing it as a bare
# `model` would rebind the notebook-level `model` that holds the XLM-RoBERTa
# classifier, so re-running the earlier prediction cell would fail on
# `model.config`.
from langid.langid import LanguageIdentifier, model as langid_model

# norm_probs=True asks langid for a normalised probability instead of its raw
# score (see the langid README).
identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True)
# Restrict the candidate languages langid may answer with.
identifier.set_languages(["lb", "en", "nl", "fr", "de"])

labels = []
pbs = []
for text in texts:
    # classify() returns a (language code, probability) pair.
    lang, prob = identifier.classify(text)
    labels.append(lang)
    pbs.append(prob)

# Append the predicted label and its probability as two new columns.
label_series = pd.Series(labels, name="label_langid")
pb_series = pd.Series(pbs, name="probability_langid")
data = pd.concat([data, label_series, pb_series], axis=1)

In [57]:
# Accuracy of each classifier, over the whole dataset and over the
# Luxembourgish rows only. Each entry is (printed prefix, rows, label column).
data_lux = data[data["Language"] == "Luxembourgish"]
reports = (
    ("Accuracy_all_langid on all languages: ", data, "label_langid"),
    ("Accuracy_all_langid on luxembourgish: ", data_lux, "label_langid"),
    ("Accuracy_all_roberta on all languages: ", data, "label_roberta"),
    ("Accuracy_all_roberta on luxembourgish: ", data_lux, "label_roberta"),
)
for prefix, frame, label_column in reports:
    # Fraction of rows whose predicted code equals the ground-truth code.
    correct = (frame["true_flag"] == frame[label_column]).sum()
    total = len(frame)
    accuracy = correct / total
    print(prefix + str(accuracy))

Accuracy_all_langid on all languages: 0.825
Accuracy_all_langid on luxembourgish: 0.35
Accuracy_all_roberta on all languages: 0.75
Accuracy_all_roberta on luxembourgish: 0.0


In [62]:
# Per-language accuracy of langid: mark each row 1 (match) or 0 (mismatch),
# then average the marks within each ground-truth language code.
is_hit = data["true_flag"] == data["label_langid"]
data["correct_prediction"] = is_hit.astype(int)
accuracy_df = (
    data.groupby("true_flag")["correct_prediction"]
    .mean()
    .reset_index()
)
accuracy_df

Unnamed: 0,true_flag,correct_prediction
0,de,0.95
1,en,1.0
2,fr,1.0
3,lb,0.35


In [63]:
# Per-language accuracy of XLM-RoBERTa: mark each row 1 (match) or 0
# (mismatch), then average the marks within each ground-truth language code.
# This overwrites any existing "correct_prediction" column.
is_hit = data["true_flag"] == data["label_roberta"]
data["correct_prediction"] = is_hit.astype(int)
accuracy_df = (
    data.groupby("true_flag")["correct_prediction"]
    .mean()
    .reset_index()
)
accuracy_df

Unnamed: 0,true_flag,correct_prediction
0,de,1.0
1,en,1.0
2,fr,1.0
3,lb,0.0


## Translation

Without any additional training, we use the model directly to translate from German to English and from Luxembourgish to English.

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint identifier passed to both from_pretrained() calls below
# (presumably a Luxembourgish-to-English model, judging by its name).
model_name = "etamin/Letz-Translate-OPUS-LB-EN"
# NOTE: `tokenizer` and `model` are rebound here; from this point on they refer
# to the Marian translation tokenizer/model, not the language-detection ones.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [81]:
# Select every row that XLM-RoBERTa did not label as English or French; those
# sentences are the ones passed to the translation model.
needs_translation = ~data["label_roberta"].isin(["en", "fr"])
indices = data[needs_translation].index
input_texts = data.loc[needs_translation, "Sentence"].to_list()

inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)

# Beam search with 4 beams; generated sequences are capped at max_length=50.
outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)

translated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Rows that were not translated keep None; the translations are indexed by
# their source row so the assignment lines up with the original DataFrame.
data["translated_sentence"] = None
translated_series = pd.Series(translated_texts, index=indices)
data.loc[translated_series.index, "translated_sentence"] = translated_series

In [80]:
# Write the annotated DataFrame to output.csv, omitting the index column.
data.to_csv("output.csv", index=False)