# LLMs Translation Pipeline


This notebook implements a machine translation pipeline from Luxembourgish to English, covering data pre-processing, translation, and data post-processing.

1. Context information should be included in each translation sample, and sliding windows will be used.
2. Make full use of the model's hallucination properties to generate more samples (different temperatures: 0.1, 0.5, 0.9, 1.3).
3. Clean the models' output and check its correctness (corpus checking).

## Data pre-processing

In [1]:
import json
import pandas as pd
from utils.utils_nlp import split_paragraph


def load_data(file_path, label):
    """Read a JSON Lines file and tag every record with ``label``.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON object per line.
        label: Value stored under the ``'label'`` key of every record
            (overwrites an existing ``'label'`` key, if any).

    Returns:
        A list with one parsed record per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank or whitespace-only lines carry no record; json.loads
            # would raise on them, so skip instead of aborting the load.
            if not line.strip():
                continue
            record = json.loads(line)
            record['label'] = label
            records.append(record)
    return records
    
# Single place to repoint the notebook at another copy of the NC_lux corpus;
# the three split paths below are all derived from it.
NC_LUX_DIR = '/Users/lujun.li/projects/mt_luxembourgish/data/NC_lux'

# Every record is tagged with its split name so the split can still be
# recovered from the 'label' column after the lists are concatenated.
train_data = load_data(f'{NC_LUX_DIR}/train.json', 'train')
test_data = load_data(f'{NC_LUX_DIR}/test.json', 'test')
dev_data = load_data(f'{NC_LUX_DIR}/dev.json', 'dev')
all_data = train_data + test_data + dev_data

NC_lux_df = pd.DataFrame(all_data)
# len() of each 'text' value, i.e. its length in characters (not tokens).
NC_lux_df["length"] = NC_lux_df["text"].apply(len)
NC_lux_df.head(1)



Unnamed: 0,label,text,length
0,train,Bei den Dammen a bei den Häre stoung an der Sc...,899


In [2]:
# De-duplicate on the 'text' column and report how many rows were removed.
original_len = NC_lux_df.shape[0]
print("Original length:", original_len)

# Only the first occurrence of each text is kept; NC_lux_df itself is modified.
NC_lux_df.drop_duplicates(subset=['text'], keep='first', inplace=True)

new_len = NC_lux_df.shape[0]
dropped_count = original_len - new_len
print("New length after dropping duplicates:", new_len)
print("Number of dropped duplicates:", dropped_count)

Original length: 10052
New length after dropping duplicates: 10042
Number of dropped duplicates: 10


In [3]:
from transformers import AutoTokenizer

# Checkpoint identifiers handed to `from_pretrained` for the two models used
# in this notebook.
model_nllb = "facebook/nllb-200-distilled-600M"
model_opus = "Helsinki-NLP/opus-mt-mul-en"

# One tokenizer per checkpoint. `calculate_tokens` below reads these two
# module-level names directly, so they must exist before it is called.
nllb_tokenizer = AutoTokenizer.from_pretrained(model_nllb)
opus_tokenizer = AutoTokenizer.from_pretrained(model_opus)

def calculate_tokens(df, text_column, tokenizers=None):
    """Add per-tokenizer token lists and token counts as columns of ``df``.

    For every ``prefix -> tokenizer`` pair two columns are written:
    ``<prefix>_tokens`` (the result of ``tokenizer.tokenize(text)``) and
    ``<prefix>_token_length`` (``len`` of that result).

    Args:
        df: DataFrame to annotate. It is modified in place.
        text_column: Name of the column holding the text to tokenize.
        tokenizers: Optional mapping from column prefix to an object exposing
            ``tokenize(text)``. When omitted, the notebook-level
            ``nllb_tokenizer`` and ``opus_tokenizer`` are used under the
            prefixes ``"NLLB"`` and ``"OPUS"``.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    if tokenizers is None:
        # Looked up at call time so the defaults follow the notebook globals.
        tokenizers = {"NLLB": nllb_tokenizer, "OPUS": opus_tokenizer}

    for prefix, tokenizer in tokenizers.items():
        # Tokenize each text once and derive the length from the stored
        # tokens, instead of running the tokenizer a second time per row.
        tokens = df[text_column].apply(tokenizer.tokenize)
        df[f"{prefix}_tokens"] = tokens
        df[f"{prefix}_token_length"] = tokens.apply(len)
    return df

# Adds the NLLB_* and OPUS_* token / token-length columns for the 'text'
# column; calculate_tokens modifies the frame in place and returns it.
NC_lux_df = calculate_tokens(NC_lux_df, 'text')

Token indices sequence length is longer than the specified maximum sequence length for this model (1775 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# local_files_only=True: load from files already on disk instead of
# downloading, so both checkpoints must be cached before this cell runs.
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(model_nllb,local_files_only=True)
opus_model = AutoModelForSeq2SeqLM.from_pretrained(model_opus,local_files_only=True)

# Number of positions declared in each model's config.
# NOTE(review): presumably the limit the *_token_length columns should be
# compared against — confirm before using it to size chunks.
nllb_max_length = nllb_model.config.max_position_embeddings
opus_max_length = opus_model.config.max_position_embeddings




In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with a KDE overlay) of the per-row NLLB token counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(NC_lux_df["NLLB_token_length"], bins=10, kde=True, ax=ax)
ax.set_title('Distribution of NLLB_token_length')
ax.set_xlabel('NLLB_token_length')
ax.set_ylabel('Frequency')
ax.grid()
plt.show()


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings. Because length_function=len, both sizes are measured in
# characters of the input string, not in tokenizer tokens.
# NOTE(review): the model limits are expressed in tokens — confirm that a
# character-based chunk size is what is intended here.
chunk_size = 512
chunk_overlap = 256  # half of chunk_size
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=False,  # separators are not interpreted as regexes
)
