## Running Inference with the Models

In [None]:
!pip install fasttext langid
!apt install g++
!pip install spacy
!python -m spacy download xx_ent_wiki_sm
!pip install sentencepiece
!pip install sacremoses
!pip install ctranslate2
!pip install langchain

## Running NLLB Locally

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import torch
import pandas as pd
from datetime import datetime
import os


def get_latest_file(pattern="translation_*.csv"):
    """Return the most recently modified file in the current directory matching ``pattern``.

    Args:
        pattern: Shell-style glob matched (case-sensitively) against the file
            names in the current working directory. The default matches the
            same names as the previous hard-coded filter (prefix
            ``translation_`` and suffix ``.csv``).

    Returns:
        The matching file name with the newest modification time, or ``None``
        when nothing matches.
    """
    # Local import: the notebook's import cell does not bring in fnmatch.
    from fnmatch import fnmatchcase

    # The pattern argument used to be ignored; it is now actually applied.
    files = [f for f in os.listdir() if fnmatchcase(f, pattern)]
    if not files:
        return None
    # Names come from os.listdir() of the cwd, so they resolve as relative paths.
    return max(files, key=os.path.getmtime)

def load_checkpoint(latest_file, df, text_column="subsentence"):
    """Return the index at which translation should resume.

    Args:
        latest_file: Path of a previously written results CSV, or a falsy
            value when there is no checkpoint.
        df: Unused; accepted so call sites can pass the source frame.
        text_column: Column of the checkpoint CSV whose entries are counted.

    Returns:
        Number of entries in ``text_column`` of the checkpoint file, or 0 when
        ``latest_file`` is falsy.
    """
    # No checkpoint: start from the first row.
    if not latest_file:
        return 0

    checkpoint_df = pd.read_csv(latest_file)
    already_done = checkpoint_df[text_column].tolist()
    return len(already_done)

# Translate a batch of texts and save the results to a CSV file
def translate_batch(translator, df, text_column="subsentence", batch_size=1, is_checkpoint=False):
    """Translate ``df[text_column]`` batch by batch, writing each row to CSV as it finishes.

    Args:
        translator: Callable that takes a list of strings and returns one
            dict per input with a ``'translation_text'`` entry.
        df: DataFrame holding the source rows. It is only read; each output
            row is a copy with an added ``translated_text`` field.
        text_column: Column of ``df`` containing the texts to translate.
        batch_size: Number of texts passed to ``translator`` per call.
        is_checkpoint: When True and ``get_latest_file()`` finds a previous
            results file, resume from it: its row count becomes the start
            index and new rows are appended to it. Otherwise a new
            ``translation_<timestamp>.csv`` is started.

    Returns:
        None. Results are written to the output CSV and its name is printed.
    """
    # Find the latest file by date
    latest_file = get_latest_file()
    resuming = bool(latest_file and is_checkpoint)

    if resuming:
        output_file = latest_file
        # Determine start index based on checkpoint file
        start_idx = load_checkpoint(latest_file, df, text_column)
        print ("Start From Index: ", start_idx)
    else:
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"translation_{current_time}.csv"
        start_idx = 0

    # A fresh output file needs its header exactly once. The decision is based
    # on whether we are resuming, not on whether some older results file
    # happens to exist; otherwise a new file created next to old outputs would
    # be written without a header row.
    header_pending = not resuming

    texts = df[text_column].to_list()
    for i in tqdm(range(start_idx, len(texts), batch_size), desc="Translating", unit="batch"):
        batch = texts[i:i + batch_size]
        translated_batch = translator(batch)

        for j in range(len(batch)):
            updated_row = df.iloc[i + j].copy()
            updated_row["translated_text"] = translated_batch[j]['translation_text']
            updated_dataframe = pd.DataFrame([updated_row])

            # Rows are flushed one at a time so an interrupted run keeps its progress.
            if header_pending:
                updated_dataframe.to_csv(output_file, index=False, mode="w", header=True)
                header_pending = False
            else:
                updated_dataframe.to_csv(output_file, index=False, mode="a", header=False)

    print(f"Translation completed. Results saved to {output_file}")



# Local NLLB checkpoint and the language pair handed to the translation pipeline.
model_path = "/home/lujun_li/projects/base_models/nllb-200-3.3B"
src_lang="ltz_Latn"
tgt_lang="eng_Latn"
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
batch_size = 5
# NOTE(review): a `device` kwarg on the tokenizer looks unnecessary - confirm it has any effect.
tokenizer = AutoTokenizer.from_pretrained(model_path, device=device)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Translation pipeline built from the objects above; called by translate_batch with lists of texts.
translator = pipeline(
    'translation',
    model=model,
    tokenizer=tokenizer,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
    max_length=360,
    device=device,
    batch_size=batch_size
)

# Source sentences are read from the "subsentence" column of the test CSV.
dataset_df = pd.read_csv("/home/lujun_li/projects/mt_luxembourgish/data/NC_lux/NC_lux_subsentences_test.csv")
# is_checkpoint=False starts a new timestamped output file instead of resuming.
# Note: batch_size is passed here as a literal 5, not via the batch_size variable above.
translate_batch(translator = translator, df = dataset_df, text_column="subsentence", batch_size=5, is_checkpoint=False)


## Running an LLM Locally


In [1]:
from langchain.prompts import PromptTemplate

# Define a function to generate the translation prompt
def generate_translation_prompt(text, language_1="Luxembourgish", language_2="English"):
    """Build the instruction prompt asking a model to translate ``text``.

    Args:
        text: The text to be translated; inserted at the ``{Text}`` placeholder.
        language_1: Name of the source language used in the instruction.
        language_2: Name of the target language used in the instruction.

    Returns:
        The fully formatted prompt string.
    """
    prompt_template = """Please translate the following {language_1} text into {language_2}. Please answer me with only translated text!

    ---------------------------------- Text to be translated ----------------------------------

    {Text}

    ---------------------------------- Text to be translated ----------------------------------

    """

    translation_prompt = PromptTemplate(
        # input_variables lists placeholder NAMES. The third entry must be the
        # literal name "Text", not the value of the `text` argument.
        input_variables=["language_1", "language_2", "Text"],
        template=prompt_template
    )

    return translation_prompt.format(language_1=language_1, language_2=language_2, Text=text)



In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
    pipeline,
)
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
import time
import os

def initialize_pipeline(model_config_dict):
    """Create a text-generation pipeline from a configuration dictionary.

    Args:
        model_config_dict: Mapping with the keys
            ``CURRENT_RUNNING_MODEL`` (model path or id; must be non-empty),
            ``IF_LOADING_QUANTIZATION`` (whether to load with bitsandbytes
            quantization), ``CURRENT_LOAD_IN_4BIT`` and
            ``CURRENT_LOAD_IN_8BIT`` (which quantization mode to request).

    Returns:
        The constructed ``text-generation`` pipeline, with a customised
        generation config attached as ``generation_config``.

    Raises:
        ValueError: If ``CURRENT_RUNNING_MODEL`` is empty.
        KeyError: If one of the expected keys is missing.
    """
    model_path = model_config_dict["CURRENT_RUNNING_MODEL"]
    if not model_path:
        raise ValueError("CURRENT_RUNNING_MODEL is not set")
    load_in_4bit, load_in_8bit = model_config_dict["CURRENT_LOAD_IN_4BIT"], model_config_dict["CURRENT_LOAD_IN_8BIT"]

    if model_config_dict["IF_LOADING_QUANTIZATION"]:
        nf4_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit, load_in_8bit=load_in_8bit, bnb_4bit_compute_dtype=torch.float16)
    else:
        nf4_config = None

    generation_config = GenerationConfig.from_pretrained(model_path)
    generation_config.do_sample = True
    # Both limits are set, as in the original configuration.
    # NOTE(review): when max_new_tokens is set it is expected to take
    # precedence over max_length - confirm for the installed version.
    generation_config.max_length = 512
    # NOTE(review): pad token id is hard-coded to 0 - confirm it suits the model's tokenizer.
    generation_config.pad_token_id = 0
    generation_config.top_p = 0.9
    generation_config.temperature = 0.1
    generation_config.max_new_tokens = 512

    # The quantization config used to be built and then dropped, so the model
    # always loaded unquantized. It is now forwarded to the model loader, but
    # only when requested, so the non-quantized path is unchanged.
    extra_pipeline_kwargs = {}
    if nf4_config is not None:
        extra_pipeline_kwargs["model_kwargs"] = {"quantization_config": nf4_config}

    text_pipeline = pipeline(
        "text-generation",
        model=model_path,
        torch_dtype=torch.float32,
        device_map="auto",
        **extra_pipeline_kwargs,
    )
    # The config is attached to the pipeline object (not to
    # text_pipeline.model.generation_config), as in the original code.
    # NOTE(review): confirm the installed transformers version reads it from here.
    text_pipeline.generation_config = generation_config
    return text_pipeline

def generate_text(pipeline, prompt):
    """Run ``pipeline`` on ``prompt`` and return the first result's ``"generated_text"``."""
    outputs = pipeline(prompt)
    first_output = outputs[0]
    return first_output["generated_text"]

def find_most_recent_date(df, date_column):
    """Return the latest date found in ``df[date_column]``.

    The column is converted to datetimes in place on ``df``; values that
    cannot be parsed are coerced to NaT.
    """
    parsed_dates = pd.to_datetime(df[date_column], errors='coerce')
    # The caller's frame is updated with the parsed column, as before.
    df[date_column] = parsed_dates
    return df[date_column].max()

def get_latest_file(pattern="translation_*.csv"):
    """Return the newest ``translation_LLM_*.csv`` file in the current directory.

    Note that ``pattern`` is accepted but not used: the filter is the fixed
    prefix ``translation_LLM_`` and suffix ``.csv``.

    Returns:
        The matching file name with the latest modification time, or ``None``
        when there is no matching file.
    """
    candidates = []
    for name in os.listdir():
        if name.startswith("translation_LLM_") and name.endswith(".csv"):
            candidates.append(name)

    if candidates:
        return max(candidates, key=os.path.getmtime)
    return None

def load_checkpoint(latest_file, df, text_column="subsentence"):
    """Count the rows already stored in a checkpoint CSV.

    Args:
        latest_file: Checkpoint CSV path; a falsy value means "no checkpoint".
        df: Not used by this function.
        text_column: Column of the checkpoint whose entries are counted.

    Returns:
        The number of entries in ``text_column`` of ``latest_file``, or 0
        when there is no checkpoint.
    """
    return len(pd.read_csv(latest_file)[text_column].tolist()) if latest_file else 0

def translate_batch_LLM(translator, df, text_column="subsentence", batch_size=1):
    """Translate ``df[text_column]`` with a text-generation pipeline, writing rows to CSV.

    A ``prompts_inputs`` column (the result of ``generate_translation_prompt``
    for each text) is added to ``df`` itself. If ``get_latest_file()`` finds a
    previous results file, the run resumes from its row count and appends to
    it; otherwise a new ``translation_LLM_<timestamp>.csv`` is created.

    Args:
        translator: Callable taking a list of prompts; for input ``j`` the
            translation is read from ``result[j][0]['generated_text']``.
        df: DataFrame with the source texts (modified: gains ``prompts_inputs``).
        text_column: Column containing the texts to translate.
        batch_size: Number of prompts sent to ``translator`` per call.
    """
    df["prompts_inputs"] = df[text_column].apply(generate_translation_prompt)

    checkpoint_file = get_latest_file()
    if checkpoint_file:
        # Resume: keep writing to the existing file, after the rows it already holds.
        output_file = checkpoint_file
        start_idx = load_checkpoint(checkpoint_file, df, text_column)
    else:
        output_file = f"translation_LLM_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        start_idx = 0
    print("Start From Index: ", start_idx)

    prompts = df["prompts_inputs"].to_list()
    # Only a brand-new file gets a header, and only on its very first row.
    header_pending = not checkpoint_file

    for batch_start in tqdm(range(start_idx, len(prompts), batch_size), desc="Translating", unit="batch"):
        prompt_batch = prompts[batch_start:batch_start + batch_size]
        generations = translator(prompt_batch)

        for offset in range(len(prompt_batch)):
            row = df.iloc[batch_start + offset].copy()
            row["translated_text"] = generations[offset][0]['generated_text']
            row_frame = pd.DataFrame([row])

            if header_pending:
                row_frame.to_csv(output_file, index=False, mode="w", header=True)
                header_pending = False
            else:
                row_frame.to_csv(output_file, index=False, mode="a", header=False)

    print(f"Translation completed. Results saved to {output_file}")

# Settings read by initialize_pipeline().
# With IF_LOADING_QUANTIZATION set to False, initialize_pipeline sets its
# quantization config to None, so the 4-bit / 8-bit flags are read but have no effect.
model_config_dict = {
    "CURRENT_RUNNING_MODEL": "/home/lujun_li/projects/base_models/Llama-3.2-3B-Instruct",
    "IF_LOADING_QUANTIZATION": False,
    "CURRENT_LOAD_IN_4BIT":    True,
    "CURRENT_LOAD_IN_8BIT":    False
}

# Build the text-generation pipeline, load the test sentences, and translate them.
# translate_batch_LLM resumes from (and appends to) the newest translation_LLM_*.csv
# in the working directory when one exists.
translator = initialize_pipeline(model_config_dict)
dataset_df = pd.read_csv("/home/lujun_li/projects/mt_luxembourgish/data/NC_lux/NC_lux_subsentences_test.csv")
translate_batch_LLM(translator = translator, df = dataset_df, text_column="subsentence", batch_size=1)

## Running an LLM with Ollama

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
import os
import requests


def generate_text_with_ollama(model_name, prompt):
    """Generate a completion for ``prompt`` from a local Ollama server.

    Args:
        model_name: Name of the model as known to the Ollama server.
        prompt: Prompt text to send.

    Returns:
        The ``"response"`` field of the server's JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = "http://localhost:11434/api/generate"  # Default local Ollama server endpoint
    payload = {
        "model": model_name,
        "prompt": prompt,
        "system": "",
        # The endpoint streams newline-delimited JSON chunks by default, which
        # response.json() cannot parse; request one complete JSON object instead.
        "stream": False,
        # Sampling parameters are sent under "options"; the output-length
        # limit there is called num_predict.
        "options": {
            "temperature": 0.1,
            "num_predict": 512,
            "top_p": 0.9,
        },
    }
    response = requests.post(url, json=payload)
    response.raise_for_status()
    return response.json()["response"]

def find_most_recent_date(df, date_column):
    """Parse ``df[date_column]`` as datetimes (in place) and return the maximum.

    Unparseable entries become NaT because parsing uses ``errors='coerce'``.
    """
    df[date_column] = df[date_column].pipe(pd.to_datetime, errors='coerce')
    return df[date_column].max()

def get_latest_file(pattern="translation_*.csv"):
    """Find the most recently modified LLM results file in the working directory.

    ``pattern`` is not consulted; a file qualifies when its name starts with
    ``translation_LLM_`` and ends with ``.csv``.

    Returns:
        Name of the qualifying file with the greatest modification time, or
        ``None`` if no file qualifies.
    """
    def _is_llm_result(name):
        return name.startswith("translation_LLM_") and name.endswith(".csv")

    return max(filter(_is_llm_result, os.listdir()), key=os.path.getmtime, default=None)

def load_checkpoint(latest_file, df, text_column="subsentence"):
    """Work out how many rows a previous run already wrote.

    Args:
        latest_file: Path to the earlier results CSV, or a falsy value if none.
        df: Ignored by this function.
        text_column: Column in the results CSV used for counting.

    Returns:
        0 without a checkpoint; otherwise the number of entries in
        ``text_column`` of the checkpoint CSV.
    """
    start_idx = 0
    if latest_file:
        done_texts = pd.read_csv(latest_file)[text_column].tolist()
        start_idx = len(done_texts)
    return start_idx

def translate_batch_ollama(model_name, df, text_column="subsentence", batch_size=1):
    """Translate ``df[text_column]`` through Ollama, writing each row to CSV as it completes.

    ``df`` gains a ``prompts_inputs`` column built with
    ``generate_translation_prompt``. When ``get_latest_file()`` returns an
    existing results file, the run continues after the rows it already holds
    and appends to it; otherwise a new ``translation_LLM_<timestamp>.csv`` is
    started with a header row.

    Args:
        model_name: Model name passed to ``generate_text_with_ollama``.
        df: DataFrame with the source texts (modified: gains ``prompts_inputs``).
        text_column: Column containing the texts to translate.
        batch_size: Number of prompts handled per progress-bar step; each
            prompt is still sent to Ollama individually.
    """
    df["prompts_inputs"] = df[text_column].apply(generate_translation_prompt)

    latest_file = get_latest_file()
    # Reuse the existing results file if there is one, else start a timestamped file.
    output_file = latest_file or f"translation_LLM_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    start_idx = load_checkpoint(latest_file, df, text_column) if latest_file else 0
    print("Start From Index: ", start_idx)

    prompts = df["prompts_inputs"].to_list()
    batch_starts = range(start_idx, len(prompts), batch_size)

    for batch_start in tqdm(batch_starts, desc="Translating", unit="batch"):
        prompt_batch = prompts[batch_start:batch_start + batch_size]
        translations = [generate_text_with_ollama(model_name, prompt) for prompt in prompt_batch]

        for row_idx, translation in enumerate(translations, start=batch_start):
            row = df.iloc[row_idx].copy()
            row["translated_text"] = translation

            # The header is written only for the first row of a newly created file.
            starts_new_file = (not latest_file) and row_idx == start_idx
            pd.DataFrame([row]).to_csv(
                output_file,
                index=False,
                mode="w" if starts_new_file else "a",
                header=starts_new_file,
            )

    print(f"Translation completed. Results saved to {output_file}")

def generate_translation_prompt(text, language_1="Luxembourgish", language_2="English"):
    """Build the translation instruction sent to the Ollama model.

    The previous prompt asked for a translation *into* Luxembourgish, while
    the other pipelines in this notebook translate the same Luxembourgish
    dataset into English. The direction is now explicit and defaults to
    Luxembourgish -> English. The ``language_1`` / ``language_2`` parameters
    also keep this redefinition call-compatible with the earlier function of
    the same name.

    Args:
        text: Text to translate.
        language_1: Source language named in the instruction.
        language_2: Target language named in the instruction.

    Returns:
        The prompt string.
    """
    return f"Translate the following {language_1} text into {language_2}:\n\n{text}"

# Configuration for Ollama
# NOTE(review): confirm this matches the model name registered on the local Ollama server.
model_name = "llama-3.2-3b-instruct"

# Load the dataset and start translation
# translate_batch_ollama resumes from (and appends to) the newest translation_LLM_*.csv
# in the working directory. That is the same file prefix translate_batch_LLM writes,
# so an earlier local-LLM results file would be picked up here.
dataset_df = pd.read_csv("/home/lujun_li/projects/mt_luxembourgish/data/NC_lux/NC_lux_subsentences_test.csv")
translate_batch_ollama(model_name=model_name, df=dataset_df, text_column="subsentence", batch_size=1)


## Running an LLM with vLLM