## Run inference with the models

In [None]:
# Install the third-party packages used by the inference cells below.
# Versions are not pinned, so results depend on whatever pip resolves at run time.
# NOTE(review): later cells also import tqdm and requests, which are not installed
# explicitly here — presumably they arrive as transitive dependencies; confirm.
!pip install fasttext langid
!pip install langchain
!pip install 'transformers[torch]'
!pip install sentencepiece
!pip install sacremoses
!pip install ctranslate2
!pip install torch
!pip install pandas

## NLLB running locally

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import torch
import pandas as pd
from datetime import datetime
import os
import sys

# Put the project checkout on sys.path (once) so project modules can be imported
# from this notebook. The path is an absolute, machine-specific location.
project_root = "/home/users/luli/project/mt_luxembourgish"
if project_root not in sys.path:
    sys.path.append(project_root)

def get_latest_file(pattern):
    """Return the newest CSV in the current working directory whose name starts with ``pattern``.

    Args:
        pattern: Plain filename prefix to match (not a glob or regex).

    Returns:
        The bare filename with the latest modification time, or ``None`` when
        no file matches.
    """
    # Restrict to ".csv" so that unrelated files sharing the prefix (logs,
    # notebooks, ...) are never handed to load_checkpoint as a checkpoint.
    candidates = [
        name for name in os.listdir()
        if name.startswith(pattern) and name.endswith(".csv")
    ]
    return max(candidates, key=os.path.getmtime, default=None)

def load_checkpoint(latest_file, df, text_column):
    """Return the row index at which translation should resume.

    Args:
        latest_file: Path of an existing checkpoint CSV, or a falsy value.
        df: Not used by this function; kept so existing calls keep working.
        text_column: Column that must exist in the checkpoint CSV; its length
            is the number of rows already written.

    Returns:
        Number of rows in the checkpoint, or 0 when ``latest_file`` is falsy.
    """
    if not latest_file:
        return 0
    checkpoint_df = pd.read_csv(latest_file)
    return len(checkpoint_df[text_column])

def translate_batch(config, df):
    """Translate one column of ``df`` with a seq2seq translation pipeline, checkpointing to CSV.

    Each translated row is written to the output CSV immediately, so an
    interrupted run can resume from the rows already on disk.

    Args:
        config: Dict providing ``model_name``, ``src_lang``, ``tgt_lang``,
            ``max_length``, ``device``, ``prefix``, ``text_column``,
            ``batch_size`` and optionally ``is_new_file``.
        df: DataFrame containing ``config["text_column"]``.

    Returns:
        None. Output rows are the input rows plus a ``translated_text`` column.
    """
    # Build the translation pipeline from the configured checkpoint.
    checkpoint_dir = config["model_name"]
    nllb_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    nllb_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)
    translator = pipeline(
        "translation",
        model=nllb_model,
        tokenizer=nllb_tokenizer,
        src_lang=config["src_lang"],
        tgt_lang=config["tgt_lang"],
        max_length=config["max_length"],
        device=config["device"],
    )

    # Resume into the newest matching CSV unless a fresh file was requested.
    prefix = config["prefix"]
    latest_file = get_latest_file(prefix)
    resume = bool(latest_file) and not bool(config.get("is_new_file", False))
    if resume:
        output_file = latest_file
    else:
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{prefix}_{current_time}.csv"

    texts = df[config["text_column"]].tolist()
    start_idx = load_checkpoint(latest_file, df, config["text_column"]) if resume else 0
    print("Start From Index: ", start_idx)

    batch_size = config["batch_size"]
    batch_starts = range(start_idx, len(texts), batch_size)
    for offset in tqdm(batch_starts, desc="Translating", unit="batch"):
        batch = texts[offset:offset + batch_size]
        outputs = translator(batch)

        for pos in range(len(batch)):
            row = df.iloc[offset + pos].copy()
            row["translated_text"] = outputs[pos]["translation_text"]

            # Only the very first row of a run that starts at index 0
            # (re)creates the file and writes the header; everything else appends.
            starts_file = offset == start_idx and pos == 0 and start_idx == 0
            pd.DataFrame([row]).to_csv(
                output_file,
                index=False,
                mode="w" if starts_file else "a",
                header=starts_file,
            )

    print(f"Translation completed. Results saved to {output_file}")


# Unified configuration dictionary for the NLLB run; every key is read by
# translate_batch().
config = {
    "model_name": "/mnt/lscratch/users/luli/model/nllb-200-3.3B/",
    "src_lang": "ltz_Latn",
    "tgt_lang": "eng_Latn",
    # GPU index 2 is hard-coded; falls back to CPU when CUDA is unavailable.
    "device": 'cuda:2' if torch.cuda.is_available() else 'cpu',
    "max_length": 360,
    "batch_size": 6,
    "text_column": "subsentence",
    # translate_batch() appends "_<timestamp>.csv" to the prefix, so the trailing
    # underscore here yields a double underscore in the output filename.
    "prefix": "translation_nllb_",
    # False: resume into the newest CSV matching the prefix, if one exists.
    "is_new_file": False
}

# Load the test sentences (path is relative to the working directory) and run.
input_file = "NC_lux_subsentences_test.csv"
dataset_df = pd.read_csv(input_file)
translate_batch(config, dataset_df)

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.89it/s]


Start From Index:  6


Translating: 100%|██████████| 4/4 [00:23<00:00,  5.96s/batch]

Translation completed. Results saved to translation_nllb__20241117_000411.csv





## LLM running locally


In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
    pipeline,
)
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
import time
import os

from langchain.prompts import PromptTemplate
import sys

# Put the project checkout on sys.path (once) so project modules can be imported
# from this notebook. The path is an absolute, machine-specific location.
project_root = "/home/users/luli/project/mt_luxembourgish"
if project_root not in sys.path:
    sys.path.append(project_root)

def get_latest_file(pattern):
    """Return the newest CSV in the current working directory whose name starts with ``pattern``.

    Note: this cell defines ``get_latest_file`` a second time further down; the
    later definition is the one in effect once the whole cell has run.

    Args:
        pattern: Plain filename prefix to match (not a glob or regex).

    Returns:
        The bare filename with the latest modification time, or ``None`` when
        no file matches.
    """
    # Filter on ".csv" so this agrees with the later definition and never
    # selects a non-CSV file as a checkpoint.
    csv_names = [
        entry for entry in os.listdir()
        if entry.startswith(pattern) and entry.endswith(".csv")
    ]
    if not csv_names:
        return None
    return max(csv_names, key=os.path.getmtime)

def load_checkpoint(latest_file, df, text_column):
    """Count the rows already present in a checkpoint CSV.

    Note: this cell defines ``load_checkpoint`` again further down; the later
    definition is the one in effect once the whole cell has run.

    Args:
        latest_file: Path of the checkpoint CSV, or a falsy value for "none".
        df: Not used by this function; kept so existing calls keep working.
        text_column: Column looked up in the checkpoint; must exist there.

    Returns:
        The number of rows in the checkpoint, or 0 without a checkpoint.
    """
    already_done = 0
    if latest_file:
        already_done = pd.read_csv(latest_file)[text_column].shape[0]
    return already_done


# Define a function to generate the translation prompt
def generate_translation_prompt(text, language_1="Luxembourgish", language_2="English"):
    """Build the instruction prompt asking a model to translate ``text``.

    Args:
        text: Source text to embed in the prompt.
        language_1: Name of the source language shown in the instruction.
        language_2: Name of the target language shown in the instruction.

    Returns:
        The fully formatted prompt string.
    """
    prompt_template = """Please translate the following {language_1} text into {language_2}. Please answer me with only translated text!

    ---------------------------------- Text to be translated ----------------------------------

    {Text}

    ---------------------------------- Text to be translated ----------------------------------

    """

    # input_variables lists the placeholder *names* used in the template; the
    # third entry must be the string "Text", not the text being translated.
    translation_prompt = PromptTemplate(
        input_variables=["language_1", "language_2", "Text"],
        template=prompt_template
    )

    return translation_prompt.format(language_1=language_1, language_2=language_2, Text=text)


def initialize_pipeline(config):
    """Create the Hugging Face text-generation pipeline described by ``config``.

    Args:
        config: Dict providing ``model_name``, ``if_loading_quantization``,
            ``model_config`` (generation settings) and ``device`` (passed as
            ``device_map``). When quantization is enabled it also reads
            ``current_load_in_4bit`` and ``current_load_in_8bit``.

    Returns:
        The pipeline object, with a ``generation_config`` attribute holding the
        model's generation config updated with ``config["model_config"]``.

    Raises:
        ValueError: If ``model_name`` is empty.
    """
    model_path = config["model_name"]
    if not model_path:
        raise ValueError("model_name is not set")

    # Build the optional bitsandbytes configuration. The 8-bit flag is read from
    # its own key (it previously reused the 4-bit key), and the resulting config
    # is forwarded to model loading instead of being discarded.
    extra_pipeline_kwargs = {}
    if config["if_loading_quantization"]:
        nf4_config = BitsAndBytesConfig(
            load_in_4bit=config["current_load_in_4bit"],
            load_in_8bit=config.get("current_load_in_8bit", False),
            bnb_4bit_compute_dtype=torch.float16,
        )
        extra_pipeline_kwargs["model_kwargs"] = {"quantization_config": nf4_config}

    # Start from the generation config stored with the model and overlay the
    # user-supplied settings attribute by attribute.
    generation_config = GenerationConfig.from_pretrained(model_path)
    for key, value in config["model_config"].items():
        setattr(generation_config, key, value)

    text_pipeline = pipeline(
        "text-generation",
        model=model_path,
        torch_dtype=torch.float32,
        device_map=config["device"],
        **extra_pipeline_kwargs,
    )
    # text_pipeline.model.generation_config = generation_config # This needs to write a blog on that
    # NOTE(review): whether the pipeline honours this attribute at generation
    # time depends on the installed transformers version — confirm.
    text_pipeline.generation_config = generation_config
    return text_pipeline

def generate_text(pipeline, prompt):
    """Run ``pipeline`` on ``prompt`` and return the first result's ``generated_text``.

    Args:
        pipeline: Callable returning a sequence of dicts with a
            ``"generated_text"`` key.
        prompt: Input passed to the callable unchanged.

    Returns:
        The ``"generated_text"`` value of the first returned item.
    """
    first_candidate = pipeline(prompt)[0]
    return first_candidate["generated_text"]

def find_most_recent_date(df, date_column):
    """Return the latest date found in ``df[date_column]``.

    Side effect: the column is converted to datetimes in place on the passed
    DataFrame; values that cannot be parsed become ``NaT``.

    Args:
        df: DataFrame holding the date column (modified in place).
        date_column: Name of the column to parse.

    Returns:
        The maximum of the parsed column.
    """
    parsed_dates = pd.to_datetime(df[date_column], errors='coerce')
    df[date_column] = parsed_dates
    return df[date_column].max()

def get_latest_file(prefix):
    """Find the most recently modified ``.csv`` in the working directory with the given prefix.

    Args:
        prefix: Plain filename prefix to match.

    Returns:
        The bare filename of the newest match, or ``None`` if nothing matches.
    """
    newest_name, newest_mtime = None, None
    for entry in os.listdir():
        if not (entry.startswith(prefix) and entry.endswith(".csv")):
            continue
        mtime = os.path.getmtime(entry)
        # Strict ">" keeps the first-listed file on ties.
        if newest_mtime is None or mtime > newest_mtime:
            newest_name, newest_mtime = entry, mtime
    return newest_name

def load_checkpoint(latest_file, df, text_column="subsentence"):
    """Return how many rows a checkpoint CSV already contains.

    Args:
        latest_file: Path of the checkpoint CSV, or a falsy value for "none".
        df: Not used by this function; kept so existing calls keep working.
        text_column: Column looked up in the checkpoint; must exist there.

    Returns:
        Row count of the checkpoint, or 0 when there is no checkpoint.
    """
    if not latest_file:
        return 0
    previous_rows = pd.read_csv(latest_file)
    return len(previous_rows[text_column])

def translate_batch_LLM(config, df):
    """Translate ``df`` with a local text-generation pipeline, checkpointing each row to CSV.

    Side effect: adds a ``prompts_inputs`` column to ``df`` in place; that
    column is written to the output file along with ``translated_text``.

    Args:
        config: Dict providing the keys needed by ``initialize_pipeline`` plus
            ``text_column``, ``prefix``, ``batch_size`` and optionally
            ``is_new_file``.
        df: DataFrame containing ``config["text_column"]``.

    Returns:
        None. Results are appended to the output CSV row by row.
    """
    translator = initialize_pipeline(config)
    df["prompts_inputs"] = df[config["text_column"]].apply(generate_translation_prompt)

    # Resume into the newest matching CSV unless a fresh file was requested.
    prefix = config["prefix"]
    latest_file = get_latest_file(prefix)
    resuming = bool(latest_file) and not bool(config.get("is_new_file", False))
    if resuming:
        output_file = latest_file
        start_idx = load_checkpoint(latest_file, df, config["text_column"])
    else:
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{prefix}_{current_time}.csv"
        start_idx = 0

    print("Start From Index: ", start_idx)
    texts = df["prompts_inputs"].to_list()
    batch_size = config["batch_size"]

    # A file created by this run always gets a header on its first row, even
    # when is_new_file is False and no earlier file existed. Previously that
    # case produced a headerless CSV, which breaks the next resume.
    write_header = not resuming
    for i in tqdm(range(start_idx, len(texts), batch_size), desc="Translating", unit="batch"):
        batch = texts[i:i + batch_size]
        translated_batch = translator(batch, pad_token_id=translator.tokenizer.eos_token_id, return_full_text=False)

        for j in range(len(batch)):
            updated_row = df.iloc[i + j].copy()
            updated_row["translated_text"] = translated_batch[j][0]['generated_text']
            pd.DataFrame([updated_row]).to_csv(
                output_file,
                index=False,
                mode="w" if write_header else "a",
                header=write_header,
            )
            write_header = False

    print(f"Translation completed. Results saved to {output_file}")

# Configuration for the local Hugging Face text-generation run; read by
# initialize_pipeline() and translate_batch_LLM().
config = {
    "model_name": "/mnt/lscratch/users/luli/model/Llama-3.2-3B-Instruct",
    # The two current_load_in_* flags only matter when if_loading_quantization is True.
    "if_loading_quantization": False,
    "current_load_in_4bit": True,
    "current_load_in_8bit": False,
    # Each entry is set as an attribute on the model's GenerationConfig.
    # NOTE(review): "max_tokens" may not be a recognised generation setting, and
    # "max_length" overlaps with "max_new_tokens" — confirm which are honoured.
    "model_config": {
        "temperature": 0.1, # necessary
        "max_tokens": 512, # necessary
        "top_p": 0.9, # necessary
        "do_sample": True, # necessary
        "max_new_tokens": 512, # necessary
        "max_length": 512, # necessary
    },
    "batch_size": 5, # use this to accelerate the translation process
    # translate_batch_LLM() appends "_<timestamp>.csv", so the trailing
    # underscore yields a double underscore in the output filename.
    "prefix": "translation_LLM_huggingface_pipeline_", # necessary
    "text_column": "subsentence", # necessary
    "device": "auto",
    # False: resume into the newest CSV matching the prefix, if one exists.
    "is_new_file": False
}

# Load the test sentences (path is relative to the working directory) and run.
dataset_df = pd.read_csv("NC_lux_subsentences_test.csv")
translate_batch_LLM(config = config, df = dataset_df)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.10s/it]


Start From Index:  10


Translating: 100%|██████████| 4/4 [06:17<00:00, 94.48s/batch] 

Translation completed. Results saved to translation_LLM_huggingface_pipeline__20241117_003434.csv





## LLM running with Ollama

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from langchain.prompts import PromptTemplate
import os
import requests
import json
import sys

# Put the project checkout on sys.path (once) so project modules can be imported
# from this notebook. The path is an absolute, machine-specific location.
project_root = "/home/users/luli/project/mt_luxembourgish"
if project_root not in sys.path:
    sys.path.append(project_root)

# Define a function to generate the translation prompt
def generate_translation_prompt(text, language_1 = "Luxembourgish", language_2 = "English"):
    """Build the instruction prompt asking a model to translate ``text``.

    Args:
        text: Source text to embed in the prompt.
        language_1: Name of the source language shown in the instruction.
        language_2: Name of the target language shown in the instruction.

    Returns:
        The fully formatted prompt string.
    """
    prompt_template = """Please translate the following {language_1} text into {language_2}. Please answer me with only translated text!

    ---------------------------------- Text to be translated ----------------------------------

    {Text}

    ---------------------------------- Text to be translated ----------------------------------

    """

    # input_variables lists the placeholder *names* used in the template; the
    # third entry must be the string "Text", not the text being translated.
    translation_prompt = PromptTemplate(
        input_variables=["language_1", "language_2", "Text"],
        template=prompt_template
    )

    return translation_prompt.format(language_1=language_1, language_2=language_2, Text=text)


def generate_text_with_ollama(config, prompt):
    """POST ``prompt`` to the configured generation endpoint and return the reply text.

    Args:
        config: Dict providing ``model_name``, ``options``, ``server_url`` and
            ``headers``.
        prompt: Prompt string sent in the request body.

    Returns:
        The ``"response"`` field of the JSON reply.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    request_body = {
        "model": config["model_name"],
        "prompt": prompt,
        "stream": False,
        # Original author's note: without this, the API call blocks.
        # NOTE(review): presumably this also constrains the reply to JSON —
        # confirm that is wanted for plain translations.
        "format": "json",
        "options": config["options"],
    }
    reply = requests.post(
        config["server_url"],
        headers=config["headers"],
        data=json.dumps(request_body),
    )

    if reply.status_code != 200:
        raise Exception(f"Error: {reply.status_code}, {reply.text}")
    return json.loads(reply.text)["response"]

def get_latest_file(prefix):
    """Return the newest ``.csv`` in the working directory whose name starts with ``prefix``.

    Args:
        prefix: Plain filename prefix to match.

    Returns:
        The bare filename with the latest modification time, or ``None`` when
        no file matches.
    """
    matching = filter(
        lambda name: name.startswith(prefix) and name.endswith(".csv"),
        os.listdir(),
    )
    return max(matching, key=os.path.getmtime, default=None)

def load_checkpoint(latest_file, df, text_column):
    """Return the index of the first row not yet present in the checkpoint CSV.

    Args:
        latest_file: Path of the checkpoint CSV, or a falsy value for "none".
        df: Not used by this function; kept so existing calls keep working.
        text_column: Column looked up in the checkpoint; must exist there.

    Returns:
        Row count of the checkpoint, or 0 when there is no checkpoint.
    """
    return len(pd.read_csv(latest_file)[text_column]) if latest_file else 0

def translate_batch_ollama(config, df):
    """Translate ``df`` through ``generate_text_with_ollama``, checkpointing each row to CSV.

    Side effect: adds a ``prompts_inputs`` column to ``df`` in place; that
    column is written to the output file along with ``translated_text``.

    Args:
        config: Dict providing the keys needed by ``generate_text_with_ollama``
            plus ``text_column``, ``prefix``, ``batch_size`` and optionally
            ``is_new_file``.
        df: DataFrame containing ``config["text_column"]``.

    Returns:
        None. Results are appended to the output CSV row by row.
    """
    df["prompts_inputs"] = df[config["text_column"]].apply(generate_translation_prompt)

    # Resume into the newest matching CSV unless a fresh file was requested.
    prefix = config["prefix"]
    latest_file = get_latest_file(prefix)
    resuming = bool(latest_file) and not bool(config.get("is_new_file", False))
    if resuming:
        output_file = latest_file
        start_idx = load_checkpoint(latest_file, df, config["text_column"])
    else:
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{prefix}_{current_time}.csv"
        start_idx = 0

    print("Start From Index: ", start_idx)
    texts = df["prompts_inputs"].to_list()
    batch_size = config["batch_size"]

    # The header decision depends on whether THIS run created the output file,
    # not on whether some older file with the prefix exists. Previously
    # is_new_file=True with an older file present produced a headerless CSV.
    write_header = not resuming
    for i in tqdm(range(start_idx, len(texts), batch_size), desc="Translating", unit="batch"):
        batch = texts[i:i + batch_size]
        translated_batch = [generate_text_with_ollama(config, text) for text in batch]

        for j, translation in enumerate(translated_batch):
            updated_row = df.iloc[i + j].copy()
            updated_row["translated_text"] = translation
            pd.DataFrame([updated_row]).to_csv(
                output_file,
                index=False,
                mode="w" if write_header else "a",
                header=write_header,
            )
            write_header = False

    print(f"Translation completed. Results saved to {output_file}")


# Configuration for the Ollama run; read by generate_text_with_ollama() and
# translate_batch_ollama().
config = {
    "model_name": "llama3.1:8b", # necessary
    "server_url": "http://localhost:11434/api/generate", # necessary
    "headers": {"Content-Type": "application/json"},  # necessary
    # model setting options, forwarded verbatim as the request's "options" field
    # NOTE(review): several keys below look like Hugging Face / OpenAI names
    # (max_tokens, do_sample, max_new_tokens, max_length) — confirm the server
    # actually recognises them.
    "options": {
        "temperature": 0.1, # necessary
        "max_tokens": 512, # necessary
        "top_p": 0.9, # necessary
        "do_sample": True, # necessary
        "max_new_tokens": 512, # necessary
        "max_length": 512, # necessary
        "num_ctx": 2048, # necessary for accelerating the translation process
    },
    # Prompts within a batch are sent one request at a time.
    "batch_size": 1, # use this to accelerate the translation process
    "prefix": "translation_LLM_ollama", # necessary
    "text_column": "subsentence", # necessary
    # True: always start a new timestamped output file instead of resuming.
    "is_new_file": True
}

# Load the test sentences (path is relative to the working directory) and run.
input_file = "NC_lux_subsentences_test.csv"
dataset_df = pd.read_csv(input_file)
translate_batch_ollama(config=config, df=dataset_df)


## LLM running with vLLM

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from langchain.prompts import PromptTemplate
import os
import requests
import json
import sys

# Put the project checkout on sys.path (once) so project modules can be imported
# from this notebook. The path is an absolute, machine-specific location.
project_root = "/home/users/luli/project/mt_luxembourgish"
if project_root not in sys.path:
    sys.path.append(project_root)

# Define a function to generate the translation prompt
def generate_translation_prompt(text, language_1="Luxembourgish", language_2="English"):
    """Return the translation instruction prompt with ``text`` embedded.

    Args:
        text: Source text to place between the delimiter lines.
        language_1: Source language name used in the instruction sentence.
        language_2: Target language name used in the instruction sentence.

    Returns:
        The formatted prompt string.
    """
    prompt_template = """Please translate the following {language_1} text into {language_2}. Please answer me with only translated text!

    ---------------------------------- Text to be translated ----------------------------------

    {Text}

    ---------------------------------- Text to be translated ----------------------------------

    """

    # Placeholder name -> value; the keys double as the template's variable list.
    fields = {"language_1": language_1, "language_2": language_2, "Text": text}
    template = PromptTemplate(input_variables=list(fields), template=prompt_template)
    return template.format(**fields)

def generate_text_with_vllm(config, prompt):
    """POST ``prompt`` to the configured completion endpoint and return the generated text.

    Args:
        config: Dict providing ``model_name``, ``options`` (merged into the
            request body) and ``server_url``.
        prompt: Prompt string sent in the request body.

    Returns:
        The generated text, or ``""`` if the reply contains none.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    headers = {
        "Content-Type": "application/json",
    }

    payload = {
        "model": config["model_name"],
        "prompt": prompt,
    }
    payload.update(config["options"])

    response = requests.post(config["server_url"], headers=headers, json=payload)

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}, {response.text}")

    data = response.json()
    # An OpenAI-style completions reply carries the text in choices[0]; reading
    # only "generated_text" made every translation an empty string.
    choices = data.get("choices")
    if choices:
        first_choice = choices[0]
        if "text" in first_choice:
            return first_choice["text"]
        # Chat-style replies nest the text under message.content.
        return first_choice.get("message", {}).get("content", "")
    # Fallback for replies that use the previously expected flat field.
    return data.get("generated_text", "")


def get_latest_file(prefix):
    """Pick the most recently modified ``.csv`` in the working directory matching ``prefix``.

    Args:
        prefix: Plain filename prefix to match.

    Returns:
        The bare filename of the newest match, or ``None`` if nothing matches.
    """
    csv_matches = []
    for entry in os.listdir():
        if entry.startswith(prefix) and entry.endswith(".csv"):
            csv_matches.append(entry)
    return max(csv_matches, key=os.path.getmtime) if csv_matches else None


def load_checkpoint(latest_file, df, text_column):
    """Work out where to resume by counting the rows of the checkpoint CSV.

    Args:
        latest_file: Path of the checkpoint CSV, or a falsy value for "none".
        df: Not used by this function; kept so existing calls keep working.
        text_column: Column looked up in the checkpoint; must exist there.

    Returns:
        Row count of the checkpoint, or 0 when there is no checkpoint.
    """
    if latest_file:
        finished_texts = pd.read_csv(latest_file)[text_column]
        return int(finished_texts.size)
    return 0


def translate_batch_vllm(config, df):
    """Translate ``df`` through ``generate_text_with_vllm``, checkpointing each row to CSV.

    Side effect: adds a ``prompts_inputs`` column to ``df`` in place; that
    column is written to the output file along with ``translated_text``.

    Args:
        config: Dict providing the keys needed by ``generate_text_with_vllm``
            plus ``text_column``, ``prefix``, ``batch_size`` and optionally
            ``is_new_file``.
        df: DataFrame containing ``config["text_column"]``.

    Returns:
        None. Results are appended to the output CSV row by row.
    """
    df["prompts_inputs"] = df[config["text_column"]].apply(generate_translation_prompt)

    # Resume into the newest matching CSV unless a fresh file was requested.
    prefix = config["prefix"]
    latest_file = get_latest_file(prefix)
    resuming = bool(latest_file) and not bool(config.get("is_new_file", False))
    if resuming:
        output_file = latest_file
        start_idx = load_checkpoint(latest_file, df, config["text_column"])
    else:
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{prefix}_{current_time}.csv"
        start_idx = 0

    print("Start From Index: ", start_idx)
    texts = df["prompts_inputs"].to_list()
    batch_size = config["batch_size"]

    # The header decision depends on whether THIS run created the output file,
    # not on whether some older file with the prefix exists. Previously
    # is_new_file=True with an older file present produced a headerless CSV.
    write_header = not resuming
    for i in tqdm(range(start_idx, len(texts), batch_size), desc="Translating", unit="batch"):
        batch = texts[i:i + batch_size]
        translated_batch = [generate_text_with_vllm(config, text) for text in batch]

        for j, translation in enumerate(translated_batch):
            updated_row = df.iloc[i + j].copy()
            updated_row["translated_text"] = translation
            pd.DataFrame([updated_row]).to_csv(
                output_file,
                index=False,
                mode="w" if write_header else "a",
                header=write_header,
            )
            write_header = False

    print(f"Translation completed. Results saved to {output_file}")

# Command used to start the server this cell talks to (run it in a shell first):
# CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --model /mnt/lscratch/users/luli/model/Llama-3.2-3B-Instruct --tensor-parallel-size 4 --port 5260 --device cuda --dtype float16
# NOTE(review): CUDA_VISIBLE_DEVICES=2 exposes a single GPU while
# --tensor-parallel-size 4 asks for four — confirm the intended combination.

# Configuration read by generate_text_with_vllm() and translate_batch_vllm().
config = {
    "model_name": "/mnt/lscratch/users/luli/model/Llama-3.2-3B-Instruct",  # Change to your model's name in vllm
    "server_url": "http://localhost:5260/v1/completions",  # vllm server URL
    # Merged into the top level of the request body.
    "options": {
        "temperature": 0.1,
        "max_tokens": 512,
        "top_p": 0.9,
    },
    # Prompts within a batch are sent one request at a time.
    "batch_size": 1,
    "prefix": "translation_LLM_vllm",
    "text_column": "subsentence",
    # True: always start a new timestamped output file instead of resuming.
    "is_new_file": True
}

# Load the test sentences (path is relative to the working directory) and run.
input_file = "NC_lux_subsentences_test.csv"
dataset_df = pd.read_csv(input_file)
translate_batch_vllm(config=config, df=dataset_df)


FileNotFoundError: [Errno 2] No such file or directory: 'NC_lux_subsentences_test.csv'

In [None]:
# Smoke test for the completion server; the JSON body must be closed with a single quote.
curl http://localhost:5260/v1/completions -H "Content-Type: application/json" -d '{"model": "/mnt/lscratch/users/luli/model/Llama-3.2-3B-Instruct", "prompt": "San Francisco is a"}'