In [None]:
!pip install pandas openpyxl transformers torch
!pip install llama-cpp-python

In [None]:
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor

import os

# Path to the GGUF model file (per its filename, a Q4_K_M quantization of
# Mistral-7B-Instruct v0.1). Set the MISTRAL_MODEL_PATH environment variable to
# point at a copy elsewhere; the original location is kept as the default so
# existing setups keep working unchanged.
model_path = os.environ.get(
    "MISTRAL_MODEL_PATH",
    "/Users/briannapatten/Desktop/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)

# Load the model once; translate_query() reuses this module-level instance.
llm = Llama(model_path=model_path)

In [2]:
import random

# Prompt templates, one of which is picked at random per query so the English
# conversions vary in phrasing. These are plain strings (NOT f-strings): the
# "{query}" placeholder is filled in exactly once inside translate_query().
_PROMPT_TEMPLATES = [
    "### Instruction:\nRephrase the following ESGish query into a natural English sentence that begins with 'Return all occurrences of'.\n\n{query}\n\n### Response:",
    "### Instruction:\nRephrase the following ESGish query into a natural English sentence that begins with 'Find where'.\n\n{query}\n\n### Response:",
    "### Instruction:\nRephrase the following ESGish query into a natural English sentence that begins with 'Return issuers that'.\n\n{query}\n\n### Response:",
    "### Instruction:\nRephrase the following ESGish query into a natural English sentence.\n\n{query}\n\n### Response:",
    "### Instruction:\nRephrase the following ESGish query into a natural English sentence that begins with 'Return'.\n\n{query}\n\n### Response:",
]


def translate_query(query):
    """Ask the loaded model to rephrase one ESGish query as an English sentence.

    A prompt template is chosen at random, the query is substituted into it,
    and the prompt is sent to the module-level ``llm`` callable.

    Args:
        query: The ESGish query to translate. It is inserted into the prompt
            verbatim, so braces and other format-like characters are safe.

    Returns:
        The stripped text of the first completion choice, or the sentinel
        string ``"[TRANSLATION_FAILED]"`` if anything goes wrong (the error is
        printed rather than raised so a long batch run is not aborted).
    """
    try:
        template = random.choice(_PROMPT_TEMPLATES)
        # Substitute exactly once. Previously the query was interpolated via an
        # f-string and the result was passed through .format() again, so any
        # "{" or "}" in the query was re-parsed as a format field: "{x}" raised
        # KeyError (reported as a failed translation) and "{{" collapsed to "{".
        prompt = template.format(query=query)
        response = llm(prompt, max_tokens=575, temperature=0.1)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        # Deliberate best-effort: report and return a sentinel the caller can
        # spot in the output sheet.
        print(f"Error translating query: '{query}'\nError: {str(e)}")
        return "[TRANSLATION_FAILED]"

In [None]:
import pandas as pd
from functools import partial
import time

# Load the input workbook and pull the "Esgish" column out as a plain list.
df = pd.read_excel("esgish_short.xlsx")
queries = df["Esgish"].tolist()

# Queries are translated and flushed to disk in chunks of this size, so a crash
# part-way through leaves every completed chunk already saved.
batch_size = 100
output_file = "translated_queries_TEST.xlsx"

for offset in range(0, len(queries), batch_size):
    chunk = queries[offset:offset + batch_size]

    # Translate each query in the chunk, keeping input order.
    english = [translate_query(item) for item in chunk]
    chunk_frame = pd.DataFrame({"Esgish": chunk, "English": english})

    if offset != 0:
        # Later chunks are overlaid onto the existing sheet without a header.
        # The +1 skips the header row that the first chunk wrote at row 0.
        with pd.ExcelWriter(output_file, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
            chunk_frame.to_excel(writer, index=False, header=False, startrow=offset + 1)
    else:
        # First chunk: create (or overwrite) the workbook, header included.
        chunk_frame.to_excel(output_file, index=False)

    print(f"Saved batch {offset // batch_size + 1} to {output_file}")
    # Pause between chunks to avoid overwhelming the system.
    time.sleep(10)