In [None]:
!pip install pandas openpyxl transformers torch
!pip install llama-cpp-python

In [None]:
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor

import os

# Location of the GGUF model file. Set the ESGISH_MODEL_PATH environment
# variable to point at the model on another machine; when it is unset, the
# original hard-coded path is used.
model_path = os.environ.get(
    "ESGISH_MODEL_PATH",
    "/Users/pierr/Desktop/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)

# Load the model once; every translate_query call reuses this instance.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # context window size, in tokens
)

In [None]:
def translate_query(query, max_tokens=575, temperature=0.1):
    """Translate one ESGish query into a plain-English statement.

    Builds a prompt made of the ESGish cheat sheet (syntax summary plus the
    task instructions), the query itself and a "### Response:" marker, sends
    it to the module-level ``llm`` and returns the generated text.

    Args:
        query: The ESGish query to translate. ``None``, NaN (e.g. an empty
            spreadsheet cell) and blank strings are treated as "no query".
        max_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature passed to the model.

    Returns:
        The model's translation with surrounding whitespace stripped, or an
        empty string when there is no query to translate.
    """
    # Skip the model call entirely when there is nothing to translate.
    # `query != query` is only true for NaN.
    if query is None or (isinstance(query, float) and query != query):
        return ""
    if not str(query).strip():
        return ""

    # ESGish cheat sheet for llm
    cheat_sheet = """
    ### ESGish Query Language – Cheat Sheet

    1. Query Structure:
    - A query can be a single condition or a group of conditions using AND, OR, NOT.

    2. Filters:
    - Format: [FieldName] Operator 'Value'
    - Operators: =, >, <, >=, <=, !=
    - Example: [BoardIndependencePct] > '0.7'

    3. Functions:
    - RATIO(F1, F2): ratio of F1 to F2
    - SUM(F1, F2, ...): total of multiple fields
    - CASE_COUNT(Query): number of cases matching a query

    4. Logical Operators:
    - AND(...), OR(...), NOT(...)
    - Used to combine multiple conditions

    5. Syntax Notes:
    - Field names use square brackets []
    - Values are quoted: 'Yes', '0.5', 'High'
    - Fields can be numeric, boolean, date, or string

    6. Your Task:
    - Rephrase the ESGish queries provided as natural English statements
    - Use lead-ins like: "Find companies where...", "Show issuers with...", etc.
    - Output should be clear, concise, and human-friendly
    - Give only one translation per query
    """
    # NOTE(review): the prompt ends with an Alpaca-style "### Response:" marker;
    # confirm this is the intended template for the model that is loaded.
    prompt = f"""{cheat_sheet}
    
    {query}
    
    ### Response:"""

    # Send the prompt to the llm and keep only the generated text.
    response = llm(prompt, max_tokens=max_tokens, temperature=temperature)
    return response["choices"][0]["text"].strip()


In [8]:
# Report the context window size of the loaded model.
print(f"Max context window: {llm.context_params.n_ctx}")

Max context window: 2048


In [None]:
import pandas as pd
import os
from tqdm import tqdm
from openpyxl import load_workbook
from functools import partial
import time

# Read the ESGish queries to translate from the input workbook.
df = pd.read_excel("tester.xlsx")
queries = df["Esgish"].tolist()
# Queries are translated and saved one batch at a time, so finished batches are
# already on disk if a later batch fails.
batch_size = 100
output_file = "translated_queries_TEST.xlsx"
start = time.time()

# Translate each batch with translate_query and append the result to the output file.
for i in range(0, len(queries), batch_size):
    batch = queries[i:i + batch_size]
    translated_batch = []

    for query in tqdm(batch, desc=f"Translating batch {i // batch_size + 1}"):
        # Empty cells are read as NaN; give them an empty translation instead
        # of asking the model to translate the text "nan". The row is kept so
        # the output stays aligned with the input.
        translated = translate_query(query) if pd.notna(query) else ""
        translated_batch.append(translated)

    df_batch = pd.DataFrame({
        "Esgish": batch,
        "English": translated_batch
    })

    if i == 0 or not os.path.exists(output_file):
        # First batch (or missing file): start a fresh file, including the header row.
        df_batch.to_excel(output_file, index=False)
    else:
        with pd.ExcelWriter(output_file, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
            # The writer has already loaded the workbook in append mode, so read
            # the row count from it rather than parsing the file a second time.
            # max_row is the 1-based index of the last used row, and startrow is
            # 0-based, so max_row is exactly the first free row.
            existing_rows = writer.book.active.max_row
            df_batch.to_excel(writer, index=False, header=False, startrow=existing_rows)

    print(f"Saved batch {i // batch_size + 1} to {output_file}")
    # Short pause between batches.
    # NOTE(review): the model is a local Llama instance, not a remote API —
    # confirm whether this delay is still needed.
    time.sleep(1)

end = time.time()