In [None]:
!pip install pandas openpyxl transformers torch
!pip install llama-cpp-python

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor

from llama_cpp import Llama

# Location of the GGUF model file. Set the LLAMA_MODEL_PATH environment
# variable to point at a different file; when it is unset the original
# hard-coded location is used, so existing setups keep working.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/pierr/Desktop/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)
# Load the model once at module level; translate_query() calls this instance.
llm = Llama(model_path=model_path)

In [5]:
def translate_query(query):
    """Translate one ESGish query into a natural-English sentence via the LLM.

    Builds a prompt made of a fixed ESGish cheat sheet, the query itself and a
    ``### Response:`` marker, sends it to the module-level ``llm`` and returns
    the text of the first completion with surrounding whitespace removed.

    Args:
        query: The ESGish query text, e.g. ``"[BoardIndependencePct] > '0.7'"``.
            ``None``, a float NaN (what pandas produces for an empty
            spreadsheet cell) or a blank string is treated as "no query".

    Returns:
        str: The stripped model response, or ``""`` when there is no query to
        translate (the model is not called in that case).
    """
    # Missing input would otherwise be formatted into the prompt as the literal
    # text "None"/"nan" and the model would invent a translation for it.
    is_nan = isinstance(query, float) and query != query  # NaN never equals itself
    is_blank = isinstance(query, str) and not query.strip()
    if query is None or is_nan or is_blank:
        return ""

    # ESGish cheat sheet to guide the LLM
    cheat_sheet = """
    ### ESGish Query Language – Cheat Sheet

    1. Query Structure:
    - A query can be a single condition or a group of conditions using AND, OR, NOT.

    2. Filters:
    - Format: [FieldName] Operator 'Value'
    - Operators: =, >, <, >=, <=, !=
    - Example: [BoardIndependencePct] > '0.7'

    3. Functions:
    - RATIO(F1, F2): ratio of F1 to F2
    - SUM(F1, F2, ...): total of multiple fields
    - CASE_COUNT(Query): number of cases matching a query

    4. Logical Operators:
    - AND(...), OR(...), NOT(...)
    - Used to combine multiple conditions

    5. Syntax Notes:
    - Field names use square brackets []
    - Values are quoted: 'Yes', '0.5', 'High'
    - Fields can be numeric, boolean, date, or string

    6. Your Task:
    - Rephrase ESGish queries as natural English statements
    - Use lead-ins like: "Find companies where...", "Show issuers with...", etc.
    - Output should be clear, concise, and human-friendly.
    """
    # Same layout as before (cheat sheet, indented blank line, query, indented
    # blank line, response marker); written with explicit escapes so the
    # whitespace-only lines cannot be lost to editor trimming.
    prompt = f"{cheat_sheet}\n    \n    {query}\n    \n    ### Response:"

    # Low temperature keeps the rewording close to deterministic; max_tokens
    # caps the length of the generated sentence.
    response = llm(prompt, max_tokens=575, temperature=0.1)
    return response["choices"][0]["text"].strip()

# Sample ESGish query (a single filter condition).
# NOTE(review): nothing in the visible cells passes this to translate_query — confirm it is still needed.
query = "[CEOSpecialGrantPct] > '0.5'"

# NOTE: earlier prompt variant, kept commented out for reference only (not executed).
#     prompt = f"""### Instruction:
#     Rephrase the following ESGish query into a concise natural English sentence. These queries return a set of issuers from a larger dataset. With this in mind can you also generate a variety of natural language openers prior to the associated prompts to form the full English sentence.   
    
#     {query}
    
#     ### Response:"""
#     # send the prompt to the llm with specified parameters
#     response = llm(prompt, max_tokens=575, temperature=0.1)
#     return response["choices"][0]["text"].strip()
# query = "[CEOSpecialGrantPct] > '0.5'"

In [None]:
import pandas as pd
from functools import partial
import time # used to pause execution between batches

# Read the workbook and pull the "Esgish" column into a plain list of queries.
df = pd.read_excel("tester.xlsx")
queries = df["Esgish"].tolist()

# Number of queries translated between progress reports.
batch_size = 100
# Translations accumulate here in the same order as `queries`.
translated_queries = []

# Work through the queries one batch at a time.
for i in range(0, len(queries), batch_size):
    batch = queries[i:i + batch_size]  # current slice; the last one may be shorter
    # Translation is sequential: one translate_query call per query.
    batch_translations = [translate_query(q) for q in batch]
    translated_queries.extend(batch_translations)

    # Exactly one progress line per batch.
    print(f"Processed {i + len(batch)} / {len(queries)} queries")

    # Brief pause between batches.
    # NOTE(review): the model is loaded in-process (no remote service is
    # called), so this pause may be unnecessary — confirm before removing.
    time.sleep(1)

# Attach the translations as a new column and write the result to a new workbook.
df["English"] = translated_queries
df.to_excel("translated_queries.xlsx", index=False)