In [None]:
!pip install pandas openpyxl transformers torch
!pip install llama-cpp-python

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor

from llama_cpp import Llama

# Location of the GGUF model file. Set the ESGISH_MODEL_PATH environment
# variable to point at a different copy; the original hard-coded location is
# kept as the default so existing setups keep working unchanged.
model_path = os.environ.get(
    "ESGISH_MODEL_PATH",
    "/Users/pierr/Desktop/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)
# Load the model once; translate_query sends every prompt to this instance.
# NOTE(review): no n_ctx is passed, so the library's default context size
# applies - confirm it is large enough for the cheat-sheet prompt plus the
# max_tokens=575 requested in translate_query.
llm = Llama(model_path=model_path)

In [5]:
def translate_query(query):
    """Rephrase one ESGish query as a natural-English sentence using the LLM.

    Builds a prompt made of the ESGish cheat sheet, the query itself and a
    "### Response:" marker, sends it to the module-level ``llm`` and returns
    the generated text.

    Args:
        query: The ESGish query, e.g. "[CEOSpecialGrantPct] > '0.5'".
            ``None``, a float NaN (what an empty spreadsheet cell typically
            becomes when loaded with pandas) and empty or whitespace-only
            strings are treated as "nothing to translate".

    Returns:
        The first completion's text with surrounding whitespace stripped, or
        "" when ``query`` is blank (the model is not called in that case).
    """
    # Blank input would otherwise be interpolated into the prompt as "None" /
    # "nan" / nothing, making the model invent a translation for a
    # non-existent query. NaN is the only float that is not equal to itself.
    if query is None or (isinstance(query, float) and query != query):
        return ""
    if isinstance(query, str) and not query.strip():
        return ""

    # ESGish cheat sheet to guide the LLM
    cheat_sheet = """
    ### ESGish Query Language – Cheat Sheet

    1. Query Structure:
    - A query can be a single condition or a group of conditions using AND, OR, NOT.

    2. Filters:
    - Format: [FieldName] Operator 'Value'
    - Operators: =, >, <, >=, <=, !=
    - Example: [BoardIndependencePct] > '0.7'

    3. Functions:
    - RATIO(F1, F2): ratio of F1 to F2
    - SUM(F1, F2, ...): total of multiple fields
    - CASE_COUNT(Query): number of cases matching a query

    4. Logical Operators:
    - AND(...), OR(...), NOT(...)
    - Used to combine multiple conditions

    5. Syntax Notes:
    - Field names use square brackets []
    - Values are quoted: 'Yes', '0.5', 'High'
    - Fields can be numeric, boolean, date, or string

    6. Your Task:
    - Rephrase ESGish queries as natural English statements
    - Use lead-ins like: "Find companies where...", "Show issuers with...", etc.
    - Output should be clear, concise, and human-friendly.
    """
    # Same prompt text as before, spelled with explicit escapes so the
    # whitespace-only separator lines do not depend on trailing spaces in the
    # source file.
    prompt = f"{cheat_sheet}\n    \n    {query}\n    \n    ### Response:"

    # Low temperature keeps the rephrasing close to deterministic;
    # max_tokens caps the length of the generated sentence.
    response = llm(prompt, max_tokens=575, temperature=0.1)
    return response["choices"][0]["text"].strip()

# Sample ESGish query, handy for trying translate_query by hand.
# Nothing in this cell passes it to translate_query, and the batch loop in the
# export cell rebinds the name `query` for every row it processes.
query = "[CEOSpecialGrantPct] > '0.5'"

# --- Earlier prompt variant, kept for reference only (commented out, never executed) ---
#     prompt = f"""### Instruction:
#     Rephrase the following ESGish query into a concise natural English sentence. These queries return a set of issuers from a larger dataset. With this in mind can you also generate a variety of natural language openers prior to the associated prompts to form the full English sentence.
#
#     {query}
#
#     ### Response:"""
#     # send the prompt to the llm with specified parameters
#     response = llm(prompt, max_tokens=575, temperature=0.1)
#     return response["choices"][0]["text"].strip()
# query = "[CEOSpecialGrantPct] > '0.5'"

In [None]:
import pandas as pd
from functools import partial
import time

# Load the ESGish queries that need translating
df = pd.read_excel("esgish_short.xlsx")
queries = df["Esgish"].tolist()
# Number of queries translated between two saves
batch_size = 100
output_file = "translated_queries_TEST.xlsx"

# Walk the queries one batch at a time: translate the batch, then write it out
# straight away so completed batches are already on disk if a later one fails.
for batch_number, i in enumerate(range(0, len(queries), batch_size), start=1):
    batch = queries[i:i + batch_size]
    translated_batch = [translate_query(query) for query in batch]
    df_batch = pd.DataFrame({"Esgish": batch, "English": translated_batch})

    if i > 0:
        # Later batches: reopen the workbook and write below the rows saved so
        # far (row 0 holds the header, hence the +1 offset).
        with pd.ExcelWriter(output_file, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
            df_batch.to_excel(writer, index=False, header=False, startrow=i + 1)
    else:
        # First batch: write the output file from scratch, header included.
        df_batch.to_excel(output_file, index=False)

    print(f"Saved batch {batch_number} to {output_file}")
    # Short pause before starting the next batch
    time.sleep(1)