In [None]:
%pip install pandas openpyxl transformers torch
%pip install llama-cpp-python

In [None]:
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor

import os

# Location of the quantized GGUF weights. The original absolute path only exists
# on one machine, so it is kept purely as a fallback: set LLAMA_MODEL_PATH to
# point the notebook at the model file on any other machine.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/solar/OneDrive/Documents/Capstone/Capstone-Jupyter/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)
# Context window size (in tokens) requested from the model; max_afforded_tokens
# also reads this value when splitting the window between metadata codes.
max_context = 4096
llm = Llama(
    model_path,
    n_ctx=max_context
)

In [None]:
import json
import re

# Read the raw metadata records (a list of dicts with "code", "name" and
# "description" keys, as indexed below) from disk.
with open("metadata.json", "r", encoding="utf-8") as f:
    raw_metadata = json.loads(f.read())

# Lookup table: code -> "<name>. <description>"
metadata_lookup = dict(
    (entry["code"], f'{entry["name"]}. {entry["description"]}')
    for entry in raw_metadata
)

def extract_codes(query):
    """Return the codes written in square brackets inside ``query``.

    A code is a run of letters, digits or underscores enclosed in ``[...]``.
    Codes are returned without the brackets, in order of appearance, and
    repeated codes are returned once per occurrence.
    """
    bracketed_code = re.compile(r"\[([A-Za-z0-9_]+)\]")
    return [match.group(1) for match in bracketed_code.finditer(query)]

# Find the entry with the longest name + description combo.
# default=None keeps an empty metadata file from raising ValueError here.
longest_entry = max(metadata_lookup.items(), key=lambda item: len(item[1]), default=None)

# Print the result (length is measured in characters, not model tokens)
if longest_entry is None:
    print("metadata_lookup is empty; no entries to measure")
else:
    print(f'Longest entry code: {longest_entry[0]}')
    print(f'Length: {len(longest_entry[1])}')
    print(f'Content: {longest_entry[1]}')


In [10]:
"""
def translate_query(query):
    prompt = f"### Instruction:\nRephrase this ESGish query as a natural English sentence. Each query is asking for all companies or issuers that match some paramater.\n\n{query}\n\n### Response:"

    # Use llama_cpp tokenizer to get exact number of tokens in prompt
    prompt_tokens = llm.tokenize(prompt.encode("utf-8"))
    num_prompt_tokens = len(prompt_tokens)

    max_total_tokens = 512
    max_response_tokens = max_total_tokens - num_prompt_tokens

    if max_response_tokens < 1:
        print(f"Skipping query — prompt too long: {query}")
        return "[Prompt too long]"

    response = llm(prompt, max_tokens=max_response_tokens, temperature=0.1)
    return response["choices"][0]["text"].strip()
"""
"""
def translate_query(query):
    codes = extract_codes(query)
    context = "\n".join(
        f"{code}: {metadata_lookup.get(code, 'No metadata found.')}"
        for code in codes
    )
    
    prompt = f"### Instruction:
Rephrase the following ESGish query into a concise natural English sentence. Each query is asking for all companies or issuers that match some paramater. Use the following metadata definitions for clarity:

{context}

Query: {query}

### Response:"

    response = llm(prompt, max_tokens=2048, temperature=0.1)
    return response["choices"][0]["text"].strip()
"""
def max_tokens(text, max_tokens=40):
    """Cap ``text`` at ``max_tokens`` model tokens, marking any cut with "...".

    Args:
        text: The string to limit.
        max_tokens: Largest number of tokens (as counted by ``llm.tokenize``)
            to keep. Inside this body the name refers to this argument, not
            to the function itself; callers pass it by keyword.

    Returns:
        ``text`` unchanged when it already fits; otherwise the first
        ``max_tokens`` tokens decoded back to a string with "..." appended.
    """
    token_ids = llm.tokenize(text.encode("utf-8"))
    if len(token_ids) > max_tokens:
        kept_bytes = llm.detokenize(token_ids[:max_tokens])
        # errors="ignore" drops any partial multi-byte character left at the cut.
        return kept_bytes.decode("utf-8", errors="ignore") + "..."
    return text

def max_afforded_tokens(codes):
    """Return how many tokens each code's metadata text may use.

    The model context window (``max_context``) is split evenly between the
    codes, but no code is ever given fewer than 100 tokens.

    Args:
        codes: The codes extracted from one query. May be empty.

    Returns:
        The per-code token allowance as an int (always >= 100).
    """
    # A query without any bracketed code used to divide by zero here; treat it
    # like a single code so the whole window is "afforded".
    count = max(len(codes), 1)
    return max(max_context // count, 100)


def _context_block(code, description_limit):
    """Format one ``CODE: definition`` prompt line and measure its token cost.

    The definition comes from ``metadata_lookup`` (or a "No metadata found."
    placeholder) and is capped at ``description_limit`` tokens.

    Returns:
        A ``(block, token_cost)`` pair; the cost includes one extra token for
        the newline that follows the block in the prompt.
    """
    definition = max_tokens(
        metadata_lookup.get(code, 'No metadata found.'),
        max_tokens=description_limit,
    )
    block = f"{code}: {definition}"
    return block, len(llm.tokenize(block.encode("utf-8"))) + 1  # +1 for newline


def translate_query(query, max_total_tokens=2048, max_output_tokens=256):
    """Rephrase one ESGish query as an English sentence using the local model.

    The prompt is: a fixed instruction, one ``CODE: definition`` line for each
    distinct bracketed code in ``query``, then the query itself. Definition
    lines are added in order until the token budget is used up; the line that
    would overflow is shortened to fit the remaining room when possible, and
    any codes after it are left out.

    Args:
        query: ESGish query text; codes are written as ``[CODE]``.
        max_total_tokens: Ceiling for prompt tokens plus reserved output tokens.
        max_output_tokens: Tokens reserved for, and passed as the cap on, the
            model's completion.

    Returns:
        The completion text with surrounding whitespace stripped.
    """
    # dict.fromkeys removes repeats while keeping first-seen order, so a code
    # used several times in the query only spends context budget once.
    codes = list(dict.fromkeys(extract_codes(query)))

    # Initial prompt pieces
    # NOTE(review): "paramater" is misspelled in the prompt text; left untouched
    # here so model output stays comparable with earlier runs.
    instruction = "### Instruction:\nRephrase the following ESGish query into a concise natural English sentence. Each query is asking for all companies or issuers that match some paramater. Use the following metadata definitions for clarity:\n\n"
    query_part = f"\n\nQuery: {query}\n\n### Response:"

    # Tokens left for metadata once the fixed parts and the reply are reserved.
    tokenizer = llm.tokenize  # Built-in tokenizer
    instruction_tokens = len(tokenizer(instruction.encode("utf-8")))
    query_tokens = len(tokenizer(query_part.encode("utf-8")))
    remaining = max_total_tokens - max_output_tokens - instruction_tokens - query_tokens

    # Queries without any code skip the per-code split (it would divide by zero).
    m_tokens = max_afforded_tokens(codes) if codes else 0

    context = ""
    for code in codes:
        block, block_tokens = _context_block(code, m_tokens)
        if block_tokens > remaining:
            # Out of room. Rather than dropping this definition outright, try
            # once more with it cut down to what is left (minus the tokens the
            # "CODE: " label and "..." marker need), then stop either way.
            label_tokens = len(tokenizer(f"{code}: ...".encode("utf-8"))) + 1
            shrunk_limit = remaining - label_tokens
            if shrunk_limit >= 1:
                block, block_tokens = _context_block(code, shrunk_limit)
                if block_tokens <= remaining:
                    context += block + "\n"
            break
        context += block + "\n"
        remaining -= block_tokens

    # Final prompt
    prompt = instruction + context + query_part

    # Call model
    response = llm(prompt, max_tokens=max_output_tokens, temperature=0.1)
    return response["choices"][0]["text"].strip()



In [None]:
import pandas as pd
from functools import partial
import time

# Reads and stores the Esgish queries.
df = pd.read_excel("esgish_short.xlsx")
# Empty cells are loaded as NaN (a float), which would make the regex in
# extract_codes raise TypeError part-way through the run, so drop them and
# force everything else to str before translating.
queries = df["Esgish"].dropna().astype(str).tolist()
# Number of queries translated between saves to disk.
batch_size = 100
output_file = "translated_queries_TEST.xlsx"

# Translates the queries batch by batch and saves after every batch, so a crash
# loses at most the batch that was in progress.
for i in range(0, len(queries), batch_size):
    batch = queries[i:i + batch_size]
    translated_batch = [translate_query(query) for query in batch]

    df_batch = pd.DataFrame({
        "Esgish": batch,
        "English": translated_batch
    })

    # The first batch creates (or overwrites) the file together with the header
    # row; later batches are written below the rows that are already there.
    if i == 0:
        df_batch.to_excel(output_file, index=False)
    else:
        with pd.ExcelWriter(output_file, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
            # Row 0 holds the header, so the batch starting at query i begins at row i + 1.
            df_batch.to_excel(writer, index=False, header=False, startrow=i + 1)

    print(f"Saved batch {i // batch_size + 1} to {output_file}")