In [None]:
%pip install pandas openpyxl transformers torch
%pip install llama-cpp-python

In [None]:
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor

# Path to the GGUF model file; replace the placeholder directory before running.
model_path = "ENTER_YOUR_PATH_HERE/mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# Context window size (in tokens) requested when the model is loaded.
max_context = 4096

# Single shared model instance used by every later cell.
llm = Llama(model_path=model_path, n_ctx=max_context)

In [None]:
import json
import re

# Read the metadata records from disk.
with open("metadata.json", "r", encoding="utf-8") as metadata_file:
    raw_metadata = json.load(metadata_file)

# Map each record's code to the text "<name>. <description>".
metadata_lookup = dict(
    (record["code"], f'{record["name"]}. {record["description"]}')
    for record in raw_metadata
)

def extract_codes(query):
    """Return every bracketed code found in ``query``, in order of appearance.

    A code is a non-empty run of ASCII letters, digits, or underscores wrapped
    in square brackets (for example ``[ABC_1]``). Repeated codes are kept.
    """
    code_pattern = re.compile(r"\[([A-Za-z0-9_]+)\]")
    return [hit.group(1) for hit in code_pattern.finditer(query)]

# Operators treated as list-type comparisons (the same three words appear in
# the list pattern used by extract_nulls_and_lists).
list_keywords = {"IN", "ANY", "NONE"}

# Human-readable definition for each null-type marker.
null_keywords = {
    ":NC": 'Null type "Not collected"',
    ":NA": 'Null type "Not applicable"',
    ":ND": 'Null type "Not disclosed"',
    ":NI": 'Null type "No information"',
    ":NM": 'Null type "Not meaningful"',
}
def extract_nulls_and_lists(query):
    """Find null-type checks and list-type comparisons in ``query``.

    Returns:
        A ``(nulls, lists)`` pair:

        * ``nulls`` - ``(code, null_marker)`` tuples for each
          ``[CODE] IS :XX`` occurrence (``IS`` and the two-letter marker are
          matched case-sensitively).
        * ``lists`` - ``(code, keyword)`` tuples for each ``[CODE]`` followed
          by ``IN``, ``ANY`` or ``NONE`` (matched case-insensitively); the
          keyword is returned upper-cased.
    """
    null_pattern = r"\[([A-Za-z0-9_]+)\]\s+IS\s+(:[A-Z]{2})"
    list_pattern = r"\[([A-Za-z0-9_]+)\]\s+(IN|ANY|NONE)\b"

    nulls = [(code, marker) for code, marker in re.findall(null_pattern, query)]
    lists = [
        (code, keyword.upper())
        for code, keyword in re.findall(list_pattern, query, re.IGNORECASE)
    ]
    return nulls, lists

# Locate the lookup entry whose combined "name. description" text is longest.
longest_entry = max(metadata_lookup.items(), key=lambda pair: len(pair[1]))
longest_code, longest_text = longest_entry

# Report which code it is, how long its text is, and the text itself.
print(f'Longest entry code: {longest_code}')
print(f'Length: {len(longest_text)}')
print(f'Content: {longest_text}')

In [None]:
def max_tokens(text, max_tokens=40):
    """Clip ``text`` to at most ``max_tokens`` tokens of the loaded model's tokenizer.

    Returns ``text`` unchanged when its token count does not exceed the limit.
    Otherwise the first ``max_tokens`` tokens are detokenized, decoded as UTF-8
    (undecodable bytes are dropped) and suffixed with ``"..."``.
    """
    token_ids = llm.tokenize(text.encode("utf-8"))
    if len(token_ids) > max_tokens:
        clipped_bytes = llm.detokenize(token_ids[:max_tokens])
        return clipped_bytes.decode("utf-8", errors="ignore") + "..."
    return text

def build_ordered_context(query, token_budget):
    """Assemble the metadata context for ``query`` within ``token_budget`` tokens.

    Codes are processed in the order they first appear in the query. For each
    distinct code the following lines are emitted, in this order:

    1. a null-type definition, when the query contains ``[CODE] IS :XX``;
    2. the code's metadata text (truncated to 100 tokens);
    3. an "(enumeration)" line repeating that metadata, when the code is used
       with ``IN``/``ANY``/``NONE``.

    Building stops at the first line that would push the running token count
    past ``token_budget``; lines accepted before that point are kept.

    Returns:
        The accepted lines joined with newlines (an empty string if none fit).
    """
    null_hits, list_hits = extract_nulls_and_lists(query)

    lines = []
    visited = set()
    spent = 0

    def try_append(line):
        """Add ``line`` if it fits in the remaining budget; return whether it did."""
        nonlocal spent
        cost = len(llm.tokenize(line.encode("utf-8")))
        if spent + cost > token_budget:
            return False
        lines.append(line)
        spent += cost
        return True

    for code in re.findall(r"\[([A-Za-z0-9_]+)\]", query):
        # Each code contributes context only once, at its first occurrence.
        if code in visited:
            continue
        visited.add(code)

        # First null marker attached to this code, if any.
        null_keyword = next(
            (marker for hit_code, marker in null_hits if hit_code == code), None
        )
        if null_keyword:
            definition = null_keywords.get(null_keyword, f"No definition for {null_keyword}")
            if not try_append(f"{code} = {null_keyword} → {definition}"):
                break

        base_meta = metadata_lookup.get(code, "No metadata found.")
        if not try_append(f"{code}: {max_tokens(base_meta, 100)}"):
            break

        uses_list_operator = any(hit_code == code for hit_code, _ in list_hits)
        if uses_list_operator:
            if not try_append(f"{code} (enumeration): {max_tokens(base_meta, 100)}"):
                break

    return "\n".join(lines)

def max_afforded_tokens(codes, context_size=4096, min_tokens=100):
    """Return the per-code token allowance for a query.

    The context window is split evenly across the codes, but the result never
    drops below ``min_tokens``.

    Args:
        codes: Sequence of codes found in the query (only its length is used;
            an empty sequence is treated as a single code).
        context_size: Total number of tokens to divide. Defaults to 4096, the
            value that was previously hard-coded.
        min_tokens: Lower bound on the returned allowance. Defaults to 100,
            the value that was previously hard-coded.

    Returns:
        ``max(context_size // max(1, len(codes)), min_tokens)``.
    """
    # max(1, ...) guards against division by zero when no codes were found.
    return max(context_size // max(1, len(codes)), min_tokens)


def translate_query(query, max_total_tokens=2048, max_output_tokens=256):
    """Translate one ESGish query into an English sentence with the loaded model.

    The prompt is a fixed instruction, followed by metadata context for the
    codes in the query, followed by the query itself. The context comes from
    ``build_ordered_context`` and is limited to the tokens that remain after
    reserving room for the instruction, the query and the model's answer.

    Args:
        query: The ESGish query string.
        max_total_tokens: Overall token allowance for prompt plus completion.
        max_output_tokens: Tokens reserved for the completion; also passed to
            the model as its completion cap.

    Returns:
        The model's completion text with surrounding whitespace stripped.

    Raises:
        TypeError: If ``query`` is not a string (for example a NaN read from a
            blank spreadsheet cell).
    """
    # Fail fast with a clear message; without this check a non-string query
    # only fails later inside a regex call with an opaque TypeError.
    if not isinstance(query, str):
        raise TypeError(f"query must be a string, got {type(query).__name__}")

    # Fixed prompt pieces that surround the metadata context.
    instruction = "### Instruction:\nRephrase the following ESGish query into a concise natural English sentence. Each query is asking for all companies or issuers that match some paramater. Use the following metadata definitions for clarity:\n\n"
    query_part = f"\n\nQuery: {query}\n\n### Response:"

    # Whatever is left after the fixed pieces and the reserved output is the
    # budget available for metadata context.
    instruction_tokens = len(llm.tokenize(instruction.encode("utf-8")))
    query_tokens = len(llm.tokenize(query_part.encode("utf-8")))
    token_budget = max_total_tokens - max_output_tokens - instruction_tokens - query_tokens

    context = build_ordered_context(query, token_budget)
    # Progress/debug output: show the query and the context chosen for it.
    print("query: ", query)
    print(context)

    prompt = instruction + context + query_part

    response = llm(prompt, max_tokens=max_output_tokens, temperature=0.1)
    return response["choices"][0]["text"].strip()




In [None]:
import pandas as pd
from functools import partial
import time

# Load the ESGish queries that need translating.
df = pd.read_excel("Esgish2.xlsx")  ## Input File
queries = df["Esgish"].tolist()

# Work in fixed-size batches and save after each one, so translations that
# are already finished survive a crash later in the run.
batch_size = 100
output_file = "Translated_Esgish2.xlsx"  ## Output File

for batch_number, i in enumerate(range(0, len(queries), batch_size), start=1):
    batch = queries[i:i + batch_size]
    translated_batch = [translate_query(query) for query in batch]

    df_batch = pd.DataFrame({"Esgish": batch, "English": translated_batch})

    if i > 0:
        # Later batches are overlaid onto the existing sheet below the rows
        # already written; the +1 skips the header row from the first batch.
        with pd.ExcelWriter(output_file, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
            df_batch.to_excel(writer, index=False, header=False, startrow=i + 1)
    else:
        # The first batch (re)creates the workbook, header row included.
        df_batch.to_excel(output_file, index=False)

    print(f"Saved batch {batch_number} to {output_file}")

In [None]:
import pandas as pd

# Ratings are written back into the same workbook the translations are read from.
input_file = output_file = "Translated_Esgish2.xlsx"

# Pull the translated sentences out of the "English" column.
df = pd.read_excel(input_file)
english_queries = df["English"].tolist()

# Function to request a comprehensibility rating from the model
def rate_comprehensibility(text, max_tokens=256):
    """Ask the model to rate ``text`` for comprehensibility on a 1-10 scale.

    Args:
        text: The text to be rated; it is interpolated into the prompt as-is.
        max_tokens: Completion cap passed to the model.

    Returns:
        The rating as an ``int`` between 1 and 10 inclusive, or ``None`` when
        the model's reply contains no integer in that range.
    """
    import re  # local import so this cell does not rely on an earlier cell's imports

    # Adjust the prompt to ask the model for a rating from 1 to 10
    prompt = f"""### Instruction:
Please rate the following text's comprehensibility from 1 to 10, where:
- 1 = Completely incomprehensible, nonsensical, or full of errors.
- 5 = Understandable with effort; some awkwardness, complexity, or minor errors.
- 10 = Perfectly clear, natural, and easy to understand.

Here are some examples:
Text: "asjdk asjd aksd" → Rating: 1
Text: "Provide list companies ESG data incomplete understandable" → Rating: 4
Text: "Please provide a list of companies with complete ESG data." → Rating: 9

Now, rate this text:
Text: {text}

### Response:"""
    # Call model (adjust temperature and other params as needed)
    response = llm(prompt, max_tokens=max_tokens, temperature=0.2)
    reply = response["choices"][0]["text"].strip()
    print(reply)

    # The few-shot examples above use the form "Rating: N", so the reply may
    # not be a bare number. Take the first integer in the reply instead of
    # requiring the whole reply to parse as one. The optional minus sign keeps
    # negative numbers out of range rather than reading "-3" as 3.
    found = re.search(r"-?\d+", reply)
    if found is None:
        return None  # Return None if no valid rating is obtained

    rating = int(found.group())
    if 1 <= rating <= 10:
        return rating
    return None  # An integer was present but lies outside the 1-10 scale

# Rate every English translation, keeping the results in row order.
ratings = [rate_comprehensibility(english_text) for english_text in english_queries]

# Attach the ratings to the dataframe as a new column.
df["Comprehensibility Rating"] = ratings

# Write the dataframe, now including the ratings, to the output workbook.
df.to_excel(output_file, index=False)

print("Comprehensibility ratings added and saved to 'Translated_Esgish2.xlsx'")
