In [1]:
import json
import pandas as pd
import time
from tqdm import tqdm
import boto3

# Bedrock runtime client shared by every model invocation in this notebook.
bedrock = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
)

In [2]:
# Year group handled by this run. It drives BOTH the input workbook and the
# output workbook, so the two cannot drift apart when the notebook is re-run
# for another period.
YEAR_RANGE = "2023_2024"
BASE_DIR = "D:/GS/final_scoping_review"

# Destination workbook for the term mapping produced further down.
output = f"{BASE_DIR}/data_extractions/stage3_health_theme/{YEAR_RANGE}.xlsx"

# Term catalogue and the documents to classify.
df_terms = pd.read_excel(f"{BASE_DIR}/shortened_sub_health_themes.xlsx")
df_docs = pd.read_excel(f"{BASE_DIR}/final_excels/yearly_grouped_excels/{YEAR_RANGE}.xlsx")

# Fail fast on a wrong workbook layout instead of part-way through a long,
# billed run of model calls.
assert {"Term", "Shortened_Definition"} <= set(df_terms.columns), \
    "terms workbook must contain 'Term' and 'Shortened_Definition' columns"
assert {"Title", "Abstract"} <= set(df_docs.columns), \
    "documents workbook must contain 'Title' and 'Abstract' columns"

In [3]:
########################################
# 1. Batch Processing Function
########################################
def _parse_term_numbers(response_text, term_list):
    """Map a reply such as "1, 4, 7" to term names; raise ValueError on non-numeric tokens."""
    # Tolerate newlines / semicolons as separators and stray trailing periods,
    # which would otherwise make int() raise and waste a retry.
    normalized = response_text.replace("\n", ",").replace(";", ",")
    tokens = [tok.strip(" \t\r.") for tok in normalized.split(",")]
    numbers = [int(tok) for tok in tokens if tok]
    # "0" (the "none relevant" answer) and out-of-range numbers are dropped here.
    in_range = [n for n in numbers if 0 < n <= len(term_list)]
    # dict.fromkeys removes repeated numbers while keeping first-seen order.
    return [term_list[n - 1] for n in dict.fromkeys(in_range)]


def batch_check_relevance(terms_definitions, abstract, max_retries=3,
                          model_id='anthropic.claude-3-5-sonnet-20240620-v1:0'):
    """
    Ask the model which terms are relevant to one abstract, in a single API call.

    All terms are sent as a numbered list and the model is asked to reply with
    the numbers of the relevant terms; the numbers are mapped back to term names.

    Parameters
    ----------
    terms_definitions : dict
        Mapping of term name -> definition. Iteration order defines the numbering.
    abstract : str
        Abstract text. A non-string or blank value (for example the NaN that
        pandas yields for an empty cell) returns [] without calling the API.
    max_retries : int
        Maximum number of attempts. Any exception (API error or unparsable
        reply) counts as a failed attempt.
    model_id : str
        Bedrock model identifier passed to ``invoke_model``.

    Returns
    -------
    list
        Relevant term names, de-duplicated, in the order the model listed them.
        Empty when nothing is relevant, when there is nothing to classify, or
        when every attempt failed (each failure is printed).
    """
    # Nothing to classify: skip the API call entirely.
    if not terms_definitions or not isinstance(abstract, str) or not abstract.strip():
        return []

    term_list = list(terms_definitions.keys())

    # Create term-definition list with numbering
    numbered_terms = "\n".join(
        f"{i+1}. {term}: {defn}"
        for i, (term, defn) in enumerate(terms_definitions.items())
    )

    prompt = f"""Analyze this abstract and return COMMA-SEPARATED list of relevant term which best describes the abstract based on the term's definitions (NUMBERS ONLY):
    
Terms/Definitions:
{numbered_terms}

Abstract: {abstract}

Return ONLY comma-separated numbers of relevant terms 
If none are relevant, return "0". Do not include explanations."""

    payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1000,
        "messages": [{
            "role": "user",
            "content": [{"type": "text", "text": prompt}]
        }]
    }
    body = json.dumps(payload)  # identical for every attempt, so serialize once

    for attempt in range(max_retries):
        try:
            response = bedrock.invoke_model(
                body=body,
                modelId=model_id,
                accept='application/json',
                contentType='application/json'
            )

            result = json.loads(response['body'].read().decode('utf-8'))
            response_text = result['content'][0]['text'].strip()

            # Convert number responses back to term names
            return _parse_term_numbers(response_text, term_list)

        except Exception as e:
            # Deliberately broad: this is a best-effort batch job and one bad
            # document must not abort the whole run.
            print(f"Attempt {attempt+1} failed: {str(e)}")
            if attempt + 1 < max_retries:
                # Exponential backoff so throttling errors are not retried instantly.
                time.sleep(min(2 ** attempt, 30))

    return []  # Return empty if all retries fail

########################################
# 2. Main Script with Optimizations
########################################

# Preprocess terms into a dictionary (term name -> shortened definition)
terms_dict = dict(zip(df_terms['Term'], df_terms['Shortened_Definition']))

mapped_results = []
total_docs = len(df_docs)

# enumerate() supplies a positional counter for the progress display; the
# DataFrame index label is not guaranteed to run 0..n-1 (e.g. after filtering).
for position, (_, doc_row) in enumerate(df_docs.iterrows(), start=1):
    doc_title = doc_row['Title']
    doc_abstract = doc_row['Abstract']

    # Empty Excel cells load as NaN; skip the model call for them instead of
    # sending the literal text "nan" as an abstract.
    if isinstance(doc_abstract, str) and doc_abstract.strip():
        # Get relevant terms in batch
        relevant_terms = batch_check_relevance(terms_dict, doc_abstract)
    else:
        relevant_terms = []

    # Keep only names that are actual keys of the term dictionary
    valid_terms = [term for term in relevant_terms if term in terms_dict]

    mapped_results.append({
        "Title": doc_title,
        "Relevant Terms": ", ".join(valid_terms)
    })

    # Progress tracking with flush
    print(f"Processed {position}/{total_docs}", end='\r', flush=True)

# Save results; explicit columns keep the header row even when there are no documents
output_df = pd.DataFrame(mapped_results, columns=["Title", "Relevant Terms"])
output_df.to_excel(output, index=False)

# Report the path that was actually written rather than a hard-coded file name.
print(f"\nBatch processing complete! Results saved to {output}")


Processed 3823/3823
Batch processing complete! Results saved to final_sub_theme_short_mapping.xlsx
