In [1]:
import json
import boto3
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [2]:
# Shared Bedrock runtime client (us-east-1) used by the classification
# functions defined later in this notebook.
bedrock = boto3.client(
    'bedrock-runtime',
    region_name='us-east-1',
)

In [3]:
# Source spreadsheet of abstracts to screen; kept in a named constant so the
# input location is easy to spot and change.
ABSTRACTS_CSV_PATH = "D:/GS/final_scoping_review/final_papers/scopus_final/excel_sheets/2024.csv"
abstract_df = pd.read_csv(ABSTRACTS_CSV_PATH)

In [None]:
# # classification prompt

# def classify_abstract(abstract):
    
#     # payload
#     payload = {
#         "anthropic_version": "bedrock-2023-05-31",
#         "max_tokens": 20000,
#         "messages": [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": f"""
# Human: Does this abstract describe a solution to a healthcare-related problem (including but not limited to physical health, mental health, environmental health and any other healthcare field) using smart technology or technologies?  
# Answer "Yes" or "No".
# Skip if the paper is a review by answering "Review Paper".

# If unsure or the abstract is unclear, answer "No". No explanation or reasons for any of the cases

# Abstract: {abstract}
#                         """
#                     }
#                 ]
#             }
#         ]
#     }
    
#     # Invoke model
#     response = bedrock.invoke_model(
#             body=json.dumps(payload),
#             modelId='anthropic.claude-3-5-sonnet-20240620-v1:0',
#             accept='application/json',
#             contentType='application/json'
#         )
    

#     # full_response = json.loads(response['body'].read().decode('utf-8'))

#     full_response = json.loads(response['body'].read().decode('utf-8'))
#     assistant_message = full_response['content'][0]['text']
#     return assistant_message.strip()


# # processing each abstract and classifying
# results = []
# for index, row in abstract_df.iterrows():
#     abstract = row['Abstract']
#     classification = classify_abstract(abstract)
#     results.append(classification)

#     print(f"Processed {index+1}/{len(abstract_df)}")

# # Final save after full run
# abstract_df['Classification'] = results
# abstract_df.to_excel('D:/GS/final_scoping_review/saved_files/classified_abstracts_2016.xlsx', index=False)

# print("Classification completed and saved to 'classified_abstracts.xlsx'")

In [4]:
# Single-abstract classification function (one model call per abstract; the batched variant is classify_batch)

def classify_abstract(abstract):
    """Classify one abstract with a single Bedrock model call.

    Builds the screening prompt around ``abstract``, sends it through the
    module-level ``bedrock`` client and returns the text of the first content
    part of the reply, with surrounding whitespace removed.

    Args:
        abstract: The abstract text to classify (interpolated into the prompt).

    Returns:
        str: The model's reply text, stripped.
    """
    question = f"""
Human: Does this abstract describe a solution to a healthcare-related problem (including but not limited to physical health, mental health, environmental health and any other healthcare field) using smart technology or technologies or deals with any sort of safety/security issues using smart technology?  
Answer "Yes" or "No".
Skip if the paper is a review by answering "Review Paper".

If unsure or the abstract is unclear, answer "No". No explanation or reasons for any of the cases.

Abstract: {abstract}
"""

    # Request body in the Anthropic messages format expected by Bedrock.
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": question}],
    }
    request_body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 20000,
        "messages": [user_turn],
    })

    raw_reply = bedrock.invoke_model(
        body=request_body,
        modelId='anthropic.claude-3-5-sonnet-20240620-v1:0',
        accept='application/json',
        contentType='application/json'
    )

    # The response body is a stream of UTF-8 encoded JSON.
    decoded = json.loads(raw_reply['body'].read().decode('utf-8'))
    first_part = decoded['content'][0]
    return first_part['text'].strip()

# Batch processing implementation
def _label_from_tokens(tokens):
    """Return the classification label carried by the whitespace-split ``tokens`` of one answer line."""
    if len(tokens) >= 2:
        # "Review Paper" is the only two-word answer the prompt asks for; keep it
        # whole instead of reducing it to its last word ("Paper").
        tail = " ".join(tok.strip("\"'.").lower() for tok in tokens[-2:])
        if tail == "review paper":
            return "Review Paper"
    return tokens[-1]


def _parse_batch_answers(result_text, expected_count):
    """Turn the model's multi-line reply into exactly ``expected_count`` labels.

    Lines of the form ``"<n>. <answer>"`` (also ``<n>)`` / ``<n>:``) are matched
    to abstract ``n`` by their number, so preamble lines or skipped items do not
    shift the remaining answers. If no line is numbered, answers are taken
    positionally. Any abstract without an answer gets ``"Error"``.
    """
    token_lines = [line.split() for line in result_text.splitlines() if line.strip()]

    by_number = {}
    for tokens in token_lines:
        head = tokens[0]
        if len(tokens) > 1 and head[-1] in ".):" and head[:-1].isdecimal():
            # First occurrence of a number wins if the model repeats one.
            by_number.setdefault(int(head[:-1]), _label_from_tokens(tokens[1:]))

    if by_number:
        return [by_number.get(pos, "Error") for pos in range(1, expected_count + 1)]

    positional = [_label_from_tokens(tokens) for tokens in token_lines][:expected_count]
    return positional + ["Error"] * (expected_count - len(positional))


def classify_batch(index_range, abstracts):
    """Classify several abstracts with one Bedrock model call.

    The abstracts are numbered 1..N inside a single prompt, sent through the
    module-level ``bedrock`` client, and the reply is parsed into one label per
    abstract.

    NOTE(review): this prompt omits the safety/security clause that the
    single-abstract prompt in ``classify_abstract`` contains; confirm which
    inclusion criterion is intended.

    Args:
        index_range: Iterable of result positions, one per abstract in
            ``abstracts`` (same order).
        abstracts: The abstract texts of this batch.

    Returns:
        list[tuple[int, str]]: ``(index, label)`` for every index in
        ``index_range``. The label is ``"Error"`` for every index if the
        request fails, and for individual indices whose answer is missing from
        the reply.
    """
    indices = list(index_range)
    if not indices:
        # Nothing to classify; avoid a pointless model call.
        return []

    try:
        numbered_abstracts = "\n".join(
            [f"{i+1}. Abstract: {text}" for i, text in enumerate(abstracts)]
        )

        prompt = f"""
Classify each abstract (1-{len(abstracts)}): 
Does this abstract describe a solution to a healthcare-related problem (including but not limited to physical health, mental health, environmental health and any other healthcare field) using smart technology or technologies?  
Answer "Yes" or "No".
Skip if the paper is a review by answering "Review Paper".

If unsure or the abstract is unclear, answer "No". No explanation or reasons for any of the cases.

{numbered_abstracts}
        """

        payload = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 20000,
            "messages": [{
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
            }]
        }

        response = bedrock.invoke_model(
            body=json.dumps(payload),
            modelId='anthropic.claude-3-5-sonnet-20240620-v1:0',
            accept='application/json',
            contentType='application/json'
        )

        # Fixed one-second pause after each request (presumably request throttling).
        time.sleep(1)

        result_text = json.loads(response['body'].read().decode('utf-8'))['content'][0]['text'].strip()
        answers = _parse_batch_answers(result_text, len(indices))

        missing = answers.count("Error")
        if missing:
            # Surface partial replies instead of silently dropping abstracts.
            print(f"Warning: {missing} of {len(indices)} abstracts in batch got no parsable answer")

        return list(zip(indices, answers))

    except Exception as e:
        # Best effort: one failed batch must not abort the whole run.
        print(f"Error processing batch: {e}")
        return [(i, "Error") for i in indices]

# Main execution flow
def process_abstracts(abstract_df, batch_size=5, workers=3):
    """Classify every abstract in ``abstract_df`` using concurrent batches.

    The ``'Abstract'`` column is cut into consecutive chunks of ``batch_size``
    items; each chunk is handed to ``classify_batch`` on a thread pool and the
    returned ``(index, label)`` pairs are written back into a list aligned with
    the original row order.

    Args:
        abstract_df: Object whose ``['Abstract']`` entry is iterable
            (a DataFrame in this notebook).
        batch_size: Number of abstracts per model call.
        workers: Maximum number of concurrent threads.

    Returns:
        list: One entry per abstract, in input order; ``None`` where no batch
        reported a label for that position.
    """
    texts = list(abstract_df['Abstract'])
    total = len(texts)

    # Pair each chunk of abstracts with the positions it occupies in `texts`.
    jobs = []
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        jobs.append((range(start, stop), texts[start:stop]))

    labels = [None] * total

    with ThreadPoolExecutor(max_workers=workers) as pool:
        pending = [pool.submit(classify_batch, span, chunk) for span, chunk in jobs]

        finished_count = 0
        for done in as_completed(pending):
            for position, label in done.result():
                labels[position] = label
            finished_count += 1
            print(f"Completed batch {finished_count}/{len(jobs)}")

    return labels

# Main execution block
# Run the full classification and persist the labelled table.
if __name__ == "__main__":
    # One label per row, in the same order as abstract_df.
    abstract_df['Classification'] = process_abstracts(abstract_df)

    classified_xlsx_path = 'D:/GS/final_scoping_review/saved_files/classified_abstracts_2024.xlsx'
    abstract_df.to_excel(classified_xlsx_path, index=False)
    print("Classification completed and saved")


Completed batch 1/13448
Completed batch 2/13448
Completed batch 3/13448
Completed batch 4/13448
Completed batch 5/13448
Completed batch 6/13448
Completed batch 7/13448
Completed batch 8/13448
Completed batch 9/13448
Completed batch 10/13448
Completed batch 11/13448
Completed batch 12/13448
Completed batch 13/13448
Completed batch 14/13448
Completed batch 15/13448
Completed batch 16/13448
Completed batch 17/13448
Completed batch 18/13448
Completed batch 19/13448
Completed batch 20/13448
Completed batch 21/13448
Completed batch 22/13448
Completed batch 23/13448
Completed batch 24/13448
Completed batch 25/13448
Completed batch 26/13448
Completed batch 27/13448
Completed batch 28/13448
Completed batch 29/13448
Completed batch 30/13448
Completed batch 31/13448
Completed batch 32/13448
Completed batch 33/13448
Completed batch 34/13448
Completed batch 35/13448
Completed batch 36/13448
Completed batch 37/13448
Completed batch 38/13448
Completed batch 39/13448
Completed batch 40/13448
Completed

In [6]:
# Write the current state of abstract_df to CSV in the working directory.
# NOTE(review): the file name says 2023, but the input loaded above is 2024.csv
# and the Excel export is classified_abstracts_2024.xlsx -- confirm the year.
cleaned_csv_path = 'classified_abstracts_2023_cleaned.csv'
abstract_df.to_csv(cleaned_csv_path, index=False)