In [2]:
import os
import json
from lxml import etree

def load_pmc_ids(filename):
    """Read a JSON file and return its decoded content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(filename, 'r') as handle:
        return json.load(handle)

def parse_xml_file(file_path):
    """Extract the preclinical fields from one XML file using XPath lookups.

    Every field is looked up with the XPath ``//<field name>`` through
    ``extract_info``. Fields start out as empty strings and keep that value
    when parsing fails before they are reached.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        dict with the keys ``disease``, ``drug``, ``animal``,
        ``disease_model``, ``dosage`` and ``dosage_duration``.
    """
    field_names = (
        'disease',
        'drug',
        'animal',
        'disease_model',
        'dosage',
        'dosage_duration',
    )
    details = dict.fromkeys(field_names, '')

    try:
        root = etree.parse(file_path).getroot()
        # Example XPath queries: each field is read from elements of the same name.
        for field in field_names:
            details[field] = extract_info(root, '//' + field)
    except Exception as error:
        # Best effort: report the failure and return whatever was collected.
        print(f"Failed to parse {file_path}: {str(error)}")

    return details



In [3]:
def extract_info(root, xpath):
    """Join the text of all elements matched by an XPath query.

    Args:
        root: Object exposing an ``xpath(query)`` method (an lxml element).
        xpath: XPath expression to evaluate against ``root``.

    Returns:
        The ``.text`` values of the matched elements joined with single
        spaces (elements whose text is ``None`` are skipped), or
        ``"Not specified"`` when the query matches nothing.
    """
    matches = root.xpath(xpath)
    if not matches:
        return "Not specified"

    texts = []
    for node in matches:
        if node.text is not None:
            texts.append(node.text)
    return ' '.join(texts)

def main(pmc_ids_file='pmc_ids.json',
         input_dir='/mnt/data/shared/PMC',
         output_file='extracted_details.json'):
    """Run the XPath-based extraction for every listed PMC article.

    Args:
        pmc_ids_file: JSON file containing the list of PMC IDs.
        input_dir: Directory expected to contain one ``<PMC ID>.xml`` file
            per article.
        output_file: JSON file the extracted details are written to
            (overwritten on every run).

    Returns:
        None. When ``input_dir`` does not exist a message is printed and
        nothing is read or written.
    """
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return

    pmc_ids = load_pmc_ids(pmc_ids_file)
    all_details = []

    # NOTE(review): the recorded run reported every ID as missing. The XML
    # files may live in per-range subdirectories (other cells point at
    # folders such as .../PMC005xxxxxx) rather than directly under
    # input_dir -- confirm the on-disk layout.
    for pmc_id in pmc_ids:
        file_path = os.path.join(input_dir, f"{pmc_id}.xml")
        if os.path.exists(file_path):
            details = parse_xml_file(file_path)
            details['PMC ID'] = pmc_id  # Keep the PMC ID alongside the details for reference
            all_details.append(details)
        else:
            print(f"File {file_path} does not exist.")

    # Save the extracted details to a JSON file
    with open(output_file, 'w') as f:
        json.dump(all_details, f, indent=4)

    print(f"Data has been written to {output_file}")

if __name__ == '__main__':
    main()


File /mnt/data/shared/PMC/PMC11119472.xml does not exist.
File /mnt/data/shared/PMC/PMC11112549.xml does not exist.
File /mnt/data/shared/PMC/PMC11056916.xml does not exist.
File /mnt/data/shared/PMC/PMC10769202.xml does not exist.
File /mnt/data/shared/PMC/PMC10603947.xml does not exist.
File /mnt/data/shared/PMC/PMC10726404.xml does not exist.
File /mnt/data/shared/PMC/PMC10453983.xml does not exist.
File /mnt/data/shared/PMC/PMC10318922.xml does not exist.
File /mnt/data/shared/PMC/PMC10221257.xml does not exist.
File /mnt/data/shared/PMC/PMC10111265.xml does not exist.
File /mnt/data/shared/PMC/PMC9995971.xml does not exist.
File /mnt/data/shared/PMC/PMC9977891.xml does not exist.
File /mnt/data/shared/PMC/PMC9936517.xml does not exist.
File /mnt/data/shared/PMC/PMC9897793.xml does not exist.
File /mnt/data/shared/PMC/PMC9817828.xml does not exist.
File /mnt/data/shared/PMC/PMC9787120.xml does not exist.
File /mnt/data/shared/PMC/PMC9745895.xml does not exist.
File /mnt/data/shared

# Extract using Llama 3

In [None]:
import os
import json
import xml.etree.ElementTree as ET
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_llama_model(model_name="meta-llama/Meta-Llama-3-8B"):
    """Load the tokenizer and causal language model used for extraction.

    The previous hard-coded id ``meta-llama/Llama-2-8b-hf`` does not name a
    published checkpoint (Llama 2 is released in 7B/13B/70B sizes), so the
    default now points at the Llama 3 8B checkpoint this section is about.

    Args:
        model_name: Hugging Face Hub model id to load.

    Returns:
        Tuple ``(tokenizer, model)`` as returned by
        ``AutoTokenizer.from_pretrained`` and
        ``AutoModelForCausalLM.from_pretrained``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model

def extract_text_from_xml(xml_file):
    """Return the text of all ``<p>`` elements of an XML file as one string.

    The full text of each paragraph is collected, including text nested in
    inline child elements (``<italic>``, ``<xref>``, ...) and the text that
    follows them. Reading only ``p.text`` would stop at the first child
    element and drop the rest of the paragraph.

    Args:
        xml_file: Path (or file object) of the XML document to read.

    Returns:
        The paragraph texts joined with single spaces; runs of whitespace
        inside a paragraph are collapsed and empty paragraphs are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    root = ET.parse(xml_file).getroot()

    paragraph_texts = []
    for paragraph in root.findall(".//p"):
        # itertext() walks the paragraph and all of its descendants in order.
        normalized = " ".join("".join(paragraph.itertext()).split())
        if normalized:
            paragraph_texts.append(normalized)

    return " ".join(paragraph_texts)

def extract_preclinical_details(text, tokenizer, model, max_new_tokens=512):
    """Ask the language model for the preclinical details found in ``text``.

    Args:
        text: Article text; only the first 4000 characters are sent.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Model exposing ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The object decoded from the JSON found in the model's reply, or
        ``{"error": "Failed to extract JSON from model output"}`` when the
        reply contains no parsable JSON object.
    """
    # The text is limited to 4000 characters to avoid token limit issues.
    # This remark used to sit inside the prompt string itself and was
    # therefore sent to the model as part of the prompt.
    prompt = (
        "Extract the following preclinical details from the given text:\n"
        "- Disease\n"
        "- Drug\n"
        "- Animal\n"
        "- Disease model\n"
        "- Dosage\n"
        "- Dosage duration\n"
        "\n"
        f"Text: {text[:4000]}\n"
        "\n"
        "Provide the output in JSON format.\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    # max_new_tokens bounds only the reply. The former max_length=2048 also
    # counted the prompt, leaving no room to generate when the prompt was long.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; decode only what
    # follows them so braces in the article text cannot be mistaken for the reply.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Extract JSON from the response
    try:
        json_start = response.index('{')
        json_end = response.rindex('}') + 1
        return json.loads(response[json_start:json_end])
    except ValueError:
        # str.index/rindex raise ValueError when no brace is present and
        # json.JSONDecodeError is a ValueError subclass.
        return {"error": "Failed to extract JSON from model output"}



In [None]:
def process_pmc_papers(folder_path, tokenizer, model):
    """Run the model-based extraction on every ``.xml`` file of a folder.

    Files that cannot be parsed or processed are reported on stdout and
    skipped; they do not appear in the result.

    Args:
        folder_path: Directory whose ``.xml`` entries are processed.
        tokenizer: Tokenizer forwarded to ``extract_preclinical_details``.
        model: Model forwarded to ``extract_preclinical_details``.

    Returns:
        list of ``{"filename": ..., "details": ...}`` dicts, one per file
        that was processed successfully.
    """
    collected = []
    xml_names = (entry for entry in os.listdir(folder_path) if entry.endswith(".xml"))
    for filename in xml_names:
        xml_path = os.path.join(folder_path, filename)
        try:
            article_text = extract_text_from_xml(xml_path)
            extracted = extract_preclinical_details(article_text, tokenizer, model)
            collected.append({"filename": filename, "details": extracted})
        except ET.ParseError:
            print(f"Error parsing XML file: {filename}")
        except Exception as error:
            print(f"Error processing file {filename}: {str(error)}")
    return collected



In [None]:
def main(pmc_folder=""):
    """Extract preclinical details from a folder of PMC XML files with a local model.

    The folder is validated before the model is loaded. With the former
    placeholder value ``""``, ``os.listdir`` failed only after the model had
    already been loaded.

    Args:
        pmc_folder: Directory containing the PMC ``.xml`` files. The default
            empty string is a placeholder and must be replaced.

    Returns:
        None. Results are written to ``preclinical_details.json``.
    """
    if not os.path.isdir(pmc_folder):
        print(f"Input directory {pmc_folder!r} does not exist; "
              "set pmc_folder to the folder containing the PMC XML files.")
        return

    tokenizer, model = load_llama_model()
    results = process_pmc_papers(pmc_folder, tokenizer, model)

    # Save results to a JSON file
    with open("preclinical_details.json", "w") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()

In [3]:
%pip install requests lxml

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import json
import requests
from lxml import etree
from huggingface_hub import InferenceClient

# Set up the Hugging Face Inference API endpoint and credentials.
# The access token is read from the HF_TOKEN environment variable so that it
# is never stored in the notebook; the Authorization header is omitted when
# the variable is unset.
# NOTE: the token that used to be hard-coded here has been exposed in the
# notebook and should be revoked.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body, or ``{"error": ...}`` when the body is not
        valid JSON (for example an HTML error page).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    try:
        return response.json()
    except ValueError:
        # requests' JSON decoding errors derive from ValueError.
        return {"error": f"Non-JSON response (HTTP {response.status_code})"}

# Function to parse XML files and extract text
def parse_xml(file_path):
    """Return every text node of an XML file joined by single spaces.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The joined text, or ``None`` when the file cannot be parsed (the
        failure is reported on stdout).
    """
    try:
        document_root = etree.parse(file_path).getroot()
        return ' '.join(document_root.xpath('//text()'))
    except Exception as error:
        print(f"Failed to parse {file_path}: {str(error)}")
        return None

# Function to extract relevant details using the Hugging Face API
def extract_details_from_text(text):
    """Ask the hosted model for the preclinical details contained in ``text``.

    Args:
        text: Article text appended to the instruction prompt.

    Returns:
        The stripped generated text, or ``"No details extracted"`` when the
        reply does not have the expected ``[{"generated_text": ...}]`` shape
        (for example an error object).
    """
    prompt = (
        "Extract the following preclinical data from the text: \n"
        "- Disease: The specific disease or condition being studied.\n"
        "- Drug: The drug or treatment being tested.\n"
        "- Animal: The type of animal used in the study.\n"
        "- Disease Model: The animal model used to replicate the disease or condition.\n"
        "- Dosage: The amount of drug or treatment administered.\n"
        "- Dosage Duration: The duration for which the dosage was administered.\n"
        "Provide the extracted information in a structured format.\n\n"
        f"Text: {text}"
    )

    # Text-generation endpoints echo the prompt in front of the completion
    # unless return_full_text is disabled; without this the whole article
    # would be stored again in the extracted details.
    payload = {"inputs": prompt, "parameters": {"return_full_text": False}}
    response = query(payload)

    # Extract generated text from the response
    if isinstance(response, list) and response and isinstance(response[0], dict):
        generated = response[0].get('generated_text')
        if isinstance(generated, str):
            return generated.strip()
    return "No details extracted"

def main(pmc_ids_file='pmc_ids.json',
         input_dir='/mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx',
         output_file='extracted_details.json'):
    """Extract details for every listed PMC article via the Inference API.

    Earlier results found in ``output_file`` are kept and their files are
    skipped, so an interrupted run can be resumed.

    Args:
        pmc_ids_file: JSON file containing the list of PMC IDs.
        input_dir: Directory expected to contain ``<PMC ID>.xml`` files.
        output_file: JSON checkpoint/result file, rewritten after every
            processed article.

    Returns:
        None. When ``input_dir`` does not exist a message is printed and
        nothing is read or written.
    """
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return

    pmc_ids = load_pmc_ids(pmc_ids_file)

    # Load already processed details if the output file exists
    all_details = []
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            all_details = json.load(f)

    # The XPath-based cell writes the same output file name with entries that
    # carry a 'PMC ID' key and no 'file' key; ignore such entries here instead
    # of failing with KeyError.
    processed_files = {
        detail['file']
        for detail in all_details
        if isinstance(detail, dict) and 'file' in detail
    }

    # NOTE(review): the recorded run reported the listed IDs as missing from
    # input_dir. The folder name suggests one ID range per directory --
    # confirm that the IDs in pmc_ids_file belong to this directory.
    for pmc_id in pmc_ids:
        filename = f"{pmc_id}.xml"
        if filename in processed_files:
            print(f"Skipping already processed file: {filename}")
            continue

        file_path = os.path.join(input_dir, filename)
        if os.path.exists(file_path):
            text = parse_xml(file_path)
            if text:
                details = extract_details_from_text(text)
                all_details.append({
                    'file': filename,
                    'details': details
                })

                # Write the updated details incrementally. The data goes to a
                # temporary file first and is then moved into place, so an
                # interruption cannot leave a truncated checkpoint behind.
                tmp_path = f"{output_file}.tmp"
                with open(tmp_path, 'w') as f:
                    json.dump(all_details, f, indent=4)
                os.replace(tmp_path, output_file)

                print(f"Processed and saved details for: {filename}")
        else:
            print(f"File {file_path} does not exist.")

if __name__ == '__main__':
    main()

File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC11119472.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC11112549.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC11056916.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC10769202.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC10603947.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC10726404.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC10453983.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC10318922.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC10221257.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC10111265.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC9995971.xml does not exist.
File /mnt/data/shared/PMC/oa_noncomm/PMC002xxxxxx/PMC9977891.xml does not exist.
File /mnt/data/sha

## Llama 3 Instruct

In [17]:
import os
import json
from lxml import etree
from huggingface_hub import InferenceClient

# Set up the Hugging Face inference client for the Llama 3 Instruct model.
# The access token is read from the HF_TOKEN environment variable so that it
# is never stored in the notebook. When the variable is unset, None is passed
# and huggingface_hub uses the locally saved login, if any.
# NOTE: the token that used to be hard-coded here has been exposed in the
# notebook and should be revoked.
client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=os.environ.get("HF_TOKEN"),
)

# Function to parse XML files and extract text
def parse_xml(file_path):
    """Return every text node of an XML file joined by single spaces.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The joined text, or ``None`` when the file cannot be parsed (the
        failure is reported on stdout).
    """
    try:
        document_root = etree.parse(file_path).getroot()
        return ' '.join(document_root.xpath('//text()'))
    except Exception as error:
        print(f"Failed to parse {file_path}: {str(error)}")
        return None

# Function to extract relevant details using the Hugging Face InferenceClient
def extract_details_from_text(text):
    """Ask the chat model for the preclinical details contained in ``text``.

    Args:
        text: Article text appended to the instruction prompt.

    Returns:
        The stripped content of the first completion choice, or an empty
        string when the reply carries no content.
    """
    prompt = (
        "Extract the following preclinical data from the text:\n"
        "- Disease: The specific disease or condition being studied.\n"
        "- Drug: The drug or treatment being tested.\n"
        "- Animal: The type of animal used in the study.\n"
        "- Disease Model: The animal model used to replicate the disease or condition.\n"
        "- Dosage: The amount of drug or treatment administered.\n"
        "- Dosage Duration: The duration for which the dosage was administered.\n"
        "Provide the extracted information in a structured format.\n\n"
        f"Text: {text}"
    )

    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500
    )

    # The call is made without stream=True, so it returns one completion
    # object whose text is in choices[0].message.content. The 'delta' fields
    # read previously exist only on the chunks of a streamed response.
    choices = response.choices
    if not choices:
        return ""
    content = choices[0].message.content
    return (content or "").strip()

def load_pmc_ids(filename):
    """Read a JSON file and return its decoded content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(filename, 'r') as handle:
        return json.load(handle)

def main(pmc_ids_file='pmc_ids.json',
         input_dir='/mnt/data/shared/PMC/oa_comm/PMC005xxxxxx',
         output_file='extracted_details.json'):
    """Extract details for every listed PMC article via the inference client.

    Earlier results found in ``output_file`` are kept and their files are
    skipped, so an interrupted run (for example after a rate-limit error)
    can be resumed.

    Args:
        pmc_ids_file: JSON file containing the list of PMC IDs.
        input_dir: Directory expected to contain ``<PMC ID>.xml`` files.
        output_file: JSON checkpoint/result file, rewritten after every
            processed article.

    Returns:
        None. When ``input_dir`` does not exist a message is printed and
        nothing is read or written.
    """
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return

    pmc_ids = load_pmc_ids(pmc_ids_file)

    # Load already processed details if the output file exists
    all_details = []
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            all_details = json.load(f)

    # The XPath-based cell writes the same output file name with entries that
    # carry a 'PMC ID' key and no 'file' key; ignore such entries here instead
    # of failing with KeyError.
    processed_files = {
        detail['file']
        for detail in all_details
        if isinstance(detail, dict) and 'file' in detail
    }

    # NOTE(review): the recorded run reported many listed IDs as missing from
    # input_dir. The folder name suggests one ID range per directory --
    # confirm that the IDs in pmc_ids_file belong to this directory.
    for pmc_id in pmc_ids:
        filename = f"{pmc_id}.xml"
        if filename in processed_files:
            print(f"Skipping already processed file: {filename}")
            continue

        file_path = os.path.join(input_dir, filename)
        if os.path.exists(file_path):
            text = parse_xml(file_path)
            if text:
                details = extract_details_from_text(text)
                all_details.append({
                    'file': filename,
                    'details': details
                })

                # Write the updated details incrementally. The data goes to a
                # temporary file first and is then moved into place, so an
                # interruption cannot leave a truncated checkpoint behind.
                tmp_path = f"{output_file}.tmp"
                with open(tmp_path, 'w') as f:
                    json.dump(all_details, f, indent=4)
                os.replace(tmp_path, output_file)

                print(f"Processed and saved details for: {filename}")
        else:
            print(f"File {file_path} does not exist.")

if __name__ == '__main__':
    main()


File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC11119472.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC11112549.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC11056916.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC10769202.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC10603947.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC10726404.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC10453983.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC10318922.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC10221257.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC10111265.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC9995971.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC9977891.xml does not exist.
File /mnt/data/shared/PMC/oa_comm/PMC005xxxxxx/PMC9936

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions (Request ID: I72nPS3Nk9MysiD-2LFxg)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

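## Mistral Implementation (note: the cell below still loads `meta-llama/Meta-Llama-3-8B-Instruct` locally via `transformers`, not a Mistral model)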

In [21]:
import os
import json
import torch
from lxml import etree
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model.
# The repository is gated (loading it without credentials fails with a 401
# error), so an access token is passed explicitly. It is read from the
# HF_TOKEN environment variable; when unset, None is passed and the locally
# saved Hugging Face login, if any, is used.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
HF_TOKEN = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN)

# Function to parse XML files and extract text
def parse_xml(file_path):
    """Return every text node of an XML file joined by single spaces.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The joined text, or ``None`` when the file cannot be parsed (the
        failure is reported on stdout).
    """
    try:
        document_root = etree.parse(file_path).getroot()
        return ' '.join(document_root.xpath('//text()'))
    except Exception as error:
        print(f"Failed to parse {file_path}: {str(error)}")
        return None

# Function to extract relevant details using the LLaMA model
def extract_details_from_text(text, max_input_tokens=7500, max_new_tokens=500):
    """Ask the locally loaded model for the preclinical details in ``text``.

    Args:
        text: Article text appended to the instruction prompt.
        max_input_tokens: The tokenised prompt is truncated to this many
            tokens. The default assumes a context window of about 8192
            tokens and leaves room for the reply -- adjust it for other
            models.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The stripped text generated by the model, without the prompt.
    """
    prompt = (
        "Extract the following preclinical data from the text:\n"
        "- Disease: The specific disease or condition being studied.\n"
        "- Drug: The drug or treatment being tested.\n"
        "- Animal: The type of animal used in the study.\n"
        "- Disease Model: The animal model used to replicate the disease or condition.\n"
        "- Dosage: The amount of drug or treatment administered.\n"
        "- Dosage Duration: The duration for which the dosage was administered.\n"
        "Provide the extracted information in a structured format.\n\n"
        f"Text: {text}"
    )

    # Truncation cuts the end of the prompt, i.e. the tail of the article;
    # the instructions at the start are kept.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    with torch.no_grad():
        # max_new_tokens bounds only the reply. The former max_length=500
        # also counted the prompt, which contains the whole article and
        # therefore already exceeded that limit.
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; decode only the
    # continuation so the article text is not echoed back as "details".
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated_text.strip()

def load_pmc_ids(filename):
    """Read a JSON file and return its decoded content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(filename, 'r') as handle:
        return json.load(handle)

def main(pmc_ids_file='pmc_ids.json',
         input_dir='/mnt/data/shared/PMC/oa_comm/PMC005xxxxxx',
         output_file='extracted_details.json'):
    """Extract details for every listed PMC article with the local model.

    Earlier results found in ``output_file`` are kept and their files are
    skipped, so an interrupted run can be resumed.

    Args:
        pmc_ids_file: JSON file containing the list of PMC IDs.
        input_dir: Directory expected to contain ``<PMC ID>.xml`` files.
        output_file: JSON checkpoint/result file, rewritten after every
            processed article.

    Returns:
        None. When ``input_dir`` does not exist a message is printed and
        nothing is read or written.
    """
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return

    pmc_ids = load_pmc_ids(pmc_ids_file)

    # Load already processed details if the output file exists
    all_details = []
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            all_details = json.load(f)

    # The XPath-based cell writes the same output file name with entries that
    # carry a 'PMC ID' key and no 'file' key; ignore such entries here instead
    # of failing with KeyError.
    processed_files = {
        detail['file']
        for detail in all_details
        if isinstance(detail, dict) and 'file' in detail
    }

    # NOTE(review): input_dir names a single ID-range folder -- confirm that
    # the IDs in pmc_ids_file belong to this directory.
    for pmc_id in pmc_ids:
        filename = f"{pmc_id}.xml"
        if filename in processed_files:
            print(f"Skipping already processed file: {filename}")
            continue

        file_path = os.path.join(input_dir, filename)
        if os.path.exists(file_path):
            text = parse_xml(file_path)
            if text:
                details = extract_details_from_text(text)
                all_details.append({
                    'file': filename,
                    'details': details
                })

                # Write the updated details incrementally. The data goes to a
                # temporary file first and is then moved into place, so an
                # interruption cannot leave a truncated checkpoint behind.
                tmp_path = f"{output_file}.tmp"
                with open(tmp_path, 'w') as f:
                    json.dump(all_details, f, indent=4)
                os.replace(tmp_path, output_file)

                print(f"Processed and saved details for: {filename}")
        else:
            print(f"File {file_path} does not exist.")

if __name__ == '__main__':
    main()


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct.
401 Client Error. (Request ID: Root=1-66974ae9-72c4639e46e7e9ad370e784f;ed323fa9-0104-4566-b3ef-51cbd603791d)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B-Instruct is restricted. You must be authenticated to access it.