In [None]:
import os
import json
from lxml import etree

def load_pmc_ids(filename):
    """Read the JSON document stored at ``filename`` and return its decoded content.

    Args:
        filename: Path of the JSON file, opened in text mode for reading.

    Returns:
        Whatever ``json.load`` decodes from the file; the caller in this
        notebook iterates over it as a collection of PMC IDs.
    """
    with open(filename, 'r') as handle:
        return json.load(handle)

def parse_xml_file(file_path):
    """Pull the preclinical fields out of a single XML file.

    Every field is looked up with the XPath ``//<field name>`` via
    ``extract_info``. Any exception raised while parsing or querying is
    reported on stdout and swallowed, so fields that were not filled in before
    the failure keep their empty-string default.

    Args:
        file_path: Location of the XML document handed to ``etree.parse``.

    Returns:
        dict with the keys 'disease', 'drug', 'animal', 'disease_model',
        'dosage' and 'dosage_duration' (in that order).
    """
    field_names = (
        'disease',
        'drug',
        'animal',
        'disease_model',
        'dosage',
        'dosage_duration',
    )
    details = dict.fromkeys(field_names, '')

    try:
        root = etree.parse(file_path).getroot()

        # Example XPath queries: one element name per field, queried in order.
        for name in field_names:
            details[name] = extract_info(root, '//' + name)
    except Exception as e:
        print(f"Failed to parse {file_path}: {str(e)}")

    return details



In [None]:
def extract_info(root, xpath):
    """Return the text of every element matched by ``xpath``, joined by spaces.

    Args:
        root: Object exposing ``xpath(query)``; in this notebook it is the
            root element produced by ``lxml.etree``.
        xpath: XPath expression selecting the elements of interest.

    Returns:
        The matched elements' text joined with single spaces, or the sentinel
        "Not specified" when the query matches nothing or when none of the
        matched elements carries non-blank text.
    """
    elements = root.xpath(xpath)
    if not elements:
        return "Not specified"

    # Skip elements without text (e.g. <drug/>) and blank-only text so that an
    # empty match yields the sentinel rather than an empty string.
    fragments = [
        element.text
        for element in elements
        if element.text is not None and element.text.strip()
    ]
    if not fragments:
        return "Not specified"
    return ' '.join(fragments)

def main():
    """Extract details for every listed PMC ID and dump them to a JSON file.

    Reads the ID list from 'pmc_ids.json', looks for '<id>.xml' inside
    'downloaded_papers', and writes one record per file found to
    'extracted_details.json'. Missing XML files are reported on stdout and
    skipped; a missing input directory aborts the run before anything is read.
    """
    pmc_ids_file = 'pmc_ids.json'  # JSON file holding the list of PMC IDs
    input_dir = 'downloaded_papers'  # folder with the downloaded XML files
    output_file = 'extracted_details.json'  # destination of the extracted records

    # Guard clause: nothing can be processed without the input directory.
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return

    all_details = []
    for pmc_id in load_pmc_ids(pmc_ids_file):
        file_path = os.path.join(input_dir, f"{pmc_id}.xml")
        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist.")
            continue

        record = parse_xml_file(file_path)
        record['PMC ID'] = pmc_id  # keep the ID next to the extracted fields
        all_details.append(record)

    # Persist everything that was collected.
    with open(output_file, 'w') as out:
        json.dump(all_details, out, indent=4)

    print(f"Data has been written to {output_file}")

# Run the XPath-based extraction when this cell/script is executed directly
# (i.e. when the module name is '__main__').
if __name__ == '__main__':
    main()


# Extract using Llama 3

In [None]:
import os
import json
import xml.etree.ElementTree as ET
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_llama_model(model_name="meta-llama/Meta-Llama-3-8B"):
    """Load the tokenizer and causal-LM weights for a Llama checkpoint.

    Args:
        model_name: Hugging Face Hub repository ID (or local path) passed to
            both ``from_pretrained`` calls. Defaults to the Llama 3 8B base
            checkpoint. NOTE(review): an instruction-tuned variant such as
            "meta-llama/Meta-Llama-3-8B-Instruct" may follow the extraction
            prompt better - confirm which one is intended.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    # The previous ID "meta-llama/Llama-2-8b-hf" does not name a published
    # checkpoint (Llama 2 ships as 7B/13B/70B); the 8B size belongs to Llama 3,
    # which is the model this section is meant to use.
    # NOTE(review): the meta-llama repositories are access-gated, so the
    # environment presumably needs an authenticated Hugging Face login - verify.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model

def extract_text_from_xml(xml_file):
    """Return the text of every ``<p>`` element in an XML file as one string.

    The full text of each paragraph is collected, including text inside and
    after inline child elements (``p.text`` alone stops at the first child
    element, which truncates any paragraph containing markup). Runs of
    whitespace are collapsed to single spaces and empty paragraphs are dropped.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        The paragraph texts joined by single spaces; '' when there are none.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    root = ET.parse(xml_file).getroot()

    # Extract text from all paragraph elements
    paragraphs = root.findall(".//p")

    # A <p> nested inside another <p> is already covered by the outer
    # paragraph's itertext(); remember those so their text is not duplicated.
    nested = {
        inner
        for outer in paragraphs
        for inner in outer.iter("p")
        if inner is not outer
    }

    chunks = []
    for paragraph in paragraphs:
        if paragraph in nested:
            continue
        # split()/join collapses newlines and indentation left by pretty-printing.
        content = " ".join("".join(paragraph.itertext()).split())
        if content:
            chunks.append(content)

    return " ".join(chunks)

def _first_json_object(text):
    """Return the first JSON object embedded in ``text``, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever trails it, so chatter after the object is harmless.
            candidate, _ = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            start = text.find('{', start + 1)
            continue
        return candidate
    return None


def extract_preclinical_details(text, tokenizer, model, max_chars=4000, max_new_tokens=512):
    """Ask the language model for the preclinical details contained in ``text``.

    Args:
        text: Paper text; only the first ``max_chars`` characters are put into
            the prompt to stay within the tokenizer limit.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Model exposing ``generate``.
        max_chars: Number of characters of ``text`` included in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first JSON object found in the generated text, or
        ``{"error": "Failed to extract JSON from model output"}`` when the
        output contains no parsable JSON object.
    """
    # Truncate outside the template: a '#' comment placed inside the f-string
    # is not a comment, it becomes part of the prompt sent to the model.
    excerpt = text[:max_chars]
    prompt = f"""
    Extract the following preclinical details from the given text:
    - Disease
    - Drug
    - Animal
    - Disease model
    - Dosage
    - Dosage duration

    Text: {excerpt}

    Provide the output in JSON format.
    """

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    # max_new_tokens bounds only the continuation; max_length would count the
    # prompt tokens as well and could leave no room for an answer.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The returned sequence starts with the prompt tokens; decode only what
    # follows so braces in the paper text cannot be mistaken for the answer.
    prompt_length = len(inputs["input_ids"][0])
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Extract JSON from the response
    details = _first_json_object(response)
    if details is None:
        return {"error": "Failed to extract JSON from model output"}
    return details



In [None]:
def process_pmc_papers(folder_path, tokenizer, model):
    """Run the extraction over every ``.xml`` file directly inside a folder.

    A file that cannot be parsed, or whose processing raises any other
    exception, is reported on stdout and left out of the result.

    Args:
        folder_path: Directory listed with ``os.listdir``.
        tokenizer: Forwarded to ``extract_preclinical_details``.
        model: Forwarded to ``extract_preclinical_details``.

    Returns:
        List of ``{"filename": ..., "details": ...}`` dicts, one per file that
        was processed without raising.
    """
    collected = []
    for entry in os.listdir(folder_path):
        # Only XML files are of interest; skip everything else early.
        if not entry.endswith(".xml"):
            continue

        xml_path = os.path.join(folder_path, entry)
        try:
            paper_text = extract_text_from_xml(xml_path)
            extracted = extract_preclinical_details(paper_text, tokenizer, model)
            collected.append({"filename": entry, "details": extracted})
        except ET.ParseError:
            print(f"Error parsing XML file: {entry}")
        except Exception as e:
            print(f"Error processing file {entry}: {str(e)}")
    return collected



In [None]:
def main(pmc_folder=""):
    """Run the LLM-based extraction over a folder of PMC XML files.

    Args:
        pmc_folder: Directory containing the ``.xml`` files. The default is an
            empty placeholder - update it, or pass the real path.

    The results are written to 'preclinical_details.json' in the current
    working directory. If ``pmc_folder`` is not an existing directory, a
    message is printed and nothing is loaded or written.
    """
    # Validate the folder before loading the model: loading is expensive, and
    # an empty or wrong path would otherwise only fail later in os.listdir.
    if not os.path.isdir(pmc_folder):
        print(
            f"Input directory {pmc_folder!r} does not exist. "
            "Set pmc_folder to the folder containing the PMC XML files."
        )
        return

    tokenizer, model = load_llama_model()
    results = process_pmc_papers(pmc_folder, tokenizer, model)

    # Save results to a JSON file
    with open("preclinical_details.json", "w") as f:
        json.dump(results, f, indent=2)

# Run the LLM-based extraction when this cell/script is executed directly
# (i.e. when the module name is "__main__").
if __name__ == "__main__":
    main()