# Stage 0 -- Gathering Relevant Articles in Full-Text Format

## Stage 0 -- I. Retrieving the DOI List of Articles via Elsevier API

In [None]:
import requests
import csv

# Your ScienceDirect API key
# Placeholder credential -- replace with your own ScienceDirect API key
elsevier_api_key = 'xxxxxxxxxxxxxxxxxxxx'

# ScienceDirect search endpoint and the query to run against it
url = 'https://api.elsevier.com/content/search/sciencedirect'
query_params = {
    'query': 'anaerobic biodegradation',
    'count': 10,  # Retrieve 10 articles
    'start': 0,   # Start at the 1st result
}

# The API key travels in a request header; the response is requested as JSON
headers = {
    'X-ELS-APIKey': elsevier_api_key,
    'Accept': 'application/json',
}

# An explicit timeout keeps the script from waiting indefinitely on a stalled connection
response = requests.get(url, params=query_params, headers=headers, timeout=60)

if response.status_code == 200:
    search_results = response.json()
    # Fall back to an empty list instead of raising KeyError when the payload
    # does not contain the expected 'search-results' / 'entry' keys
    articles = search_results.get('search-results', {}).get('entry', [])

    # Keep only the entries that actually carry a DOI
    doi_list = [article['prism:doi'] for article in articles if 'prism:doi' in article]

    # One DOI per row under a single "DOI" header
    with open('DOI_list.csv', 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['DOI'])
        csv_writer.writerows([doi] for doi in doi_list)

    print(f"Retrieved {len(doi_list)} DOIs and saved them to DOI_list.csv.")
else:
    print(f"Error: {response.status_code} - {response.text}")

## Stage 0 -- II. Using the DOI List to Download Articles (.xml)

In [None]:
# Get the full-text from DOI as XML file for multiple DOIs in a list

import httpx
import time
import csv

# Function to get full-text XML for a given DOI
def get_full_text_by_doi(paper_doi, apikey):
    """Request the full text of one article from the Elsevier article API.

    Args:
        paper_doi: DOI of the article; it is inserted into the request URL as-is.
        apikey: Elsevier API key, sent in the ``X-ELS-APIKey`` header.

    Returns:
        The ``httpx.Response`` for the request. The status code is printed but
        not checked here; callers decide how to handle non-200 responses.
    """
    headers = {
        "X-ELS-APIKey": apikey,
        "Accept": 'application/xml'  # Request XML format
    }

    # 60 seconds for connecting and 60 seconds as the default for the other phases
    timeout = httpx.Timeout(60.0, connect=60.0)

    # Elsevier API endpoint for retrieving full text by DOI
    url = f"https://api.elsevier.com/content/article/doi/{paper_doi}"

    # The context manager closes the client (and its connections) once the
    # request has completed, instead of leaving one open client per call
    with httpx.Client(timeout=timeout, headers=headers) as client:
        response = client.get(url)

    print(f"Request for DOI {paper_doi}: {response.status_code}")  # Print the response status
    return response

# Function to save XML content to file using order number for filename
def save_xml_content(xml_content, order_num):
    """Write one article's XML text to ``<order_num>_full_text.xml``.

    Args:
        xml_content: The XML document as a string.
        order_num: Value used as the filename prefix (e.g. 1 -> 1_full_text.xml).

    The file is written with UTF-8 encoding to a relative path, and a
    confirmation line is printed afterwards.
    """
    target_name = "{}_full_text.xml".format(order_num)

    with open(target_name, "w", encoding="utf-8") as out_file:
        out_file.write(xml_content)

    print("Saved XML for Order Number {} to {}".format(order_num, target_name))

# Your ScienceDirect API key
elsevier_api_key = 'xxxxxxxxxxxxxxxxxxxx'

# Path to the input CSV file containing DOIs
input_csv_file_path = 'DOI_list.csv'

# Path to the output CSV file where we'll save the order numbers and DOIs
output_csv_file_path = 'ordered_doi_list.csv'

# Read the list of DOIs from the input CSV file
with open(input_csv_file_path, newline='') as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # Skip the header; the default avoids StopIteration on an empty file
    # csv.reader yields an empty list for a blank line; skipping those keeps
    # row[0] from raising IndexError
    doi_list = [row[0] for row in csv_reader if row]

# Write the new CSV file with order numbers and DOIs
with open(output_csv_file_path, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Order Number', 'DOI'])  # Write the header

    # Order numbers start at 1 and double as the prefix of each saved XML file
    for i, doi in enumerate(doi_list, start=1):
        # The row is written before the download, so every DOI is listed even
        # when its full text cannot be retrieved
        csv_writer.writerow([i, doi])

        # Get the full-text XML for the current DOI
        response = get_full_text_by_doi(doi, elsevier_api_key)

        # Check if the response was successful
        if response.status_code == 200:
            xml_content = response.text

            # Save the XML content to a file named as "<order_number>_full_text.xml"
            save_xml_content(xml_content, i)
        else:
            print(f"Failed to retrieve XML for DOI {doi}, Status code: {response.status_code}")

        # Pause between requests to avoid hitting rate limits
        time.sleep(2)  # Adjust as needed (e.g., 2 seconds between requests)

print(f"Process completed. DOIs and order numbers saved to {output_csv_file_path}.")

## Stage 0 -- III. Cleaning the XML Files and Converting to Text (.txt)

In [None]:
import xml.etree.ElementTree as ET
import os

# Helper function to clean unnecessary tags and return paragraph text
def get_cleaned_paragraph(para_element):
    """Flatten a paragraph element into one whitespace-trimmed string.

    Walks ``para_element`` and all of its descendants. For every element whose
    tag is not in the skip list, the element's ``text`` and ``tail`` are each
    stripped and collected; the fragments are joined with single spaces.
    Elements whose tag is in the skip list contribute neither text nor tail.

    Args:
        para_element: An ``xml.etree.ElementTree.Element``.

    Returns:
        The joined text with leading and trailing whitespace removed.
    """
    # NOTE(review): ElementTree reports namespace-qualified tags as
    # '{uri}local', so these prefixed names may never match elements parsed
    # from namespaced XML -- confirm against real input.
    skipped_tags = (
        'ce:inf', 'ce:sup', 'ce:formula', 'ce:display', 'ce:italic', 'ce:chem',
        'ce:float-anchor', 'mml:mo', 'mml:mi', 'mml:mfenced', 'mml:msub',
        'mml:msup', 'ce:bold', 'ce:hsp', 'mml:math', 'ce:cross-refs',
        'ce:cross-ref', 'mml:mrow', 'mml:msubsup',
    )

    fragments = []
    for node in para_element.iter():
        if node.tag in skipped_tags:
            continue
        # text comes before the node's children, tail after its closing tag
        for piece in (node.text, node.tail):
            if piece:
                fragments.append(piece.strip())

    return " ".join(fragments).strip()

# Function to process an XML file and extract clean text
def process_xml_file(input_filename, output_filename):
    """Convert one article XML file into a text file of titles and paragraphs.

    In document order, every element whose tag ends in ``section-title`` and
    has text becomes a ``Title: ...`` entry, and every element whose tag ends
    in ``para`` with attribute ``view="all"`` becomes a ``Paragraph: ...``
    entry (skipped when its cleaned text is empty). Entries are separated by
    blank lines.

    Args:
        input_filename: Path of the UTF-8 XML file to read.
        output_filename: Path of the UTF-8 text file to write.
    """
    with open(input_filename, "r", encoding="utf-8") as src:
        root = ET.fromstring(src.read())

    pieces = []
    for node in root.iter():
        tag = node.tag

        # Section headings: only the heading element's own text is used
        if tag.endswith('section-title') and node.text:
            pieces.append("Title: " + node.text.strip())

        # Body paragraphs are flattened by get_cleaned_paragraph()
        if tag.endswith('para') and node.attrib.get('view') == 'all':
            body = get_cleaned_paragraph(node)
            if body:
                pieces.append("Paragraph: " + body)

    with open(output_filename, "w", encoding="utf-8") as dst:
        dst.write("\n\n".join(pieces))

    print(f"Processed and saved: {output_filename}")

# Convert every numbered XML download that is present on disk
for i in range(1, 111):  # File numbers 1 through 110
    input_filename = f"{i}_full_text.xml"    # e.g., 1_full_text.xml
    output_filename = f"{i}_clean_text.txt"  # e.g., 1_clean_text.txt

    # Numbers without a downloaded XML file are reported and skipped
    if not os.path.exists(input_filename):
        print(f"File {input_filename} does not exist.")
        continue

    process_xml_file(input_filename, output_filename)

## Stage 0 -- IV. Converting User-Provided PDF Files to Cleaned Text

In [11]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter

In [7]:
# Open the PDF to convert; replace '1_DOI.pdf' with the path of your own file.
pdfreader = PdfReader('1_DOI.pdf')

In [8]:
from typing_extensions import Concatenate
# Read the text of every page and concatenate it into raw_text. Pages for which
# extract_text() returns nothing are skipped; join() avoids building the string
# through repeated concatenation.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

# Stage 1 -- Screening Relevant Articles

In [None]:
from openai import OpenAI

# Module-level OpenAI client; chat_gpt() reads it as a global.
client = OpenAI(
    api_key="XXXXXXXXXXXXX",  # Replace with your actual OpenAI API key
)

def chat_gpt(clean_text):
    """Ask the model whether a paper meets the Stage 1 screening criteria.

    The prompt consists of an instruction, the paper text, and the numbered
    screening question(s). It is sent through the module-level ``client`` to
    the ``gpt-4o-mini`` chat model with ``temperature=0``.

    Args:
        clean_text: Text of the paper to screen.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # Screening criteria (a single question)
    questions = ["I want to collect the paper has anaerobic biodegradation of specific chemical in a batch reactor using sludge or sediment as the inoculum.\
        The chemical should not be surfactant, polymer, plastic, filler, raw wastewater, or hospital wastewater. The batch reactor should not be a membrane, bed, column reactor, UASB, or having continuous influence or electric supply.\
        The sludge or sediment should be directly used in the batch reactor rather than bacteria, colony, strain, or consortium."]

    # Instruction + paper text first, then the questions as "1. ...", "2. ..." lines
    header = f"Please analyze the following academic paper and answer the following questions first with 'yes' or 'no':\n\n{clean_text}\n\n and your reason for why it satisfies or not."
    numbered = "".join(
        f"{number}. {question}\n"
        for number, question in enumerate(questions, start=1)
    )
    prompt = header + numbered

    chat_messages = [
        {"role": "system", "content": "You are a research paper reviewer. Please use text and respond to the questions with 'yes' or 'no' and then reasons."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
    )

    return completion.choices[0].message.content.strip()


In [None]:

# Screen articles 1-100: send each cleaned text to chat_gpt() and keep the replies in order
stage_1_resu = []
for i in range(1, 101):
    file_path = f"{i}_clean_text.txt"
    print(i)

    with open(file_path, 'r', encoding='utf-8') as file:
        clean_text = file.read()
    print(len(clean_text))

    # Remove the "Title: " / "Paragraph: " labels before prompting
    cleaned_text = clean_text
    for label in ("Title: ", "Paragraph: "):
        cleaned_text = cleaned_text.replace(label, "")

    gpt_response = chat_gpt(cleaned_text)
    print(gpt_response)
    stage_1_resu.append(gpt_response)

In [None]:
# 1 question
import pandas as pd

# Pair each reply with its first character (presumably the 'y'/'n' of the verdict).
# Slicing with [:1] yields '' for an empty reply, where item[0] would raise IndexError.
formatted_data = [[item[:1], item] for item in stage_1_resu]

# One row per screened article: first character and the full reply
df = pd.DataFrame(formatted_data, columns=["Initial", "Full Answer"])

print(df)
df.to_excel('Stage_1_answer.xlsx')

# Stage 2 -- Extracting Data

## Stage 2 -- I. Summarizing Relevant Information

In [None]:
# Stage 2 questions; each one is appended to the prompt as a numbered line.
ques_text = ["Which chemicals' anaerobic biodegradation performance were studied? Use the full name of the chemicals. And how much of them were added?", 
 "How did they build the biodegradation experiment batch reactors, list all the content, especially not bypass the numbers?",
 "How much inoculum (sludge or sediment) was added in the anaerobic biodegradation experiments?",
 "How much liquid medium was added? What is the name and initial pH of the medium used in the batch reactors?",
 "What temperature and pH was the anaerobic biodegradation experiments(or batch reactors) incubated at?"]
 #,"Double check your answer based on my question."]

In [None]:
from openai import OpenAI

# Module-level OpenAI client; stageII_1st() reads it as a global.
client = OpenAI(
    api_key="...",  # Replace with your actual OpenAI API key
)

def stageII_1st(clean_text,questions):
    """Ask the model a list of free-text questions about one paper.

    The prompt is an instruction, the paper text, and then each question as a
    numbered line. It is sent through the module-level ``client`` to the
    ``gpt-4o-mini`` chat model with ``temperature=0``.

    Args:
        clean_text: Text of the paper.
        questions: Iterable of question strings; numbered from 1 in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # Instruction and paper text; the numbered questions follow immediately after
    intro = f"Please analyze this academic paper and answer the following questions. Note if no related infomation you can say there is no related infomation. \n{clean_text}"
    numbered = "".join(
        f"{number}. {question}\n"
        for number, question in enumerate(questions, start=1)
    )
    prompt = intro + numbered

    chat_messages = [
        {"role": "system", "content": "You are a research paper reviewer. Please use the text and respond to the questions with all relevant information."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
    )

    return completion.choices[0].message.content.strip()



In [None]:
# Collect the Stage 2 free-text answers for the selected articles
answers_all = []
useful_txt_ind = [1]#,4,9,11]

for i in useful_txt_ind:

    # Placeholder path -- point this at the cleaned text file for article i
    file_path = "C:..."

    with open(file_path, 'r', encoding='utf-8') as file:
        clean_text = file.read()

    # Remove the "Title: " / "Paragraph: " labels before prompting
    cleaned_text = clean_text.replace("Title: ", "").replace("Paragraph: ", "")

    # stageII_1st() is the function that takes (text, questions); chat_gpt()
    # accepts only the text, so calling it with two arguments raises TypeError
    ans = stageII_1st(cleaned_text, ques_text)
    answers_all.append(ans)
    print (len(answers_all))

## Stage 2 -- II. Extracting Target Features and Formatting as an Excel File

In [None]:
import openai
import constants


# Define the schema and required fields
# The dict is interpolated into the stageII_2nd() prompt as text. Every field is
# declared as a string except "Temperatures", which is declared as an integer.
schema = {
    "properties": {
        "chemical_names": {"type": "string"},
        "chemical_amounts": {"type": "string"},
        "inoculum_name": {"type": "string"},
        "inoculum_dosage": {"type": "string"},
        "medium_name": {"type": "string"},
        "medium_volume": {"type": "string"},
        "pHs": {"type": "string"},
        "Temperatures": {"type": "integer"}
    },
    "required": ["chemical_names", "chemical_amounts", "inoculum_name", "inoculum_dosage", "medium_name", "medium_volume", "pHs", "Temperatures"]
}
def stageII_2nd(gpt_response,schema):
    """Ask the model to restructure a free-text answer according to ``schema``.

    The prompt contains a description of each key, the schema itself
    (interpolated as text), and the text to extract from. It is sent through
    the module-level ``client`` to the ``gpt-4o-mini`` chat model with
    ``temperature=0``.

    Args:
        gpt_response: Free-text answer to extract the fields from.
        schema: Schema object; its string form is embedded in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # Prompt to guide the model to extract information as per the schema
    prompt = f"""
    Given the following text, act as a data analyst and collect relevant information from the text. 
    Please extract the corresponding information (corresponding to each key in the schema) from the inputs. 
    Here is the description for each key:
    chemical_names: what chemical is studied and try to list all chemicals studied for anaerobic biodegradation.
    chemical_amounts: The chemical concentrations in the batch reactor.
    inoculum_name: The inoculum name in batch reactor.
    inoculum_dosage: the amount or concentration of the added inoculum in batch reactor.
    medium_name: if just mentioned as medium, then just output medium.
    medium_volume: the amount of the added liquid medium or the working volume.
    pHs: the initial pH for the batch reactor.
    Temperatures: the batch reactor is incubated at what temperature.
    {schema}

    Text:
    {gpt_response}

    Return the data in a structured format based on the schema.
    """

    chat_messages = [
        {"role": "system", "content": "You are a data extractor."},
        {"role": "user", "content": prompt},
    ]

    # temperature=0 for more deterministic results
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
    )

    return completion.choices[0].message.content.strip()



In [None]:
import pandas as pd
import json
# Run both Stage 2 steps for each selected article: free-text answers first
# (stageII_1st), then schema-guided extraction from those answers (stageII_2nd).
# Placeholder call -- replace `...` with the path of the spreadsheet to read.
ind = pd.read_excel(...)
answers_1st = []
answers_2nd = []
# NOTE(review): if `ind` is a DataFrame, iterating over it below yields column
# labels rather than row values -- confirm which column holds the indices.
useful_txt_ind = ind

for i in useful_txt_ind:

    print (i)
    # Placeholder path -- presumably meant to be built from `i`; verify before running
    file_path = "C:..."

    with open(file_path, 'r', encoding='utf-8') as file:
        clean_text = file.read()

    # Remove the "Title: " / "Paragraph: " labels before prompting
    cleaned_text = clean_text.replace("Title: ", "").replace("Paragraph: ", "")
    
    ans_1st = stageII_1st(cleaned_text,ques_text)
    answers_1st.append(ans_1st)
    # The second step works on the first step's answer, not on the paper text
    ans_2nd = stageII_2nd(ans_1st,schema)
    answers_2nd.append(ans_2nd)


In [None]:
import pandas as pd
import json

# Remove Markdown code-fence markers from every reply, then decode each one as JSON
cleaned_parsed_data = []
for raw_reply in answers_2nd:
    unfenced = raw_reply.replace("```json", "").replace("```", "").strip()
    cleaned_parsed_data.append(unfenced)

parsed_data = []
for json_text in cleaned_parsed_data:
    parsed_data.append(json.loads(json_text))

# One row per article, plus its index and the free-text answer it was extracted from
df = pd.DataFrame(parsed_data)
df['DOI_ind'] = useful_txt_ind
df['ans_1st'] = answers_1st

# Show the table, then write it to Excel without the index column
print(df)
df.to_excel('output....xlsx', index=False)