In [3]:
import requests
from transformers import pipeline
from langchain.agents import initialize_agent
from langchain.tools import Tool
import xml.etree.ElementTree as ET
import streamlit as st
from langchain.agents import initialize_agent
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI 
from langchain_community.llms import OpenAI
import xml.etree.ElementTree as ET


In [24]:
def fetch_arxiv(query, max_results=5):
    """Fetch articles from arXiv using flexible query syntax."""
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": f"all:{query}", 
        "start": 0,
        "max_results": max_results
    }
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching ArXiv articles: {e}")
        return None



In [4]:
def parse_arxiv_response(xml_response):
    """Parse the arXiv XML response into a standardized list of articles."""
    try:
        root = ET.fromstring(xml_response)  # Parse the XML
    except ET.ParseError as e:
        print(f"Error parsing ArXiv XML: {e}")
        return []

    namespace = {"atom": "http://www.w3.org/2005/Atom"}

    articles = []
    for entry in root.findall("atom:entry", namespace):
        title = entry.find("atom:title", namespace).text.strip() if entry.find("atom:title", namespace) is not None else "No title"
        summary = entry.find("atom:summary", namespace).text.strip() if entry.find("atom:summary", namespace) is not None else "No abstract"
        link = entry.find("atom:id", namespace).text.strip() if entry.find("atom:id", namespace) is not None else "No link"

        authors = []
        for author in entry.findall("atom:author", namespace):
            name = author.find("atom:name", namespace).text.strip() if author.find("atom:name", namespace) is not None else "Unknown Author"
            authors.append(name)

        articles.append({
            "title": title,
            "abstract": summary,
            "link": link,
            "authors": ", ".join(authors)
        })

    return articles

In [6]:
def summarize_text(text):
    """Summarize the given text, dynamically adjusting max_length."""
    text_length = len(text.split())
    max_length = min(100, text_length + 10)  
    return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']


def search_and_summarize(query, max_results=5):
    """Search and summarize articles from arxiv"""
    arxiv_data = fetch_arxiv(query, max_results=max_results)
    arxiv_articles = parse_arxiv_response(arxiv_data) if arxiv_data else []

    all_articles = acs_articles

    for article in all_articles:
        if "abstract" in article and article["abstract"] != "No abstract":
            article["summary"] = summarize_text(article["abstract"])
        else:
            article["summary"] = "No abstract available for summarization."

    return all_articles

## Toools for Agent

In [7]:
fetch_arxiv_tool = Tool(
    name="FetchArxiv",
    func=lambda query: parse_arxiv_response(fetch_arxiv(query)),
    description=(
        "Fetches articles from arXiv based on a query. "
        "Returns a list of articles with titles, abstracts, and links."
    )
)

summarize_tool = Tool(
    name="SummarizeText",
    func=summarize_text,
    description=(
        "Summarizes a given text using a pre-trained summarization model. "
        "The input text should be a single abstract or article content."
    )
)

## Initialization of the Agent

In [8]:
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)  
agent = initialize_agent(
    tools=[fetch_arxiv_tool, summarize_tool],
    llm=llm,
    agent="zero-shot-react-description",
    verbose=True
)


  agent = initialize_agent(


## Entering the task

In [9]:
query = "Drug discovery"
response = agent.run(query)
print(response)


  response = agent.run(query)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find recent research articles related to drug discovery to provide a comprehensive answer. 
Action: FetchArxiv
Action Input: "Drug discovery"[0m
Observation: [36;1m[1;3m[{'title': 'Energy-based Generative Models for Target-specific Drug Discovery', 'abstract': 'Drug targets are the main focus of drug discovery due to their key role in\ndisease pathogenesis. Computational approaches are widely applied to drug\ndevelopment because of the increasing availability of biological molecular\ndatasets. Popular generative approaches can create new drug molecules by\nlearning the given molecule distributions. However, these approaches are mostly\nnot for target-specific drug discovery. We developed an energy-based\nprobabilistic model for computational target-specific drug discovery. Results\nshow that our proposed TagMol can generate molecules with similar binding\naffinity scores as real molecules. GAT-based models showed

In [10]:
query = "machine learning"
results = fetch_arxiv(query, max_results=5)
parsed_articles = parse_arxiv_response(results)

In [12]:
parsed_articles

[{'title': 'Lecture Notes: Optimization for Machine Learning',
  'abstract': 'Lecture notes on optimization for machine learning, derived from a course at\nPrinceton University and tutorials given in MLSS, Buenos Aires, as well as\nSimons Foundation, Berkeley.',
  'link': 'http://arxiv.org/abs/1909.03550v1',
  'authors': 'Elad Hazan'},
 {'title': 'An Optimal Control View of Adversarial Machine Learning',
  'abstract': "I describe an optimal control view of adversarial machine learning, where the\ndynamical system is the machine learner, the input are adversarial actions, and\nthe control costs are defined by the adversary's goals to do harm and be hard\nto detect. This view encompasses many types of adversarial machine learning,\nincluding test-item attacks, training-data poisoning, and adversarial reward\nshaping. The view encourages adversarial machine learning researcher to utilize\nadvances in control theory and reinforcement learning.",
  'link': 'http://arxiv.org/abs/1811.04422v1

In [13]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Device set to use mps:0


In [17]:
test_abstract = """Machine learning is a field of artificial intelligence (AI) that uses algorithms
and statistical models to perform tasks without using explicit instructions, relying
instead on patterns and inference. It is widely used in various domains."""

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)  # Use CPU
test_summary = summarizer(test_abstract, max_length=50, min_length=20, do_sample=False)
print(test_summary)

Device set to use cpu
Your max_length is set to 50, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


[{'summary_text': ' Machine learning is a field of artificial intelligence (AI) that uses algorithmsand statistical models to perform tasks without using explicit instructions, relying on patterns and inference . It is widely used in various domains .'}]


In [18]:
for article in parsed_articles:
    if article["abstract"] and article["abstract"] != "No abstract":
        article["summary"] = summarizer(article["abstract"], max_length=100, min_length=50, do_sample=False)[0]["summary_text"]
    else:
        article["summary"] = "No abstract available for summarization."

for article in parsed_articles:
    if article["abstract"] and article["abstract"] != "No abstract":
    
        input_length = len(article["abstract"].split())
        max_len = min(50, input_length + 10)  
        min_len = max(10, int(max_len * 0.5))  

        if min_len >= max_len:
            min_len = max_len - 1  

        try:
            summary_result = summarizer(
                article["abstract"],
                max_length=max_len,
                min_length=min_len,
                do_sample=False,
            )
            print(f"Summary Result: {summary_result}")  
            article["summary"] = summary_result[0]["summary_text"]
        except Exception as e:
            print(f"Error summarizing article: {e}")
            article["summary"] = "Summary generation failed."

for article in parsed_articles:
    print(f"Title: {article['title']}")
    print(f"Abstract: {article['abstract']}")
    print(f"Summary: {article['summary']}")
    print("-" * 80)  

Your max_length is set to 100, but your input_length is only 40. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Your max_length is set to 100, but your input_length is only 51. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


Summary Result: [{'summary_text': ' Lecture notes on optimization for machine learning, derived from a course at Princeton University and tutorials given in MLSS, Buenos Aires, as well as MLSS .'}]
Summary Result: [{'summary_text': " An optimal control view of adversarial machine learning is defined by the adversary's goals to do harm and be hard to detect . The view includes test-item attacks, training-data poisoning and adversarial reward-shaping ."}]
Summary Result: [{'summary_text': ' The article is devoted to the problem of small learning samples in machine learning . The flaws of maximum likelihood learning and minimax learning are looked at . The concept of minimax deviation learning is introduced .'}]
Summary Result: [{'summary_text': ' In this chapter, we provide a brief overview of applying machine learning techniques for clinical prediction tasks . We begin with a quick introduction to the concepts of machine learning and outline some of the most common machine learning algo

## Further tests with ACS scrapping

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def fetch_acs_articles_selenium(query, max_results=5):
    """Fetch articles from ACS using Selenium."""
    base_url = f"https://pubs.acs.org/action/doSearch?AllField={query}"

    # Set up Selenium with headless Chrome
    service = Service("/Users/davidsegura/chromedriver-mac-arm64/chromedriver")
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")  
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(base_url)

        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "issue-item_metadata"))
        )

        articles = []
        results = driver.find_elements(By.CLASS_NAME, "issue-item_metadata")

        for result in results[:max_results]:  
            try:
                title = result.find_element(By.CLASS_NAME, "issue-item_title").text

                # Find the abstract link
                try:
                    abstract_link = result.find_element(By.XPATH, ".//a[@title='Abstract']")
                    abstract_url = abstract_link.get_attribute("href")
                except Exception:
                    abstract_url = None

                # Open abstract page and extract text
                abstract_text = "No abstract available"
                if abstract_url:
                    driver.execute_script("window.open(arguments[0]);", abstract_url)
                    driver.switch_to.window(driver.window_handles[-1])  
                    time.sleep(5)  

                    try:
                        abstract_elements = driver.find_elements(By.CLASS_NAME, "articleBody_abstractText")
                        if not abstract_elements:
                            abstract_elements = driver.find_elements(By.CLASS_NAME, "NLM_abs")  # Alternative selector

                        if abstract_elements:
                            abstract_text = " ".join([element.text for element in abstract_elements])
                        else:
                            print(f"Could not extract abstract for {title}")

                    except Exception as e:
                        print(f"Error extracting abstract for {title}: {e}")

                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])  # Back to main page

                link = result.find_element(By.TAG_NAME, "a").get_attribute("href")
                articles.append({"title": title, "abstract": abstract_text, "link": link})

            except Exception as e:
                print(f"Error processing an article: {e}")
                continue

        return articles

    except Exception as e:
        print(f"Error fetching ACS articles using Selenium: {e}")
        return []

    finally:
        driver.quit()

query = "machine learning for chemistry"
articles = fetch_acs_articles_selenium(query, max_results=5)

if articles:
    for article in articles:
        print(f"Title: {article['title']}")
        print(f"Abstract: {article['abstract']}")
        print(f"Link: {article['link']}")
        print("-" * 80)
else:
    print("No articles found or an error occurred.")
