In [15]:
# 📚 AI Agent - Find Author Publications (with Open Access Info)
!pip install requests transformers sentencepiece PyMuPDF tqdm --quiet
!pip install requests pandas arxiv rapidfuzz --quiet

import os
import requests
import pandas as pd
import arxiv
from rapidfuzz import fuzz

def norm(s):
    """Return ``s`` lower-cased with each whitespace run collapsed to one space.

    Falsy input (``None``, ``""``) yields the empty string.
    """
    if s:
        # str.split() with no separator splits on any whitespace (newlines
        # included) and drops leading/trailing runs, so joining the pieces
        # with single spaces already gives a trimmed, single-spaced string.
        words = s.split()
        return " ".join(words).lower()
    return ""

def openalex_author_works(author_name, max_works=50, min_match_score=50):
    """Return the most-cited articles and books of the best-matching OpenAlex author.

    Two requests are made: ``/authors`` (name search) to pick an author, then
    ``/works`` filtered by that author's ID and sorted by citation count.

    Args:
        author_name: Free-text author name to search for.
        max_works: Upper bound on the number of works inspected. Only one page
            is requested and the page size is clamped to 200, so larger values
            behave like 200.
        min_match_score: Minimum ``fuzz.partial_ratio`` score the best
            candidate's display name must reach against ``author_name``;
            below it the lookup is treated as "no such author".

    Returns:
        A list of dicts with keys ``title``, ``authors``, ``year``,
        ``work_type``, ``open_access``, ``pdf_url``, ``url``, ``source`` and
        ``doi``. Works whose type is neither ``article`` nor ``book`` are
        skipped. An empty list is returned when no author matches or when a
        request fails (network error, non-200 status, invalid JSON).
    """
    base = "https://api.openalex.org"

    def _get_results(path, params):
        """GET ``base + path`` and return its ``results`` list, or None on failure."""
        try:
            resp = requests.get(f"{base}{path}", params=params, timeout=30)
        except requests.RequestException:
            return None
        if resp.status_code != 200:
            return None
        try:
            payload = resp.json()
        except ValueError:
            return None
        return (payload or {}).get("results") or []

    # Step 1: find the author ID.
    candidates = _get_results("/authors", {"search": author_name, "per-page": 10})
    if not candidates:
        return []
    wanted = author_name.lower()
    scored = [
        (fuzz.partial_ratio(wanted, (cand.get("display_name") or "").lower()), cand)
        for cand in candidates
    ]
    # max() keeps the first of equally scored candidates, i.e. the API's own ranking.
    best_score, best_author = max(scored, key=lambda pair: pair[0])
    if best_score < min_match_score:
        return []
    # The ID arrives as a URL such as https://openalex.org/A5023888391; its last
    # path segment already carries the "A" prefix, so it must not be added again.
    author_id = (best_author.get("id") or "").rstrip("/").split("/")[-1]
    if not author_id:
        return []
    if not author_id.upper().startswith("A"):
        author_id = f"A{author_id}"

    # Step 2: fetch that author's works, most cited first.
    if max_works <= 0:
        return []
    docs = _get_results("/works", {
        "filter": f"author.id:{author_id}",
        "per-page": min(max_works, 200),  # ask for as many as the caller wants, up to one page
        "sort": "cited_by_count:desc",
    })
    if not docs:
        return []

    works = []
    for w in docs[:max_works]:
        work_type = w.get("type") or ""
        if work_type not in ("article", "book"):
            continue
        # JSON nulls come back as None, so `.get(key, {})` is not enough here.
        oa_info = w.get("open_access") or {}
        best_location = w.get("best_oa_location") or {}
        legacy_pdf = w.get("open_access_pdf") or {}  # key the previous version read
        author_names = [
            (au.get("author") or {}).get("display_name") or ""
            for au in (w.get("authorships") or [])
        ]
        works.append({
            "title": w.get("display_name"),
            "authors": ", ".join(name for name in author_names if name),
            "year": w.get("publication_year"),
            "work_type": work_type,
            "open_access": bool(oa_info.get("is_oa", False)),
            "pdf_url": best_location.get("pdf_url") or legacy_pdf.get("url"),
            "url": w.get("id"),
            "source": "OpenAlex",
            "doi": w.get("doi")
        })
    return works

def crossref_author_works(author_name, max_results=50):
    """Query the Crossref ``/works`` endpoint by author name.

    Args:
        author_name: Free-text author name, sent as ``query.author``.
        max_results: Number of rows requested (clamped to 1000). The type
            filter below is applied after the fetch, so fewer records than
            this may come back.

    Returns:
        A list of dicts with keys ``title``, ``authors``, ``year``,
        ``work_type``, ``open_access``, ``pdf_url``, ``url``, ``source`` and
        ``doi``. Only ``journal-article`` and ``book`` items are kept (reported
        as ``article`` / ``book``). ``open_access`` is always False and
        ``pdf_url`` always None for this source. An empty list is returned
        when the request fails (network error, non-200 status, invalid JSON).
    """
    if max_results <= 0:
        return []
    url = "https://api.crossref.org/works"
    params = {"query.author": author_name, "rows": min(max_results, 1000)}
    try:
        r = requests.get(url, params=params,
                         headers={"User-Agent":"research-agent/0.1"}, timeout=30)
    except requests.RequestException:
        return []
    if r.status_code != 200:
        return []
    try:
        payload = r.json()
    except ValueError:
        return []
    items = ((payload or {}).get("message") or {}).get("items") or []

    type_map = {"journal-article": "article", "book": "book"}
    works = []
    for it in items:
        work_type = type_map.get(it.get("type") or "")
        if work_type is None:
            continue
        # "title" may be missing or an empty list; both mean "untitled".
        title = (it.get("title") or [""])[0]
        authors = []
        for a in it.get("author") or []:
            name = " ".join(filter(None, [a.get("given",""), a.get("family","")]))
            # Entries without given/family parts may carry a single "name" field.
            name = name or (a.get("name") or "")
            if name:
                authors.append(name)
        # "date-parts" is a list of [year, month, day] lists; any level may be empty.
        date_parts = (it.get("issued") or {}).get("date-parts") or [[None]]
        first_date = date_parts[0] or [None]
        doi = it.get("DOI")
        url_link = f"https://doi.org/{doi}" if doi else it.get("URL")
        works.append({
            "title": title,
            "authors": ", ".join(authors),
            "year": first_date[0],
            "work_type": work_type,
            "open_access": False,  # CrossRef usually doesn’t provide OA info
            "pdf_url": None,
            "url": url_link,
            "source": "CrossRef",
            "doi": doi
        })
    return works

def arxiv_author_works(author_name, max_results=50):
    """Search arXiv for papers whose author field matches ``author_name``.

    Args:
        author_name: Author name, sent as a quoted ``au:"..."`` query.
            Embedded double quotes are removed and whitespace is collapsed so
            the name cannot terminate the quoted phrase early.
        max_results: Passed through to ``arxiv.Search``.

    Returns:
        A list of dicts with keys ``title``, ``authors``, ``year``,
        ``work_type``, ``open_access``, ``pdf_url``, ``url``, ``source`` and
        ``doi``. Every record is reported as an open-access ``article``.
        A blank name yields an empty list without contacting arXiv.
    """
    cleaned_name = " ".join((author_name or "").replace('"', " ").split())
    if not cleaned_name:
        return []

    client = arxiv.Client()
    search = arxiv.Search(
        query=f'au:"{cleaned_name}"',
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    works = []
    for r in client.results(search):
        # hasattr() alone would still let a None `published` through to `.year`.
        published = getattr(r, "published", None)
        works.append({
            "title": r.title,
            "authors": ", ".join(a.name for a in (r.authors or [])),
            "year": published.year if published is not None else None,
            "work_type": "article",
            "open_access": True,
            "pdf_url": r.pdf_url,
            "url": r.entry_id,
            "source": "arXiv",
            "doi": r.doi
        })
    return works

def aggregate_author_papers(author_name, max_per_source=50):
    """Collect an author's works from OpenAlex, CrossRef and arXiv into one table.

    Args:
        author_name: Name passed unchanged to each source lookup.
        max_per_source: Result limit handed to every source (arXiv included).

    Returns:
        A DataFrame with columns ``title``, ``authors``, ``year``,
        ``work_type``, ``open_access``, ``pdf_url``, ``source``, ``url`` and
        ``doi``, sorted by year (newest first). Rows whose normalised title
        repeats an earlier row are dropped, so the first source in the order
        OpenAlex, CrossRef, arXiv wins. Rows without a title are never treated
        as duplicates of each other. If nothing is found, an empty frame with
        the same columns is returned and a message is printed.
    """
    columns = ['title','authors','year','work_type','open_access','pdf_url','source','url','doi']
    sources = (
        ("OpenAlex", openalex_author_works),
        ("CrossRef", crossref_author_works),
        ("arXiv", arxiv_author_works),
    )

    records = []
    for label, fetch in sources:
        try:
            records.extend(fetch(author_name, max_per_source))
        except Exception as exc:
            # One unavailable service should not discard what the others returned;
            # report it and carry on.
            print(f"Warning: {label} lookup failed ({exc}); continuing without it.")

    df = pd.DataFrame(records)
    if df.empty:
        print("No works found.")
        return pd.DataFrame(columns=columns)

    def _title_key(title):
        """Normalised title used for de-duplication; "" for missing/non-text titles."""
        return norm(title) if isinstance(title, str) else ""

    title_key = df['title'].map(_title_key)
    # Only a non-empty key can mark a row as a repeat; otherwise every untitled
    # work would collapse into a single row.
    is_repeat = title_key.ne("") & title_key.duplicated(keep='first')
    df = df.loc[~is_repeat].copy()

    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    # A stable sort keeps source order among works from the same year.
    df = df.sort_values(by=['year'], ascending=False, kind='stable').reset_index(drop=True)
    return df[columns]

# Run
author_name = input("Enter professor/author name: ").strip()
if not author_name:
    # A blank name would otherwise be forwarded as-is to every source lookup.
    # Still define df_author so anything reading it afterwards finds a frame.
    df_author = pd.DataFrame()
    print("No author name entered; nothing to search.")
else:
    df_author = aggregate_author_papers(author_name, max_per_source=30)
    print(f"\nFound {len(df_author)} works for author '{author_name}':\n")
    if not df_author.empty:
        # Show at most the first 50 rows to keep the cell output readable.
        display(df_author.head(50))
    else:
        print("Consider trying a variant of the name.")


[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/install.py", line 578, in _determine_conflicts
    return check_install_conflicts(to_install)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/operations/check.py", line 101, in check_install_conflicts
    package_set, _

In [None]:
!pip install requests transformers sentencepiece PyMuPDF tqdm --quiet