In [15]:
###########################################################################
#                                                                         #
# EAP 5059 Métodos de Pesq Quali e Quant p Elab de Proj de Intervenção II #
#                                                                         #
# Nome: Anderson Hitoshi Uyekita                                          #
# NUSP: 5175471                                                           #
#                                                                         #
###########################################################################

"""
OpenAlex — SINGLE QUERY joining multiple keywords (exact phrases) with AND

Sends one search expression to the /works endpoint through the `search`
parameter, for example:
    "Circular Economy" AND "Industrial Symbiosis" AND "Sharing Economy"
The same query is requested page by page when more than 200 results are wanted.

Includes:
- HTTP session with retries and timeout
- Abstract reconstruction from abstract_inverted_index
- Optional pagination (>200 results)
- Sorting by citation count (descending)

Requirements: requests, pandas
"""

import math
import time
from typing import Iterable, List, Dict, Any, Optional

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib.parse import urlencode
from urllib3.util.retry import Retry

# =====================
# Configuration
# =====================
BASE_URL = "https://api.openalex.org/works"
EMAIL = "anderson.uyekita@gmail.com"
DEFAULT_TIMEOUT = 45             # request timeout, in seconds
MAX_PER_PAGE = 200               # OpenAlex limit of results per page

# Retry strategy for rate limiting (429) and transient server errors (5xx).
# Only GET requests are retried, at most 5 times, with backoff_factor=0.8.
RETRY_STRATEGY = Retry(
    total=5,
    backoff_factor=0.8,
    status_forcelist=(429, 500, 502, 503, 504),
    allowed_methods={"GET"},
)

# Shared HTTP session with an identifiable User-Agent (carrying the contact
# e-mail) and the retry strategy above. The adapter is mounted for the
# "https://" prefix only.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": f"openalex-single-and (mailto:{EMAIL})"})
SESSION.mount("https://", HTTPAdapter(max_retries=RETRY_STRATEGY))


def _http_get(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> Dict[str, Any]:
    """
    Fetch ``url`` through the shared session and decode the body as JSON.

    Args:
        url: Fully built request URL.
        timeout: Timeout, in seconds, handed to ``SESSION.get``.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` when the
            response carries an error status.
    """
    response = SESSION.get(url, timeout=timeout)
    # Surface error statuses before attempting to decode the body.
    response.raise_for_status()
    payload = response.json()
    return payload


# =====================
# Utilidades
# =====================
def reconstruir_abstract(abstract_inverted_index: Optional[Dict[str, List[int]]]) -> Optional[str]:
    """
    Rebuild a plain-text abstract from an OpenAlex inverted index.

    The index maps each token to the list of word positions where it occurs;
    the text is recovered by placing every token at its positions and joining
    them in ascending position order.

    Args:
        abstract_inverted_index: Mapping ``token -> [positions]``, or ``None``.

    Returns:
        The reconstructed text, or ``None`` when the index is missing, empty,
        or contains no positions at all.
    """
    if not abstract_inverted_index:
        return None

    # position -> token. A dict (instead of a list sized by the largest
    # position) keeps memory proportional to the number of tokens and copes
    # with gaps in the position sequence.
    token_at: Dict[int, str] = {}
    for token, positions in abstract_inverted_index.items():
        # `or ()` tolerates a null position list instead of raising TypeError.
        for pos in positions or ():
            token_at[pos] = token

    if not token_at:
        # Every position list was empty: nothing to rebuild (the previous
        # implementation crashed here with "max() arg is an empty sequence").
        return None

    text = " ".join(token_at[pos] for pos in sorted(token_at) if token_at[pos])

    # Simple fix-up for punctuation that was indexed as a separate token.
    for mark in (",", ".", ";", ":"):
        text = text.replace(f" {mark}", mark)
    return text


# =====================
# Única query com AND
# =====================
def _build_search_and_query(keywords: Iterable[str]) -> str:
    """
    Monta a expressão booleana para o parâmetro `search`, do tipo:
        "kw1" AND "kw2" AND "kw3"
    - Usa aspas para tratar cada keyword como *frase exata*
    - Remove vazios/espacos duplicados
    """
    kws = [str(k).strip() for k in keywords if k and str(k).strip()]
    if not kws:
        raise ValueError("A lista de keywords está vazia.")
    quoted = [f'"{k}"' for k in kws]
    return " AND ".join(quoted)


def _montar_url_busca_and(
    keywords: Iterable[str],
    *,
    ano_inicial: int,
    idioma: Optional[str],
    tipo: Optional[str],
    per_page: int,
    page: int,
) -> str:
    """
    Assemble the full /works search URL.

    The query string carries, in this order:
    - search: the AND expression built by ``_build_search_and_query``
    - filter: minimum publication date (1 January of ``ano_inicial``) plus the
      optional language and type filters, comma-separated
    - sort: ``cited_by_count:desc``
    - per_page / page: pagination
    - mailto: the contact e-mail (good practice recommended by OpenAlex)

    Args:
        keywords: Phrases to be AND-ed together.
        ano_inicial: First publication year to include.
        idioma: Language filter value, skipped when falsy.
        tipo: Work type filter value, skipped when falsy.
        per_page: Page size to request.
        page: Page number to request.

    Returns:
        ``BASE_URL`` followed by the URL-encoded query string.
    """
    filter_parts = [f"from_publication_date:{ano_inicial}-01-01"]
    # Optional filters are appended only when a value was supplied.
    filter_parts += [
        f"{campo}:{valor}"
        for campo, valor in (("language", idioma), ("type", tipo))
        if valor
    ]

    query_items = [
        ("search", _build_search_and_query(keywords)),
        ("filter", ",".join(filter_parts)),
        ("sort", "cited_by_count:desc"),
        ("per_page", per_page),
        ("page", page),
        ("mailto", EMAIL),
    ]
    return BASE_URL + "?" + urlencode(query_items)


def _work_para_linha(
    work: Dict[str, Any],
    *,
    keywords_txt: str,
    incluir_abstract: bool,
) -> Dict[str, Any]:
    """Flatten one OpenAlex work record into the flat row stored in the DataFrame."""
    # Null-safe author extraction: a null `authorships`, `author` or
    # `display_name` is skipped instead of breaking the join.
    nomes = []
    for autoria in work.get("authorships") or []:
        nome = (autoria.get("author") or {}).get("display_name")
        if nome:
            nomes.append(str(nome))

    fonte = (work.get("primary_location") or {}).get("source") or {}
    abstract_txt = (
        reconstruir_abstract(work.get("abstract_inverted_index"))
        if incluir_abstract
        else None
    )

    return {
        "keywords": keywords_txt,
        "titulo": work.get("title"),
        "ano": work.get("publication_year"),
        "citas": work.get("cited_by_count"),
        "doi": work.get("doi"),
        "venue": fonte.get("display_name"),
        "autores": ", ".join(nomes),
        "open_access_url": (work.get("open_access") or {}).get("oa_url"),
        "openalex_id": work.get("id"),
        "abstract": abstract_txt,
    }


def buscar_por_keywords_and_single_query(
    keywords: Iterable[str],
    *,
    ano_inicial: int = 2020,
    idioma: Optional[str] = None,
    tipo: Optional[str] = None,
    n: int = 200,
    incluir_abstract: bool = True,
    sleep_between_pages: float = 0.2,
) -> pd.DataFrame:
    """
    Run ONE /works search whose `search` expression is
        "kw1" AND "kw2" AND "kw3"
    and return the results as a DataFrame sorted by citation count (desc).

    Args:
        keywords: Exact phrases, e.g.
            ["Circular Economy", "Industrial Symbiosis", "Sharing Economy"].
            A bare string is treated as a single phrase.
        ano_inicial: Keep works published from this year on (inclusive).
        idioma: Language filter, e.g. "en", "pt" (optional).
        tipo: Work type filter, e.g. "article", "preprint", "book-chapter"
            (optional).
        n: Maximum number of rows wanted; values below 1 are raised to 1.
            Pages are requested automatically until about ``n`` rows are
            collected.
        incluir_abstract: Rebuild the abstract text when the work has one.
        sleep_between_pages: Pause, in seconds, between consecutive page
            requests, to be gentle with the API.

    Returns:
        A DataFrame with the columns keywords, titulo, ano, citas, doi, venue,
        autores, open_access_url, openalex_id and abstract. The columns are
        present even when the search returns nothing.

    Raises:
        ValueError: If no usable keyword is supplied.

    NOTE(review): this relies on page/per_page paging; OpenAlex documents a
    cap on how deep that style of paging can go, so very large ``n`` may need
    cursor paging instead — confirm against the API documentation.
    """
    if isinstance(keywords, str):
        # A lone string is one phrase, not an iterable of characters.
        keywords = [keywords]

    # Materialise the iterable: it is reused for every page and for the
    # "keywords" column.
    keywords_list = [str(k).strip() for k in keywords if k and str(k).strip()]
    if not keywords_list:
        raise ValueError("A lista de keywords está vazia.")

    n = max(1, int(n))
    per_page = min(MAX_PER_PAGE, n)
    total_pages = math.ceil(n / per_page)
    keywords_txt = ", ".join(keywords_list)

    colunas = [
        "keywords", "titulo", "ano", "citas", "doi", "venue",
        "autores", "open_access_url", "openalex_id", "abstract",
    ]
    linhas: List[Dict[str, Any]] = []
    vistos = set()

    for page in range(1, total_pages + 1):
        if page > 1:
            # Pause only BETWEEN requests, never after the last one.
            time.sleep(sleep_between_pages)

        url = _montar_url_busca_and(
            keywords_list,
            ano_inicial=ano_inicial,
            idioma=idioma,
            tipo=tipo,
            per_page=per_page,
            page=page,
        )
        data = _http_get(url)
        results = data.get("results") or []

        for work in results:
            wid = work.get("id")
            if not wid or wid in vistos:
                continue
            vistos.add(wid)
            linhas.append(
                _work_para_linha(
                    work,
                    keywords_txt=keywords_txt,
                    incluir_abstract=incluir_abstract,
                )
            )

        # Page size the server reports having applied, when present; falls
        # back to the size that was requested.
        tamanho_pagina = (data.get("meta") or {}).get("per_page") or per_page
        # Stop when enough rows were gathered or the page came back short
        # (an empty page included): there is nothing left to fetch.
        if len(linhas) >= n or len(results) < tamanho_pagina:
            break

    # Passing `columns` keeps the schema stable for an empty result.
    df = pd.DataFrame(linhas, columns=colunas)
    if not df.empty:
        df = df.sort_values(by=["citas"], ascending=False, kind="stable").reset_index(drop=True)
        # Trim to n in case the last page overshot the target.
        if len(df) > n:
            df = df.iloc[:n].reset_index(drop=True)
    return df


# =====================
# Exemplo de uso
# =====================
if __name__ == "__main__":
    # Example run: a single exact phrase, English-language articles published
    # from 2015 on, up to 300 rows. The result is bound to the module-level
    # name `df`.
    keywords = [
        "Agro-industrial Symbiosis"
    ]
    df = buscar_por_keywords_and_single_query(
        keywords,
        ano_inicial=2015,
        idioma="en",        # e.g. "en"
        tipo="article",     # e.g. None to fetch every work type
        n=300,              # paginates automatically when n > 200
        incluir_abstract=True,
    )

    # Show the first rows (uncomment to test locally)
    # with pd.option_context("display.max_colwidth", 120):
    #     print(df.head(20).to_string(index=False))

    # (Optional) save to CSV
    # df.to_csv("openalex_AND_circular_symbiosis_sharing.csv", index=False, encoding="utf-8")


In [16]:
df

Unnamed: 0,keywords,titulo,ano,citas,doi,venue,autores,open_access_url,openalex_id,abstract
0,Agro-industrial Symbiosis,Critical success and risk factors for circular...,2020,199,https://doi.org/10.1016/j.resconrec.2020.105236,Resources Conservation and Recycling,"Mechthild Donner, Anne Verniquet, J. Broeze, K...",https://doi.org/10.1016/j.resconrec.2020.105236,https://openalex.org/W3097098960,"For a transition from a linear, 'take-make-dis..."
1,Agro-industrial Symbiosis,The Potential of Industrial Symbiosis: Case An...,2019,120,https://doi.org/10.3390/su11247095,Sustainability,"Ângela Neves, Radu Godina, Susana Garrido Azev...",https://www.mdpi.com/2071-1050/11/24/7095/pdf?...,https://openalex.org/W2996100567,"Industrial symbiosis, which is characterised m..."
2,Agro-industrial Symbiosis,Biorefining and industrial symbiosis: A propos...,2017,50,https://doi.org/10.1016/j.jclepro.2017.12.107,Journal of Cleaner Production,"Victoria Emilia Neves Santos, Alessandra Magrini",,https://openalex.org/W2778960058,
3,Agro-industrial Symbiosis,Cement-based concrete modified with Vitellaria...,2022,37,https://doi.org/10.1016/j.conbuildmat.2022.127906,Construction and Building Materials,"Solomon Oyebisi, Thamer Alomayri",http://eprints.covenantuniversity.edu.ng/17453...,https://openalex.org/W4281384947,
4,Agro-industrial Symbiosis,Agro-Industrial Symbiosis and Alternative Heat...,2021,20,https://doi.org/10.3390/su13169040,Sustainability,"Miika Marttila, Ville Uusitalo, Lassi Linnanen...",https://www.mdpi.com/2071-1050/13/16/9040/pdf?...,https://openalex.org/W3188598539,"Greenhouses require large amounts of energy, w..."
5,Agro-industrial Symbiosis,Industrial symbiosis and agri-food system: The...,2023,17,https://doi.org/10.3389/fsufs.2022.1012436,Frontiers in Sustainable Food Systems,"Manal Hamam, Daniela Spina, Maria Raimondo, Gi...",https://doi.org/10.3389/fsufs.2022.1012436,https://openalex.org/W4315779894,Industrial symbiosis is an eco-innovative syst...
6,Agro-industrial Symbiosis,Re-organise: Game-Based Learning of Circular B...,2022,7,https://doi.org/10.3389/frsus.2022.809700,Frontiers in Sustainability,"Kasper Lange, Gijsbert Korevaar, Inge Oskam, P...",https://www.frontiersin.org/articles/10.3389/f...,https://openalex.org/W4281753412,This study furthers game-based learning for ci...
7,Agro-industrial Symbiosis,Life Cycle Assessment of Oat Flake Production ...,2023,6,https://doi.org/10.3390/su15065124,Sustainability,"Luciano Viana, Pierre-Luc Dessureault, Charles...",https://www.mdpi.com/2071-1050/15/6/5124/pdf?v...,https://openalex.org/W4324142830,Canada is one of the world’s largest producers...
8,Agro-industrial Symbiosis,Optimization path of agricultural products mar...,2023,6,https://doi.org/10.1007/s10644-023-09495-8,Economic Change and Restructuring,"Tao Zhai, Jiabin Liu, Daqing Wang",https://link.springer.com/content/pdf/10.1007/...,https://openalex.org/W4323049704,
9,Agro-industrial Symbiosis,Research on Spatial Planning of Petrochemical ...,2022,5,https://doi.org/10.3390/su14084580,Sustainability,"Min Wang, Xiaohan Yuan, Shuqi Yang, Kahaer Abu...",https://www.mdpi.com/2071-1050/14/8/4580/pdf?v...,https://openalex.org/W4223944481,As a practical exploration of industry ecologi...
