In [60]:
# Bloco 1: Imports e configurações gerais
import asyncio
from datetime import datetime, timedelta, timezone
from urllib.parse import quote

import aiohttp
import pandas as pd
import requests

# MediaWiki Action API endpoint; used by search_titles and fetch_page.
WIKI_API = "https://en.wikipedia.org/w/api.php"
# Base URL of the per-article pageviews REST endpoint; used by fetch_views.
PAGEVIEWS_PER_ARTICLE = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article"
# Path segments that fetch_views interpolates into the pageviews URL.
PROJECT = "en.wikipedia"
ACCESS = "all-access"
AGENT = "user"
GRANULARITY = "daily"

# Search queries used to collect candidate titles; main() also reuses them
# (lower-cased) as keywords for the per-article topic score.
TOPICS = [
    "Artificial Intelligence", "AI", "Business",
    "Finance", "Machine Learning", "Neural Networks", "Tech"
]
MAX_SEARCH_PER_TOPIC = 100   # titles requested per topic search
MAX_CANDIDATES = 500         # how many candidate titles get their views measured
TOP_K = 50                   # how many final articles to keep



In [61]:
# Bloco 2: Obter títulos relevantes por tópico (síncrono)
def search_titles(topic, limit=MAX_SEARCH_PER_TOPIC, timeout=10):
    """Return article titles matching ``topic`` from the wiki search API.

    Sends a ``list=search`` query, sorted by relevance, to ``WIKI_API``.

    Args:
        topic: Search query string.
        limit: Maximum number of results to request (``srlimit``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        list[str]: Titles in the order the API returned them; empty when the
        response carries no ``query.search`` entries.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "action":  "query",
        "format":  "json",
        "list":    "search",
        "srsearch": topic,
        "srlimit": limit,
        "srsort":  "relevance",
        "srprop":  ""
    }
    r = requests.get(WIKI_API, params=params, timeout=timeout)
    # Surface HTTP failures instead of silently producing an empty candidate list.
    r.raise_for_status()
    hits = r.json().get("query", {}).get("search", [])
    return [hit["title"] for hit in hits]

def get_relevant_titles(topics, max_candidates=None):
    """Collect unique article titles across several search topics.

    Topics are searched in the given order through ``search_titles``. Titles
    keep their first-seen order and repeats (within or across topics) are
    dropped.

    Args:
        topics: Iterable of search query strings.
        max_candidates: Stop as soon as this many unique titles have been
            collected. ``None`` (default) uses the module-level
            ``MAX_CANDIDATES``, read at call time.

    Returns:
        list[str]: Unique titles in first-seen order.
    """
    cap = MAX_CANDIDATES if max_candidates is None else max_candidates
    titles = []
    seen = set()  # O(1) membership test; scanning the list made dedup quadratic
    for topic in topics:
        for title in search_titles(topic):
            if title in seen:
                continue
            seen.add(title)
            titles.append(title)
            if len(titles) >= cap:
                return titles
    return titles

# Candidate titles for the configured topics; the searches run when this cell executes.
relevant_titles = get_relevant_titles(TOPICS)


In [62]:
# Bloco 3: Buscar contagem de views por artigo (assíncrono)
async def fetch_views(session, title, date_dt):
    """Fetch the pageview count of one article for a single day.

    Args:
        session: Object exposing ``get(url)`` as an async context manager
            (an ``aiohttp.ClientSession`` in this notebook).
        title: Article title; spaces are converted to underscores.
        date_dt: Date-like object with ``strftime``; used as both the start
            and the end of the requested range.

    Returns:
        tuple: ``(title, views)`` with the title exactly as passed in.
        ``views`` is 0 when the response status is not 200 or when the
        payload contains no items.
    """
    date_str = date_dt.strftime("%Y%m%d")
    # Percent-encode the whole title (safe="") so characters such as "/", "?",
    # "#" and "%" stay inside the article path segment instead of changing
    # the URL structure.
    article = quote(title.replace(" ", "_"), safe="")
    url = (
        f"{PAGEVIEWS_PER_ARTICLE}/{PROJECT}/{ACCESS}/{AGENT}"
        f"/{article}/{GRANULARITY}/{date_str}/{date_str}"
    )
    async with session.get(url) as resp:
        if resp.status != 200:
            return title, 0
        data = await resp.json()
    # `or [{}]` also covers an explicit empty list, which would otherwise
    # raise IndexError on [0].
    items = data.get("items") or [{}]
    return title, items[0].get("views", 0)


In [63]:
# Block 4 (early draft): orchestrate everything, compute the score and sort — the later `main` (Bloco 5) overrides this definition
async def main(date_str: str = None):
    """Draft pipeline: fetch trending titles, load each page, score and rank them.

    NOTE(review): this definition looks stale. A later cell defines another
    ``main`` with the same name, which replaces this one when the notebook is
    run top to bottom. As written this version cannot run against the cells
    visible here (see the inline notes).

    Args:
        date_str: Passed through unchanged to ``fetch_trending``.

    Returns:
        pandas.DataFrame with columns id, title, url, summary, content,
        image, views and score, holding at most ``TOP_K`` rows.
    """
    async with aiohttp.ClientSession() as session:
        # NOTE(review): `fetch_trending` is not defined in any cell visible here.
        trending = await fetch_trending(session, date_str)
        # NOTE(review): called with two arguments, while the `fetch_page`
        # defined in a later cell takes (session, title, views).
        tasks = [fetch_page(session, t) for t in trending]
        raw = await asyncio.gather(*tasks)

    # Drop falsy results (fetch_page returns None for pages it cannot resolve).
    articles = [a for a in raw if a]
    topic_keys = [t.lower() for t in TOPICS]
    for art in articles:
        txt = " ".join([art["title"], art["summary"], art["content"]]).lower()
        # Score = number of topic keywords occurring as substrings of the text.
        art["score"] = sum(kw in txt for kw in topic_keys)

    # Descending sort keyed on (score, views): topic score is the primary key
    # and views only break ties.
    # NOTE(review): the previous comment described the opposite priority
    # (views first, then score) — confirm which order is intended.
    articles.sort(key=lambda x: (x["score"], x["views"]), reverse=True)
    top = articles[:TOP_K]
    return pd.DataFrame(top, columns=[
        "id", "title", "url", "summary", "content", "image", "views", "score"
    ])


In [64]:
# Bloco 4: Buscar detalhes da página (assíncrono)
async def fetch_page(session, title, views):
    """Fetch extract, canonical URL and lead image for one article.

    Args:
        session: Object exposing ``get(url, params=...)`` as an async context
            manager (an ``aiohttp.ClientSession`` in this notebook).
        title: Article title to look up; redirects are followed by the API.
        views: View count to attach to the result unchanged.

    Returns:
        dict with keys ``id``, ``title``, ``url``, ``summary``, ``content``,
        ``image`` and ``views``, or ``None`` when the response status is not
        200 or the page is missing / has no page id. ``image`` is ``None``
        when the page has no original image.
    """
    params = {
        "action":      "query",
        "format":      "json",
        "prop":        "extracts|info|pageimages",
        "explaintext": 1,
        "redirects":   1,
        "inprop":      "url",
        "piprop":      "original",
        "titles":      title
    }
    async with session.get(WIKI_API, params=params) as resp:
        # Mirror fetch_views: treat an error response as "no data" rather than
        # letting a failed JSON decode abort the caller's whole gather().
        if resp.status != 200:
            return None
        js = await resp.json()

    # A single title was requested, so only the first page entry is relevant.
    pages = js.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    pid = page.get("pageid")
    if not pid or "missing" in page:
        return None

    text = page.get("extract", "")
    # Summary = everything before the first blank line of the plain-text extract.
    summary = text.split("\n\n")[0] if text else ""
    return {
        "id":      pid,
        "title":   page.get("title", ""),
        "url":     page.get("fullurl", ""),
        "summary": summary,
        "content": text,
        "image":   page.get("original", {}).get("source"),
        "views":   views
    }


In [65]:
# Bloco 5: Orquestrar consultas, filtrar e montar DataFrame
async def main(date_str: str = None):
    """Rank the candidate articles by pageviews and return their details.

    Steps: resolve the target day, fetch the view count of every title in
    ``relevant_titles``, keep the ``TOP_K`` most viewed titles with more than
    zero views, fetch their page details, and add a topic-keyword score.

    Args:
        date_str: Target day formatted as ``"YYYY/MM/DD"``. When omitted, the
            day before the current UTC date is used.

    Returns:
        pandas.DataFrame with columns id, title, url, summary, content,
        image, views and score, sorted by views then score (both descending)
        with a fresh integer index. The frame is empty, but still has these
        columns, when no title had views.

    Raises:
        ValueError: If ``date_str`` does not match ``"%Y/%m/%d"``.
    """
    columns = ["id", "title", "url", "summary", "content", "image", "views", "score"]

    # Resolve the target date.
    if date_str:
        date_dt = datetime.strptime(date_str, "%Y/%m/%d").date()
    else:
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        date_dt = datetime.now(timezone.utc).date() - timedelta(days=1)

    async with aiohttp.ClientSession() as session:
        # Fetch view counts for every candidate concurrently.
        vs_results = await asyncio.gather(
            *(fetch_views(session, t, date_dt) for t in relevant_titles)
        )

        # Keep only titles with views > 0, most viewed first.
        ranked = sorted(
            ((t, v) for t, v in vs_results if v > 0),
            key=lambda tv: tv[1],
            reverse=True,
        )
        top_titles = [t for t, _ in ranked[:TOP_K]]

        # Build the lookup once instead of once per selected title.
        views_by_title = dict(vs_results)
        pages = await asyncio.gather(
            *(fetch_page(session, t, views_by_title[t]) for t in top_titles)
        )

    # Drop pages that could not be resolved (fetch_page returned None).
    articles = [p for p in pages if p]

    # Secondary signal: number of topic keywords found as substrings.
    tk = [kw.lower() for kw in TOPICS]
    for art in articles:
        txt = " ".join([art["title"], art["summary"], art["content"]]).lower()
        art["score"] = sum(kw in txt for kw in tk)

    # Explicit columns keep sort_values from raising KeyError on an empty result.
    df = pd.DataFrame(articles, columns=columns)
    return df.sort_values(["views", "score"], ascending=False).reset_index(drop=True)


In [66]:
# Bloco 6: Executar no Jupyter e exibir
# Top-level `await` works in Jupyter/IPython; a plain script would need asyncio.run(main()).
df = await main()               # default: yesterday (UTC)
# df = await main("2025/05/10") # specific date, formatted YYYY/MM/DD
df.head(10)


Unnamed: 0,id,title,url,summary,content,image,views,score
0,72417803,ChatGPT,https://en.wikipedia.org/wiki/ChatGPT,ChatGPT is a generative artificial intelligenc...,ChatGPT is a generative artificial intelligenc...,https://upload.wikimedia.org/wikipedia/commons...,77235,6
1,32058867,WhatsApp,https://en.wikipedia.org/wiki/WhatsApp,WhatsApp (officially WhatsApp Messenger) is an...,WhatsApp (officially WhatsApp Messenger) is an...,https://upload.wikimedia.org/wikipedia/commons...,16707,5
2,1164,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,Artificial intelligence (AI) refers to the cap...,Artificial intelligence (AI) refers to the cap...,,9883,7
3,48795986,OpenAI,https://en.wikipedia.org/wiki/OpenAI,"OpenAI, Inc. is an American artificial intelli...","OpenAI, Inc. is an American artificial intelli...",,7481,6
4,6886,Chicago,https://en.wikipedia.org/wiki/Chicago,Chicago is the most populous city in the U.S. ...,Chicago is the most populous city in the U.S. ...,https://upload.wikimedia.org/wikipedia/commons...,7167,4
5,79371410,Vibe coding,https://en.wikipedia.org/wiki/Vibe_coding,Vibe coding (or vibecoding) is an approach to ...,Vibe coding (or vibecoding) is an approach to ...,,5618,4
6,75223933,Grok (chatbot),https://en.wikipedia.org/wiki/Grok_(chatbot),Grok is a generative artificial intelligence c...,Grok is a generative artificial intelligence c...,https://upload.wikimedia.org/wikipedia/commons...,5422,5
7,78452842,DeepSeek,https://en.wikipedia.org/wiki/DeepSeek,Hangzhou DeepSeek Artificial Intelligence Basi...,Hangzhou DeepSeek Artificial Intelligence Basi...,,5138,4
8,73248112,Large language model,https://en.wikipedia.org/wiki/Large_language_m...,A large language model (LLM) is a type of mach...,A large language model (LLM) is a type of mach...,,4266,5
9,75743156,Perplexity AI,https://en.wikipedia.org/wiki/Perplexity_AI,"Perplexity AI, or simply Perplexity, is an Ame...","Perplexity AI, or simply Perplexity, is an Ame...",,3824,4
