# In [None]:
import re, html, json, urllib.parse
from datetime import datetime, timedelta, timezone
import requests
from bs4 import BeautifulSoup

# ========= SETTINGS =========
# Lowercase substrings; a headline is kept when it contains at least one of them.
KEYWORDS = [
    'трамп', 'trump', 'дональд',
    'сша', 'герман', 'китай', 'росси',
    'эконом', 'инфляц', 'ставк', 'ввп',
    'moex', 'фрс', 'ecb',
]
# Size of the look-back window, in days.
DAYS_BACK = 2
# Fixed UTC+6 offset used when printing dates (Asia/Bishkek).
TZ_OUT = timezone(timedelta(hours=6))
# Maximum number of items kept per source.
MAX_NEWS_PER_SITE = 50
# Caps on how many listing links are inspected on the "heavy" feeds.
MAX_LINKS_TASS = MAX_LINKS_INTERFAX = 120
MAX_LINKS_FINMARKET = 150

# ========= SOURCES =========
IZ_BASE = "https://iz.ru"
IZ_URL = IZ_BASE + "/news"
RBC_URL = "https://www.rbc.ru/economics/"
DW_RSS = "https://rss.dw.com/rdf/rss-ru-all"
TASS_LIST = "https://tass.ru/ekonomika"
INTERFAX_LIST = "https://www.interfax.ru/business"
FINMARKET_LIST = "https://www.finmarket.ru/news/"

# Browser-like headers attached to every request issued through _req().
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/125.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8",
    "Connection": "keep-alive",
    "Referer": "https://google.com",
}

# ========= HELPERS =========
# Russian month names (genitive case, lowercase) mapped to month numbers 1..12.
RU_MONTHS = {
    name: number
    for number, name in enumerate(
        (
            'января', 'февраля', 'марта', 'апреля', 'мая', 'июня',
            'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря',
        ),
        start=1,
    )
}

# ISO 8601 timestamp with optional seconds and a mandatory "Z" or "+hh:mm" / "-hh:mm" suffix.
ISO_RE = re.compile(
    r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z|[+\-]\d{2}:\d{2})',
    re.IGNORECASE,
)

# Numeric date "d.m.yyyy", optionally followed by whitespace and "h:mm".
RU_NUM_RE = re.compile(
    r'(?P<d>\d{1,2})\.(?P<m>\d{1,2})\.(?P<y>\d{4})'
    r'(?:\s+(?P<h>\d{1,2}):(?P<min>\d{2}))?'
)

# Textual date "<day> <month word> <year>", optionally followed by "г." / "года",
# a comma or whitespace, and "h:mm".
RU_TXT_RE = re.compile(
    r'(?P<d>\d{1,2})\s+(?P<mon>[А-Яа-я]+)\s+(?P<y>\d{4})'
    r'(?:\s*г\.?|(?:\s*года)?)?'
    r'(?:,\s*|\s+)?'
    r'(?:(?P<h>\d{1,2}):(?P<min>\d{2}))?',
    re.IGNORECASE,
)

def _now_utc(): return datetime.now(timezone.utc)
def _within_days(dt, days):
    """Tell whether ``dt`` is no older than ``days`` days relative to now.

    A naive ``dt`` is interpreted as UTC. Timestamps in the future also
    pass, because their age is negative.
    """
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    age = _now_utc() - aware
    return age <= timedelta(days=days)
def _kw_ok(title):
    """Return True when the lowercased title contains any KEYWORDS entry.

    A missing or empty title never matches.
    """
    lowered = title.lower() if title else ''
    for keyword in KEYWORDS:
        if keyword in lowered:
            return True
    return False

def _req(session, url, timeout=20):
    """GET ``url`` through ``session`` and return the response with a usable text encoding.

    Args:
        session: object exposing a requests-style ``get(url, headers=, timeout=)``.
        url: address to fetch.
        timeout: timeout in seconds passed to ``session.get``.

    Returns:
        The response object, with ``encoding`` set so that ``.text`` decodes sensibly.

    Raises:
        Whatever ``session.get`` or ``raise_for_status`` raise (for requests,
        subclasses of ``requests.RequestException``).
    """
    r = session.get(url, headers=HEADERS, timeout=timeout)
    r.raise_for_status()
    declared = (r.encoding or '').lower()
    content_type = (r.headers.get('Content-Type') or '').lower()
    # requests fills in ISO-8859-1 for text/* responses that carry no charset,
    # so a plain "r.encoding or ..." never reaches the detection fallback and
    # non-Latin pages decode as mojibake. Treat that implicit default as
    # "unknown" and let content sniffing decide.
    implicit_latin1 = declared == 'iso-8859-1' and 'charset' not in content_type
    if not declared or implicit_latin1:
        r.encoding = r.apparent_encoding or 'utf-8'
    return r

def _parse_iso_or_rfc(s):
    if not s: return None
    s = s.strip()
    try:
        if s.endswith('Z'): dt = datetime.fromisoformat(s.replace('Z', '+00:00'))
        else: dt = datetime.fromisoformat(s)
        return dt.astimezone(timezone.utc)
    except: pass
    try:
        from email.utils import parsedate_to_datetime
        dt = parsedate_to_datetime(s)
        if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc)
    except: return None

def _parse_ru_date(text, default_tz=timezone(timedelta(hours=3))):
    """Find the first Russian-style date in ``text`` and return it in UTC.

    Two notations are tried in order: numeric ("d.m.yyyy [h:mm]", RU_NUM_RE)
    and textual ("<day> <month word> <year> [h:mm]", RU_TXT_RE). If the
    numeric match is not a real calendar date, the textual notation is still
    tried instead of giving up.

    Args:
        text: arbitrary text; HTML entities are unescaped before matching.
        default_tz: timezone the matched wall-clock time is assumed to be in
            (UTC+3 by default).

    Returns:
        An aware UTC ``datetime``, or None when no valid date is found.
        A missing time of day defaults to 00:00.
    """
    if not text:
        return None
    s = html.unescape(text.strip().lower())

    for pattern in (RU_NUM_RE, RU_TXT_RE):
        m = pattern.search(s)
        if not m:
            continue
        parts = m.groupdict()
        # The numeric pattern exposes the month as "m", the textual one as "mon".
        if 'm' in parts:
            month = int(parts['m'])
        else:
            month = RU_MONTHS.get(parts['mon'])
        if not month:
            continue
        try:
            local = datetime(
                int(parts['y']), month, int(parts['d']),
                int(parts['h'] or 0), int(parts['min'] or 0),
                tzinfo=default_tz,
            )
            return local.astimezone(timezone.utc)
        except (ValueError, OverflowError):
            # Impossible calendar value (e.g. "45.13.2024"); try the next notation.
            continue
    return None

def _jsonld_datetime(soup):
    """Return the publication time declared in the page's JSON-LD, if any.

    Every ``<script type="application/ld+json">`` block is decoded. Top-level
    objects, and objects nested under a top-level ``@graph`` list, are checked
    for ``datePublished`` (falling back to ``dateCreated``).

    Args:
        soup: parsed document exposing ``find_all``.

    Returns:
        The first value that ``_parse_iso_or_rfc`` accepts, or None.
    """
    for tag in soup.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            data = json.loads(tag.string or tag.text or "")
        except (ValueError, RecursionError):
            # Malformed JSON-LD is common in the wild; skip the block.
            continue

        candidates = []
        for obj in (data if isinstance(data, list) else [data]):
            if not isinstance(obj, dict):
                continue
            candidates.append(obj)
            graph = obj.get("@graph")
            if isinstance(graph, list):
                candidates.extend(node for node in graph if isinstance(node, dict))

        for obj in candidates:
            raw = obj.get("datePublished") or obj.get("dateCreated")
            # Only plain strings are parseable; other JSON values are ignored
            # rather than allowed to raise inside the parser.
            if not isinstance(raw, str):
                continue
            dt = _parse_iso_or_rfc(raw)
            if dt:
                return dt
    return None

def _extract_datetime(html_text, soup):
    """Determine an article's publication time, trying several sources in turn.

    Order of attempts: JSON-LD, a ``<time>`` tag with a ``datetime`` or
    ``content`` attribute, a ``datePublished`` / ``article:published_time``
    ``<meta>`` tag, the first ISO 8601 timestamp in the raw HTML, and finally
    a Russian-formatted date in the page's visible text.

    Args:
        html_text: raw HTML of the page.
        soup: the same page parsed with BeautifulSoup.

    Returns:
        Whatever the first successful parser returns, or the result of
        ``_parse_ru_date`` on the flattened text (None when nothing is found).
    """
    found = _jsonld_datetime(soup)
    if found:
        return found

    time_tag = soup.find('time', attrs={'datetime': True})
    if not time_tag:
        time_tag = soup.find('time', attrs={'content': True})
    if time_tag:
        found = _parse_iso_or_rfc(time_tag.get('datetime') or time_tag.get('content'))
        if found:
            return found

    meta_tag = soup.find('meta', attrs={'itemprop': 'datePublished'})
    if not meta_tag:
        meta_tag = soup.find('meta', attrs={'property': 'article:published_time'})
    if meta_tag and meta_tag.get('content'):
        found = _parse_iso_or_rfc(meta_tag['content'])
        if found:
            return found

    iso_match = ISO_RE.search(html_text)
    if iso_match:
        found = _parse_iso_or_rfc(iso_match.group(0))
        if found:
            return found

    visible_text = ' '.join(soup.stripped_strings)
    return _parse_ru_date(visible_text)

def _title_generic(soup):
    """Pick a headline for the page.

    Preference order: text of the first ``<h1>``, the ``og:title`` meta
    content, then the ``<title>`` text. Returns an empty string when none
    of them yields anything.
    """
    heading = soup.find('h1')
    if heading:
        heading_text = heading.get_text(strip=True)
        if heading_text:
            return heading_text

    og_meta = soup.find('meta', attrs={'property': 'og:title'})
    if og_meta:
        og_content = og_meta.get('content')
        if og_content:
            return og_content.strip()

    page_title = soup.title
    if page_title and page_title.text:
        return page_title.text.strip()
    return ""

def _emit(items):
    if not items:
        print("❌ Нет подходящих новостей."); return
    items.sort(key=lambda x: x['date'] or datetime.min.replace(tzinfo=timezone.utc), reverse=True)
    print(f"✅ Найдено {len(items)} новостей за {DAYS_BACK} дн.\n")
    try:
        from IPython.display import display, Markdown
        for i, a in enumerate(items, 1):
            dt = a['date'].astimezone(TZ_OUT) if a['date'] else None
            ds = dt.strftime('%Y-%m-%d %H:%M') if dt else ''
            display(Markdown(f"**[{i}] [{a['source']}] {a['title']}**  \n"
                             f"{'🗓 '+ds if ds else ''}  \n"
                             f"🔗 [{a['link']}]({a['link']})\n"))
    except:
        for i, a in enumerate(items, 1):
            dt = a['date'].astimezone(TZ_OUT) if a['date'] else None
            ds = dt.strftime('%Y-%m-%d %H:%M') if dt else ''
            print(f"[{i}] [{a['source']}] {a['title']}\n"
                  f"{('🗓 '+ds+'\n') if ds else ''}"
                  f"🔗 {a['link']}\n")

# ========= ИСТОЧНИКИ: ПАРСИНГ =========

def fetch_rbc(session):
    """RBC Economics: parse the cards on the section page.

    The publication date is taken from the ``/dd/mm/yyyy/`` segment of each
    card's URL, so only the day is known. Items are stamped with midnight UTC
    of that day.

    Args:
        session: requests-style session passed on to ``_req``.

    Returns:
        List of ``{'source', 'title', 'link', 'date'}`` dicts, at most
        MAX_NEWS_PER_SITE long, limited to headlines that match KEYWORDS and
        dates inside the DAYS_BACK window.
    """
    out = []
    html_text = _req(session, RBC_URL).text
    soup = BeautifulSoup(html_text, 'html.parser')
    rx = re.compile(r'/(\d{2})/(\d{2})/(\d{4})/')
    # Only the calendar day is known, so compare day against day. Comparing a
    # midnight datetime with "now - N days" would drop everything published
    # on the oldest day of the window.
    oldest_day = (datetime.now() - timedelta(days=DAYS_BACK)).date()

    for it in soup.find_all('div', class_='item'):
        title_tag = it.find('span', 'item__title')
        link_tag = it.find('a', 'item__link')
        if not title_tag or not link_tag:
            continue
        title = ''.join(title_tag.stripped_strings)
        # urljoin leaves absolute links untouched and resolves relative ones.
        link = urllib.parse.urljoin(RBC_URL, link_tag.get('href', '').strip())
        m = rx.search(link)
        if not m:
            continue
        d, mn, y = map(int, m.groups())
        try:
            dt = datetime(y, mn, d)
        except ValueError:
            # Digits that look like a date but are not one (e.g. /99/99/2024/).
            continue
        if dt.date() < oldest_day:
            continue
        if not _kw_ok(title):
            continue
        out.append({'source': 'RBC', 'title': title, 'link': link,
                    'date': dt.replace(tzinfo=timezone.utc)})
        if len(out) >= MAX_NEWS_PER_SITE:
            break
    return out

def _extract_iz_json(html_text):
    m = re.search(r'window\.recommendationBlockList\s*=\s*{', html_text)
    if not m: return None
    start = m.end()-1; braces=0; end=start
    for i,ch in enumerate(html_text[start:], start=start):
        if ch=='{': braces+=1
        elif ch=='}':
            braces-=1
            if braces==0: end=i; break
    return html_text[start:end+1]

def fetch_iz(session):
    """Izvestia news: read the JSON in ``window.recommendationBlockList``.

    Entries under the ``even`` and ``odd`` keys are collected. Links to
    hosts other than iz.ru (or its subdomains) are dropped. The payload
    carries no dates here, so every item has ``date`` set to None.

    Args:
        session: requests-style session passed on to ``_req``.

    Returns:
        List of ``{'source', 'title', 'link', 'date'}`` dicts, deduplicated by
        link, keyword-filtered and capped at MAX_NEWS_PER_SITE.
    """
    out, seen = [], set()
    html_text = _req(session, IZ_URL).text
    js = _extract_iz_json(html_text)
    if not js:
        return out
    data = json.loads(js)
    if not isinstance(data, dict):
        return out

    for key in ('even', 'odd'):
        for item in (data.get(key) or []):
            if not isinstance(item, dict):
                continue
            raw = (item.get('path') or item.get('reference') or '').strip()
            if not raw:
                continue
            # urljoin handles absolute, root-relative, bare-relative and
            # scheme-relative ("//host/...") references uniformly.
            full = urllib.parse.urljoin(IZ_BASE + '/', raw)
            host = (urllib.parse.urlparse(full).hostname or '').lower()
            # Exact-host or subdomain match: a plain substring test would
            # also accept foreign hosts such as "notiz.ru".
            if host != 'iz.ru' and not host.endswith('.iz.ru'):
                continue
            title = (item.get('title') or '').strip()
            if not _kw_ok(title):
                continue
            if full in seen:
                continue
            seen.add(full)
            out.append({'source': 'Izvestia', 'title': title, 'link': full, 'date': None})
            if len(out) >= MAX_NEWS_PER_SITE:
                break
        if len(out) >= MAX_NEWS_PER_SITE:
            break
    return out

def fetch_dw(session):
    """DW via its RSS feed.

    Walks every element whose tag ends in ``item`` and reads its title, link
    and date children, matching tag names by suffix so XML namespaces do not
    matter. Keeps items that match KEYWORDS and fall inside DAYS_BACK, up to
    MAX_NEWS_PER_SITE.
    """
    response = _req(session, DW_RSS, timeout=20)
    import xml.etree.ElementTree as ET
    feed_root = ET.fromstring(response.content)

    collected = []
    for node in feed_root.iter():
        if not node.tag.lower().endswith('item'):
            continue

        title = link = date_text = None
        for child in node:
            name = child.tag.lower()
            if name.endswith('title'):
                title = (child.text or '').strip()
            elif name.endswith('link'):
                link = (child.text or '').strip()
            elif name.endswith(('date', 'pubdate')):
                date_text = (child.text or '').strip()

        if not (title and link):
            continue
        if not _kw_ok(title):
            continue
        published = _parse_iso_or_rfc(date_text)
        if not published or not _within_days(published, DAYS_BACK):
            continue

        collected.append({'source': 'DW', 'title': title, 'link': link, 'date': published})
        if len(collected) >= MAX_NEWS_PER_SITE:
            break
    return collected

# Finmarket
# Matches a quoted href whose target is "/news/<digits>" (optional trailing
# slash), optionally preceded by an absolute "http(s)://..." prefix and
# followed by a query string. Group 1 captures only the "/news/<digits>" path.
FM_LINK_RE = re.compile(r'href=["\'](?:https?://[^"\']+)?(/news/\d+/?)(?:\?[^"\']*)?["\']', re.I)
def _fin_title(html_text):
    """Return the headline of a Finmarket article page.

    Parses the HTML and applies the same lookup as the other sources
    (``<h1>``, then ``og:title``, then ``<title>``) by delegating to
    ``_title_generic`` instead of repeating that logic.

    Args:
        html_text: raw HTML of the article page.

    Returns:
        The headline, or an empty string when none is found.
    """
    return _title_generic(BeautifulSoup(html_text, 'html.parser'))

def _fin_date(html_text):
    """Extract the publication time of a Finmarket article page.

    Tries, in order: a ``<time>`` tag with a ``datetime`` or ``content``
    attribute, a ``datePublished`` / ``article:published_time`` ``<meta>``
    tag, the first ISO 8601 timestamp in the raw HTML, and finally a
    Russian-formatted date in the page's visible text.

    Args:
        html_text: raw HTML of the article page.

    Returns:
        The parsed datetime, or None when nothing can be parsed.
    """
    soup = BeautifulSoup(html_text, 'html.parser')

    t = soup.find('time', attrs={'datetime': True}) or soup.find('time', attrs={'content': True})
    if t:
        d = _parse_iso_or_rfc(t.get('datetime') or t.get('content'))
        if d:
            return d

    meta = (soup.find('meta', attrs={'itemprop': 'datePublished'})
            or soup.find('meta', attrs={'property': 'article:published_time'}))
    if meta and meta.get('content'):
        d = _parse_iso_or_rfc(meta['content'])
        if d:
            return d

    m = ISO_RE.search(html_text)
    if m:
        d = _parse_iso_or_rfc(m.group(0))
        if d:
            return d

    # Last resort: scan the visible text. The earlier version first tried to
    # str.join() a sequence of generators here, which raises TypeError on any
    # non-empty page, and then parsed the HTML a second time. Reuse the soup
    # that is already built.
    flat = ' '.join(soup.stripped_strings)
    return _parse_ru_date(flat)

def fetch_finmarket(session):
    """Finmarket: collect article links from the listing, then open each article.

    Up to MAX_LINKS_FINMARKET distinct links are taken from the listing page.
    Each article is downloaded to read its title and date. An article that
    fails to download is reported and skipped, so one bad link does not
    discard what was already collected.

    Args:
        session: requests-style session passed on to ``_req``.

    Returns:
        List of ``{'source', 'title', 'link', 'date'}`` dicts that match
        KEYWORDS and fall inside DAYS_BACK, capped at MAX_NEWS_PER_SITE.

    Raises:
        requests.RequestException: if the listing page itself cannot be fetched.
    """
    out, seen = [], set()
    lst = _req(session, FINMARKET_LIST).text

    cands = []
    for m in FM_LINK_RE.finditer(lst):
        rel = m.group(1)
        # "/news/123" and "/news/123/" are the same article.
        key = rel.rstrip('/') if rel else rel
        if key and key not in seen:
            seen.add(key)
            cands.append(urllib.parse.urljoin(FINMARKET_LIST, rel))
        if len(cands) >= MAX_LINKS_FINMARKET:
            break

    for url in cands:
        try:
            r = _req(session, url)
        except requests.RequestException as e:
            print(f"[WARN] fetch_finmarket: {url}: {e}")
            continue
        title = _fin_title(r.text)
        if not title or not _kw_ok(title):
            continue
        dt = _fin_date(r.text)
        if not dt or not _within_days(dt, DAYS_BACK):
            continue
        out.append({'source': 'Finmarket', 'title': title, 'link': url, 'date': dt})
        if len(out) >= MAX_NEWS_PER_SITE:
            break
    return out

# TASS
# Matches a quoted href pointing at "/ekonomika/<digits>...", with or without
# an absolute tass.ru prefix. Group 1 captures the path.
TASS_LINK_RE = re.compile(r'href=["\'](?:https?://(?:www\.)?tass\.ru)?(/ekonomika/\d+[^\s"\']*)["\']', re.I)


def fetch_tass(session):
    """TASS Economics: collect article links from the section page, then open each article.

    Up to MAX_LINKS_TASS distinct links are inspected. An article that fails
    to download is reported and skipped, so one bad link does not discard
    what was already collected.

    Args:
        session: requests-style session passed on to ``_req``.

    Returns:
        List of ``{'source', 'title', 'link', 'date'}`` dicts that match
        KEYWORDS and fall inside DAYS_BACK, capped at MAX_NEWS_PER_SITE.

    Raises:
        requests.RequestException: if the section page itself cannot be fetched.
    """
    out, seen = [], set()
    lst = _req(session, TASS_LIST).text

    links = []
    for m in TASS_LINK_RE.finditer(lst):
        rel = m.group(1)
        if not rel:
            continue
        url = urllib.parse.urljoin(TASS_LIST, rel)
        if url in seen:
            continue
        seen.add(url)
        links.append(url)
        if len(links) >= MAX_LINKS_TASS:
            break

    for url in links:
        try:
            r = _req(session, url)
        except requests.RequestException as e:
            print(f"[WARN] fetch_tass: {url}: {e}")
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        title = _title_generic(soup)
        if not title or not _kw_ok(title):
            continue
        dt = _extract_datetime(r.text, soup)
        if not dt or not _within_days(dt, DAYS_BACK):
            continue
        out.append({'source': 'TASS', 'title': title, 'link': url, 'date': dt})
        if len(out) >= MAX_NEWS_PER_SITE:
            break
    return out

# Interfax
# Matches a quoted href pointing at "/business/<digits>...", with or without
# an absolute interfax.ru prefix. Group 1 captures the path.
INTERFAX_LINK_RE = re.compile(r'href=["\'](?:https?://(?:www\.)?interfax\.ru)?(/business/\d+[^\s"\']*)["\']', re.I)


def fetch_interfax(session):
    """Interfax Business: collect article links from the section page, then open each article.

    Up to MAX_LINKS_INTERFAX distinct links are inspected. An article that
    fails to download is reported and skipped, so one bad link does not
    discard what was already collected.

    Args:
        session: requests-style session passed on to ``_req``.

    Returns:
        List of ``{'source', 'title', 'link', 'date'}`` dicts that match
        KEYWORDS and fall inside DAYS_BACK, capped at MAX_NEWS_PER_SITE.

    Raises:
        requests.RequestException: if the section page itself cannot be fetched.
    """
    out, seen = [], set()
    lst = _req(session, INTERFAX_LIST).text

    links = []
    for m in INTERFAX_LINK_RE.finditer(lst):
        rel = m.group(1)
        if not rel:
            continue
        url = urllib.parse.urljoin(INTERFAX_LIST, rel)
        if url in seen:
            continue
        seen.add(url)
        links.append(url)
        if len(links) >= MAX_LINKS_INTERFAX:
            break

    for url in links:
        try:
            r = _req(session, url)
        except requests.RequestException as e:
            print(f"[WARN] fetch_interfax: {url}: {e}")
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        title = _title_generic(soup)
        if not title or not _kw_ok(title):
            continue
        dt = _extract_datetime(r.text, soup)
        if not dt or not _within_days(dt, DAYS_BACK):
            continue
        out.append({'source': 'Interfax', 'title': title, 'link': url, 'date': dt})
        if len(out) >= MAX_NEWS_PER_SITE:
            break
    return out

# ========= MAIN =========
def main():
    """Run every source fetcher, merge the results without duplicate links, and print them.

    A failure in one source is reported as a warning and does not stop the
    others.
    """
    results, seen_links = [], set()
    fetchers = (fetch_rbc, fetch_iz, fetch_dw, fetch_finmarket, fetch_tass, fetch_interfax)

    # The context manager closes the session's pooled connections on exit,
    # including when a fetcher raises something unexpected.
    with requests.Session() as sess:
        for fetch in fetchers:
            try:
                items = fetch(sess)
            except Exception as e:  # top-level boundary: isolate each source
                print(f"[WARN] {fetch.__name__}: {e}")
                items = []
            for a in items:
                if a['link'] in seen_links:
                    continue
                seen_links.add(a['link'])
                results.append(a)

    _emit(results)


if __name__ == "__main__":
    main()