In [42]:
import requests
import json
import time
import datetime
import os
import pandas as pd
import google.generativeai as genai
from google.colab import userdata, drive
from datetime import date, timedelta, datetime
from typing import List
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

HEADERS = {"User-Agent": "Mozilla/5.0"}

In [49]:
### Functions for scraping

## Defining and formatting dates
def get_last_dates(n_days=6, end_date=None):
    """Return consecutive dates ending at ``end_date``, oldest first.

    The list runs from ``n_days`` days before ``end_date`` through
    ``end_date`` itself, so it holds ``n_days + 1`` entries (and is empty
    when ``n_days`` is negative).  ``end_date`` defaults to today's date.
    """
    last_day = date.today() if end_date is None else end_date
    window = []
    for days_back in reversed(range(n_days + 1)):
        window.append(last_day - timedelta(days=days_back))
    return window

def format_dates(dates_list, fmt="%Y-%m-%d"):
    """Render every date in ``dates_list`` as a string using ``fmt``.

    The result keeps the order of the input.
    """
    formatted = []
    for day in dates_list:
        formatted.append(day.strftime(fmt))
    return formatted

## Getting web page soup
def get_page_soup(url, headers=HEADERS, timeout=10):
    """Download ``url`` and return its body parsed with ``html.parser``.

    The request is sent with ``headers`` and ``timeout``.  An exception
    raised by the request itself, or by ``raise_for_status`` for an error
    status code, propagates to the caller.
    """
    response = requests.get(url, timeout=timeout, headers=headers)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "html.parser")

## Scrapers: Kommersant, Vedomosti, RBC, Agroinvestor, RG.ru, RIA, Autostat

# Kommersant scraper
def fetch_kom(rubrics, dates, output_file,
              base_url_template="https://www.kommersant.ru/archive/rubric/{rubric}/day/{date}"):
    """Scrape Kommersant archive pages and dump the headlines to JSON.

    For every rubric/date pair the archive page is downloaded and each
    ``application/ld+json`` script on it is scanned for ``itemListElement``
    entries.  The saved structure is ``{rubric: {date: [items]}}`` where each
    item is ``{"title": ..., "url": ...}``; a page that fails to download or
    parse stores the string ``"[ERROR] <message>"`` instead of the list.

    Args:
        rubrics: Rubric identifiers substituted into the URL template.
        dates: Date strings substituted into the URL template.
        output_file: Path of the JSON file to write.
        base_url_template: URL pattern with ``{rubric}`` and ``{date}`` fields.
    """
    results = {}
    for rubric in rubrics:
        daily = {}
        for dt in dates:
            url = base_url_template.format(rubric=rubric, date=dt)
            print(f"Fetching Kommersant: {url}")
            try:
                soup = get_page_soup(url)
                items = []
                seen_links = set()  # per-page dedupe without rebuilding a set per entry
                for script in soup.find_all("script", type="application/ld+json"):
                    raw = script.string
                    if not raw:
                        continue
                    try:
                        data = json.loads(raw)
                    except json.JSONDecodeError:
                        continue
                    # A JSON-LD script may hold one object or an array of
                    # objects; treat both the same and ignore anything else
                    # instead of failing the whole page.
                    documents = data if isinstance(data, list) else [data]
                    for document in documents:
                        if not isinstance(document, dict):
                            continue
                        for entry in document.get("itemListElement") or []:
                            if not isinstance(entry, dict):
                                continue
                            title = entry.get("name") or entry.get("headline")
                            link = entry.get("url")
                            if title and link and link not in seen_links:
                                seen_links.add(link)
                                items.append({"title": title, "url": link})
                daily[dt] = items
            except Exception as e:
                # Best effort: record the failure for this day and keep going.
                daily[dt] = f"[ERROR] {e}"
        results[rubric] = daily

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Saved Kommersant data to {output_file}")


# Vedomosti scraper
def fetch_ved(dates, output_file,
              base_url_template="https://www.vedomosti.ru/newspaper/{date}"):
    """Scrape the Vedomosti newspaper page of each date and save to JSON.

    Every ``li.waterfall__item`` with an ``a.waterfall__item-title`` link
    becomes ``{"date", "title", "url"}``; relative links are prefixed with
    the site root.  If a page fails, ``{"date", "error"}`` is appended for
    that date and processing continues with the next one.
    """
    collected = []
    for day in dates:
        page_url = base_url_template.format(date=day)
        print(f"Fetching Vedomosti: {page_url}")
        try:
            page = get_page_soup(page_url)
            for block in page.select("li.waterfall__item"):
                anchor = block.select_one("a.waterfall__item-title")
                if anchor:
                    headline = anchor.get_text(strip=True)
                    link = anchor.get("href", "")
                    if not link.startswith("http"):
                        link = f"https://www.vedomosti.ru{link}"
                    collected.append({"date": day, "title": headline, "url": link})
        except Exception as err:
            collected.append({"date": day, "error": str(err)})

    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(collected, out, ensure_ascii=False, indent=2)
    print(f"Saved Vedomosti data to {output_file}")

# RBC scraper

def _parse_rbc_date(date_text, today, ru_months):
    """Turn an RBC timestamp label into a ``date``; return None if unparseable."""
    if not any(month in date_text.lower() for month in ru_months):
        # Only a time is given (e.g. "17:52"): assume today's date.
        return today
    # Format like "28 мая, 17:52": keep the part before the comma.
    parts = date_text.split(",")[0].split()
    if len(parts) < 2:
        return None
    try:
        day = int(parts[0])
    except ValueError:
        return None
    month = ru_months.get(parts[1].lower())
    if month is None:
        return None
    year = today.year
    if len(parts) >= 3:
        # An explicit year may follow the month, possibly with a "г." suffix.
        year_str = parts[2].replace("г.", "").strip()
        if year_str.isdigit():
            year = int(year_str)
    try:
        news_date = date(year, month, day)
        # A date in the future means the label refers to the previous year
        # (e.g. December news seen in early January).  The rollback is kept
        # inside the try because 29 February does not exist a year earlier.
        if news_date > today:
            news_date = date(year - 1, month, day)
    except ValueError:
        return None
    return news_date


def fetch_rbc(rubrics, dates, output_file, base_url_template="https://www.rbc.ru/{rubric}/?utm_source=topline"):
    """Scrape RBC rubric pages and save items published on ``dates`` to JSON.

    Each element carrying ``itemscope`` whose ``itemtype`` mentions
    ``NewsArticle`` is read for its ``name``/``url`` meta tags and the text
    of its ``span.item__category``, which is parsed as the publication date.
    Items are kept when that date is in ``dates``; duplicates by URL are
    dropped (first occurrence wins).  The output is a list of
    ``{"title", "url", "date"}`` with ISO-formatted dates.

    Args:
        rubrics: Rubric slugs substituted into the URL template.
        dates: Collection of ``datetime.date`` objects to keep.
        output_file: Path of the JSON file to write.
        base_url_template: URL pattern with a ``{rubric}`` field.

    A failed page download propagates the exception from ``get_page_soup``.
    """
    # Mapping of Russian month names (genitive case) to month numbers
    ru_months = {
        'января': 1, 'февраля': 2, 'марта': 3, 'апреля': 4,
        'мая': 5, 'июня': 6, 'июля': 7, 'августа': 8,
        'сентября': 9, 'октября': 10, 'ноября': 11, 'декабря': 12
    }
    today = date.today()
    unique_news = []
    seen_urls = set()
    for rubric in rubrics:
        page_url = base_url_template.format(rubric=rubric)
        print(f"Fetching RBC, {rubric}: {page_url}")
        soup = get_page_soup(page_url)
        # Find all news item containers (with schema.org NewsArticle)
        for item in soup.find_all(attrs={"itemscope": True}):
            if "NewsArticle" not in item.get("itemtype", ""):
                continue
            name_meta = item.find("meta", {"itemprop": "name"})
            url_meta = item.find("meta", {"itemprop": "url"})
            date_span = item.find("span", {"class": "item__category"})
            if not name_meta or not url_meta or not date_span:
                continue
            title = name_meta.get("content", "").strip()
            article_url = url_meta.get("content", "").strip()
            date_text = date_span.get_text(strip=True)
            if not title or not article_url or not date_text:
                continue
            news_date = _parse_rbc_date(date_text, today, ru_months)
            if news_date is None or news_date not in dates:
                continue
            if article_url in seen_urls:
                continue
            seen_urls.add(article_url)
            # Dates are stored as ISO strings so the list is JSON-serializable.
            unique_news.append({
                "title": title,
                "url": article_url,
                "date": news_date.isoformat()
            })
    # Save results to JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(unique_news, f, ensure_ascii=False, indent=2)
    print(f"Saved RBC data to {output_file}")

# Agro investor scraper

def fetch_agro(dates, output_file, base_url_template="https://www.agroinvestor.ru/"):
    """Fetch news from the Agroinvestor main page and save matching items to JSON.

    Every ``a.news__item-desc`` link is paired with the next ``<time>`` tag
    in the document, whose text is parsed as "<day> <month name> <year>".
    Items whose date is in ``dates`` are written as a list of
    ``{"title", "link", "date"}`` (ISO date), de-duplicated by URL.

    Args:
        dates: Collection of ``datetime.date`` objects to keep.
        output_file: Path of the JSON file to write.
        base_url_template: Page to download; also the base for relative links.

    A failed page download propagates the exception from ``get_page_soup``.
    """
    print(f"Fetching Agroinvestor: {base_url_template}")
    soup = get_page_soup(base_url_template)
    news_list = []
    seen_links = set()
    # Mapping of Russian month names (in genitive case) to month numbers
    ru_months = {
        "января": 1, "февраля": 2, "марта": 3, "апреля": 4,
        "мая": 5, "июня": 6, "июля": 7, "августа": 8,
        "сентября": 9, "октября": 10, "ноября": 11, "декабря": 12
    }
    # Find all news items on the main page
    for anchor in soup.find_all("a", class_="news__item-desc"):
        title = anchor.get_text(strip=True)
        href = anchor.get("href")
        if not href:
            continue
        # Construct full URL for the news article
        url = urljoin(base_url_template, href.strip())
        # Find the date of the news (in a <time> tag following the title link)
        time_tag = anchor.find_next("time")
        if not time_tag:
            continue
        # Non-breaking spaces are normalized so split() sees three tokens.
        date_text = time_tag.get_text(strip=True).replace("\xa0", " ")
        if not date_text:
            continue
        # Parse the date text (e.g. "30 мая 2025") into a datetime.date object
        try:
            day_str, month_str, year_str = date_text.split()
            day = int(day_str)
            year = int(year_str)
        except ValueError:
            # Skip if date format is unexpected
            continue
        month = ru_months.get(month_str.lower())
        if month is None:
            continue
        # Use the imported `date` class: in this file the name `datetime` is
        # the datetime *class*, so `datetime.date(y, m, d)` always raised and
        # every item was silently skipped.
        try:
            date_obj = date(year, month, day)
        except (ValueError, OverflowError):
            continue

        # Filter news by allowed dates and avoid duplicates by URL
        if date_obj in dates and url not in seen_links:
            news_list.append({
                "title": title,
                "link": url,
                "date": date_obj.isoformat()
            })
            seen_links.add(url)
    # Save the result to a JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(news_list, f, ensure_ascii=False, indent=2)
    print(f"Saved Agroinvestor data to {output_file}")

# RG.ru scraper

def fetch_rg(rubrics, dates, output_file,
             base_url_template="https://rg.ru/tema/ekonomika/{rubric}"):
    """Scrape RG.ru rubric pages and save items published on ``dates`` to JSON.

    For each title span the enclosing link gives the article URL, and the
    closest preceding date/time link gives the publication date, taken from
    the first three path segments of its ``href`` (year, month, day).  The
    output is a list of ``{"title", "url", "date"}`` without repeated URLs.
    """
    kept = []
    known_urls = set()
    for rubric in rubrics:
        page_url = base_url_template.format(rubric=rubric)
        print(f"Fetching RG, {rubric}: {page_url}")
        page = get_page_soup(page_url)
        for headline in page.find_all("span", class_="ItemOfListStandard_title__Ajjlf"):
            link_tag = headline.find_parent("a")
            if not link_tag:
                continue
            link = link_tag.get("href", "").strip()
            if not link:
                continue
            if not link.startswith("http"):
                link = f"https://rg.ru{link}"

            stamp_tag = headline.find_previous("a", class_="ItemOfListStandard_datetime__GstJi")
            if not stamp_tag:
                continue
            # e.g. "/2025/05/30/..." -> ['2025', '05', '30', ...]
            segments = stamp_tag.get("href", "").strip().strip("/").split("/")
            if len(segments) < 3:
                continue
            try:
                year, month, day = (int(piece) for piece in segments[:3])
                published = date(year, month, day)
            except ValueError:
                continue

            # Keep only wanted dates; the first occurrence of a URL wins.
            if published not in dates or link in known_urls:
                continue
            known_urls.add(link)
            kept.append({
                "title": headline.get_text(strip=True),
                "url": link,
                "date": published.isoformat()
            })

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(kept, out, ensure_ascii=False, indent=2)
    print(f"Saved RG data to {output_file}")

# RIA scraper

def fetch_ria(dates, output_file, base_url_template="https://ria.ru/economy/"):
    """Scrape the RIA economy page and save items published on ``dates`` to JSON.

    Each ``<a itemprop="url">`` supplies the article link, and the next
    ``<meta itemprop="name">`` supplies the title.  The publication date is
    read from the first URL path segment, which must be eight digits
    (``YYYYMMDD``).  The output is a list of ``{"title", "url", "date"}``
    without repeated URLs.
    """
    print("Fetching RIA: https://ria.ru/economy/")
    page = get_page_soup(base_url_template)
    kept = []
    known_urls = set()

    for anchor in page.find_all("a", itemprop="url"):
        link = anchor.get("href", "").strip()
        if not link:
            continue
        if not link.startswith("http"):
            link = f"https://ria.ru{link}"

        # The title lives in the following <meta itemprop="name"> tag
        title_meta = anchor.find_next("meta", itemprop="name")
        if not title_meta:
            continue
        headline = title_meta.get("content", "").strip()
        if not headline:
            continue

        # First path segment is expected to look like "YYYYMMDD"
        stamp = urlparse(link).path.lstrip("/").split("/")[0]
        if len(stamp) != 8 or not stamp.isdigit():
            continue
        year, month, day = int(stamp[:4]), int(stamp[4:6]), int(stamp[6:])
        try:
            published = date(year, month, day)
        except ValueError:
            continue

        # Keep only wanted dates; the first occurrence of a URL wins
        if published not in dates or link in known_urls:
            continue
        known_urls.add(link)
        kept.append({
            "title": headline,
            "url": link,
            "date": published.isoformat()
        })

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(kept, out, ensure_ascii=False, indent=2)

    print(f"Saved RIA data to {output_file}")


# Autostat scraper

def fetch_autostat(dates, output_file,
                   rubrics=(21, 8, 13, 70, 71),
                   base_url_template="https://m.autostat.ru/news/themes-{rubric}/"):
    """Scrape Autostat theme pages and save items published on ``dates`` to JSON.

    Each ``p.Block-title`` inside an ``a.Block-link`` is paired with the next
    ``p.Block-date``, whose text before the comma is "Сегодня", "Вчера" or
    "<day> <month name>".  Items whose date is in ``dates`` are written as a
    list of ``{"title", "url", "date"}`` (ISO date), de-duplicated by URL.

    Args:
        dates: Collection of ``datetime.date`` objects to keep.
        output_file: Path of the JSON file to write.
        rubrics: Theme identifiers substituted into the URL template.
        base_url_template: URL pattern with a ``{rubric}`` field.

    Raises:
        ValueError: If ``dates`` is None.
    """
    if dates is None:
        raise ValueError("Argument 'dates' must be provided as a list of datetime.date objects.")

    all_collected = []
    seen_urls = set()

    ru_months = {
        'января': 1, 'февраля': 2, 'марта': 3, 'апреля': 4,
        'мая': 5, 'июня': 6, 'июля': 7, 'августа': 8,
        'сентября': 9, 'октября': 10, 'ноября': 11, 'декабря': 12
    }
    today = date.today()
    yesterday = today - timedelta(days=1)

    for rubric in rubrics:
        url = base_url_template.format(rubric=rubric)
        print(f"Fetching Autostat, {rubric}: {url}")
        # get_page_soup raises on failure rather than returning a falsy
        # value, so the failure has to be caught here for one bad rubric to
        # be skipped instead of aborting the whole run.
        try:
            soup = get_page_soup(url)
        except Exception as e:
            print(f"  (!) Failed to retrieve or parse page for rubric {rubric}: {e}")
            continue

        titles = soup.find_all("p", class_="Block-title")
        if not titles:
            print(f"    (!) No <p class='Block-title'> elements found on {url}")
            continue

        for title_p in titles:
            title = title_p.get_text(strip=True)
            if not title:
                continue

            link_a = title_p.find_parent("a", class_="Block-link")
            if not link_a:
                continue
            href = link_a.get("href", "").strip()
            if not href:
                continue
            full_url = urljoin("https://www.autostat.ru", href)

            date_p = title_p.find_next("p", class_="Block-date")
            if not date_p:
                continue
            date_text = date_p.get_text(strip=True)  # e.g. "Сегодня, 15:48" or "28 мая, 15:48"
            date_part = date_text.split(",")[0].strip().lower()

            if date_part == "сегодня":
                news_date = today
            elif date_part == "вчера":
                news_date = yesterday
            else:
                parts = date_part.split()
                if len(parts) != 2:
                    continue
                day_str, month_str = parts
                month = ru_months.get(month_str)
                if month is None:
                    continue
                try:
                    day = int(day_str)
                    news_date = date(today.year, month, day)
                    # No year is shown: a date in the future must belong to
                    # the previous year.
                    if news_date > today:
                        news_date = date(today.year - 1, month, day)
                except ValueError:
                    continue

            if news_date in dates and full_url not in seen_urls:
                all_collected.append({
                    "title": title,
                    "url": full_url,
                    "date": news_date.isoformat()
                })
                seen_urls.add(full_url)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_collected, f, ensure_ascii=False, indent=2)

    print(f"Saved Autostat data to {output_file}")

In [44]:
# Parameters
# How many days before today to include; 0 makes `dates` hold today only.
days_before = 0
dates = get_last_dates(days_before)
# The same dates as strings: dashed for the Kommersant archive URL,
# slashed for the Vedomosti newspaper URL.
dates_kom = format_dates(dates, fmt="%Y-%m-%d")
dates_ved = format_dates(dates, fmt="%Y/%m/%d")

# Rubric identifiers substituted into each site's URL template.
rubrics_kom_rus = [3, 4, 40]
rubrics_kom_world = [3, 5]
rubrics_kom_prices = [41]
rubrics_rbc = ["economics", "business", "finances"]
rubrics_rg = ["politekonom", "industria", "business", "finansy", "kazna", "rabota", "pensii", "vnesh", "apk", "tovary", "turizm"]
rubrics_auto = [21, 8, 13, 70, 71]

In [50]:
# Fetching
# Each call writes one JSON file into the current working directory.
# Kommersant and Vedomosti take pre-formatted date strings (one page per date).
fetch_kom(rubrics_kom_rus, dates_kom, "kom_rus.json")
fetch_kom(rubrics_kom_world, dates_kom, "kom_world.json")
fetch_kom(rubrics_kom_prices, dates_kom, "kom_prices.json")
fetch_ved(dates_ved, "ved.json")
# The remaining scrapers take date objects and filter the scraped items by them.
fetch_rbc(rubrics_rbc, dates, "rbc.json")
fetch_agro(dates, "agro.json")
fetch_rg(rubrics_rg, dates, "rg.json")
fetch_ria(dates, "ria.json")
fetch_autostat(dates, "autostat.json", rubrics_auto)

Fetching Kommersant: https://www.kommersant.ru/archive/rubric/3/day/2025-06-01
Fetching Kommersant: https://www.kommersant.ru/archive/rubric/4/day/2025-06-01
Fetching Kommersant: https://www.kommersant.ru/archive/rubric/40/day/2025-06-01
Saved Kommersant data to kom_rus.json
Fetching Kommersant: https://www.kommersant.ru/archive/rubric/3/day/2025-06-01
Fetching Kommersant: https://www.kommersant.ru/archive/rubric/5/day/2025-06-01
Saved Kommersant data to kom_world.json
Fetching Kommersant: https://www.kommersant.ru/archive/rubric/41/day/2025-06-01
Saved Kommersant data to kom_prices.json
Fetching Vedomosti: https://www.vedomosti.ru/newspaper/2025/06/01
Saved Vedomosti data to ved.json
Fetching RBC, economics: https://www.rbc.ru/economics/?utm_source=topline
Fetching RBC, business: https://www.rbc.ru/business/?utm_source=topline
Fetching RBC, finances: https://www.rbc.ru/finances/?utm_source=topline
Saved RBC data to rbc.json
Fetching Agroinvestor: https://www.agroinvestor.ru/
Saved Agr

In [52]:
# Kommersant, Vedomosti, RBC, Agroinvestor, RG.ru, RIA, Autostat
# Maps each report section to the scraped JSON files that create_news_lists
# sends to the model for that section (one request per file).
# NOTE(review): "world" lists kom_rus.json in addition to kom_world.json —
# confirm this is intended.
section_to_files = {
    "world": [
        "kom_world.json",
        "kom_rus.json",
        "ved.json",
        "rbc.json",
        "agro.json",
        "rg.json",
        "ria.json"
    ],
    "rus": [
        "kom_rus.json",
        "ved.json",
        "rbc.json",
        "agro.json",
        "rg.json",
        "ria.json"
    ],
    "prices": [
        "kom_prices.json",
        "kom_rus.json",
        "ved.json",
        "rbc.json",
        "agro.json",
        "rg.json",
        "ria.json",
        "autostat.json"
    ]
}

In [53]:
#with open('autostat.json', encoding='utf-8') as f:
#    data = json.load(f)
#print(json.dumps(data, ensure_ascii=False, indent=2))

In [54]:
# Read the Gemini API key from the Colab secret named 'gemini_api_key',
# configure the client with it, and create the model object used by
# create_news_lists and create_bullets.
API_KEY = userdata.get('gemini_api_key')
genai.configure(api_key=API_KEY)
model_obj = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
# Mount Google Drive at /content/drive; the prompt files, the per-section
# lists and the reports all live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [55]:
# Prompts

# Opening part of the prompt used when building the news lists.
# The variable name is misspelled ("propmt"), but create_news_lists refers to
# it under this exact name, so both places must be renamed together.
file_path = '/content/drive/MyDrive/news lists, prompt beginning.txt'

# NOTE(review): on failure only a message is printed and the variable is left
# unassigned, so the first use later fails with NameError (or silently reuses
# a value left over from an earlier run of this cell).
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        propmt_list_start = f.read()
except FileNotFoundError:
    print(f"Error: no file found (path: {file_path})")
except Exception as e:
    print(f"Error while reading file: {e}")

# Opening part of the prompt used when building the bullet summaries.
file_path = '/content/drive/MyDrive/bullets, prompt beginning.txt'

# NOTE(review): same print-and-continue handling as above.
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        prompt_bullets_start = f.read()
except FileNotFoundError:
    print(f"Error: no file found (path: {file_path})")
except Exception as e:
    print(f"Error while reading file: {e}")

# Section-specific middle part of the news-list prompt.  Each value is a
# one-element list; the consumers join list parts with newlines.
# Typos in the prompt text were fixed ("россиийской" -> "российской").
section_to_continue_prompt = {
    "world": [
        'Пожалуйста, просмотри АБСОЛЮТНО ВСЕ НОВОСТИ в приложенном файле и отбери из них только те, что строго соответствуют критериям и могут быть включены в нумерованный список для раздела по мировой экономике.'
    ],
    "rus": [
        'Пожалуйста, просмотри АБСОЛЮТНО ВСЕ НОВОСТИ в приложенном файле и отбери из них только те, что строго соответствуют критериям и могут быть включены в нумерованный список для раздела по российской экономике.'
    ],
    "prices": [
        'Пожалуйста, просмотри АБСОЛЮТНО ВСЕ НОВОСТИ в приложенном файле и отбери из них только те, что строго соответствуют критериям и могут быть включены в нумерованный список для раздела по новостям, релевантным для динамики российских цен.'
    ]
}
# Closing part of the news-list prompt, shared by all sections.
prompt_list_finish = 'Пришли мне оформленный в соответствии с требованиями список. ОЧЕНЬ ВАЖНО НЕ ВКЛЮЧАТЬ В ОТВЕТ НИЧЕГО ДОПОЛНИТЕЛЬНОГО, ТОЛЬКО НАЗВАНИЯ НОВОСТЕЙ И ССЫЛКИ.'

# Section-specific closing part of the bullets prompt.
# Typos fixed here as well ("россиийской", duplicated "по по").
section_to_finish_bullets_prompt = {
    "world": [
        'Пожалуйста, подготовь 3 буллита для раздела по мировой экономике в соответствии с требованиями и пришли итоговый результат в таком формате: сначала буллиты, потом нумерованный список.'
    ],
    "rus": [
        'Пожалуйста, подготовь 3 буллита для раздела по российской экономике в соответствии с требованиями и пришли итоговый результат в таком формате: сначала буллиты, потом нумерованный список.'
    ],
    "prices": [
        'Пожалуйста, подготовь 3 буллита для раздела по новостям, релевантным для динамики российских цен, в соответствии с требованиями и пришли итоговый результат в таком формате: сначала буллиты, потом нумерованный список.'
    ]
}

In [56]:
def create_news_lists(section):
    """Ask the model to select relevant headlines for ``section`` and save them.

    Every JSON file listed in ``section_to_files[section]`` is sent to the
    model in its own request, together with the list prompt.  Each answer is
    given a ``=== <file> (<section>) ===`` header and the answers are written
    to ``/content/drive/MyDrive/<section>.txt``.  On every day except
    Saturday the existing content of that file is kept and the new answers
    are appended to it; on Saturday the file is started afresh.

    Files that are missing or not valid JSON, and requests that fail or
    return no text, are reported and skipped.  If nothing was processed, the
    output file is left untouched.

    Raises:
        ValueError: If ``section`` is not a key of ``section_to_files``.
    """
    if section not in section_to_files:
        raise ValueError(f"Section '{section}' unknown.")

    # The same <section>.txt on Google Drive is read first and rewritten last.
    file_path = f"/content/drive/MyDrive/{section}.txt"

    # Unless today is Saturday, start from the existing <section>.txt.
    list_start = ""
    if datetime.today().weekday() != 5:  # 5 = Saturday
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                list_start = f.read()
        except FileNotFoundError:
            print(f"Warning: file not found: {file_path}")
        except Exception as e:
            print(f"Error reading file: {e}")

    # JSON files for this section and the section-specific prompt part
    json_files = section_to_files[section]
    prompt_list_continue = section_to_continue_prompt[section]

    combined_text_parts = []
    for json_filename in json_files:
        base_name, ext = os.path.splitext(json_filename)
        if ext.lower() != ".json":
            print(f"Пропускаем '{json_filename}', т.к. не .json-файл.")
            continue

        try:
            with open(json_filename, 'r', encoding='utf-8') as f:
                news_data = json.load(f)
        except FileNotFoundError:
            print(f"Файл '{json_filename}' не найден. Пропускаем.")
            continue
        except json.JSONDecodeError as e:
            print(f"Ошибка JSON в '{json_filename}': {e}. Пропускаем.")
            continue

        news_json_string = json.dumps(news_data, ensure_ascii=False, indent=2)

        raw_parts = [
            propmt_list_start,
            prompt_list_continue,
            prompt_list_finish,
            news_json_string
        ]
        # List-valued parts are joined with newlines; everything else is
        # passed as a string.
        prompt_parts = [
            "\n".join(part) if isinstance(part, list) else str(part)
            for part in raw_parts
        ]

        try:
            response = model_obj.generate_content(prompt_parts)
            # Reading `.text` can itself raise when the reply carries no text
            # (e.g. a blocked response), so it stays inside the try: one bad
            # reply must not discard the answers already collected.
            response_text = response.text
        except Exception as e:
            print(f"Error in model.generate_content for '{json_filename}': {e}.")
            continue

        header = f"=== {base_name} ({section}) ===\n"
        combined_text_parts.append(header + response_text + "\n\n")

    if not combined_text_parts:
        print(f"For section '{section}', zero JSONs were successfully processed.")
        return

    # Previously saved content (if any) followed by the new answers
    all_text = list_start + "".join(combined_text_parts)

    try:
        with open(file_path, "w", encoding="utf-8") as out_f:
            out_f.write(all_text)
        print(f"File successfully written: {file_path}")
    except FileNotFoundError:
        print(f"Error: path not found: {file_path}")
    except Exception as e:
        print(f"Error writing file: {e}")


In [60]:
# Build the list for each section in turn, pausing 60 seconds between them
# (presumably to stay within the model API's rate limits — confirm).
create_news_lists("world")
time.sleep(60)
create_news_lists("rus")
time.sleep(60)
create_news_lists("prices")

File successfully written: /content/drive/MyDrive/world.txt
File successfully written: /content/drive/MyDrive/rus.txt
File successfully written: /content/drive/MyDrive/prices.txt


In [58]:
def create_bullets(section):
    """Ask the model for the bullet summary of ``section`` and save the report.

    Reads ``/content/drive/MyDrive/<section>.txt``, sends it to the model
    together with the bullets prompt, and writes the answer to
    ``/content/drive/MyDrive/news_reports/report_<section>.txt``.  Read,
    request and write failures are reported with a message and end the call
    without raising.

    Raises:
        ValueError: If ``section`` is not a key of ``section_to_files``.
    """
    if section not in section_to_files:
        raise ValueError(f"Section '{section}' unknown.")

    # The section's news list to summarize
    list_file = f"{section}.txt"
    file_path = f"/content/drive/MyDrive/{list_file}"

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            list_content = f.read()
    except FileNotFoundError:
        print(f"Error: path not found: {file_path}")
        return
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    # Section-specific closing part of the prompt
    prompt_bullets_finish = section_to_finish_bullets_prompt[section]

    raw_parts = [
        prompt_bullets_start,
        prompt_bullets_finish,
        list_content
    ]
    # List-valued parts are joined with newlines; everything else is passed
    # as a string.
    prompt_parts = [
        "\n".join(part) if isinstance(part, list) else str(part)
        for part in raw_parts
    ]

    try:
        response = model_obj.generate_content(prompt_parts)
        # Read `.text` before the output file is opened: it can raise when
        # the reply carries no text, and opening the report with "w" first
        # would leave a truncated, empty report behind.
        report_text = response.text
    except Exception as e:
        print(f"Error in model.generate_content: {e}")
        return

    file_name = f"report_{section}.txt"

    drive_folder = "/content/drive/MyDrive/news_reports"
    os.makedirs(drive_folder, exist_ok=True)
    out_path = f"{drive_folder}/{file_name}"

    try:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(report_text)
        print(f"Успешно записан файл: {out_path}")
    except FileNotFoundError:
        print(f"Error: path not found: {out_path}")
    except Exception as e:
        print(f"Error writing file: {e}")

In [59]:
# Bullet reports are produced only on Thursdays (weekday() == 3, Monday == 0).
if datetime.today().weekday() == 3:
  create_bullets("world")
  create_bullets("rus")
  create_bullets("prices")