In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re

# Fetch the NLTK data packages this notebook relies on: "punkt" (tokenizer
# data for sent_tokenize / word_tokenize) and "stopwords" (stop-word lists).
# Already-installed packages are reported as up-to-date.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Олег\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Олег\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
INPUT_DIR = "."  # directory containing the input JSON files
OUTPUT_DIR = "parsed_articles"  # directory the parsed results are written to
# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# English stop words, stored as a set so the `not in stop_words` filter in
# extract_topics is a fast membership test.
stop_words = set(stopwords.words("english"))
def clean_text(text):
    """Strip URLs from ``text`` and normalize its whitespace.

    Args:
        text: Raw text, e.g. the joined paragraphs of a scraped page.

    Returns:
        The text with every ``http://`` / ``https://`` URL removed, each run
        of whitespace collapsed to a single space, and no leading or trailing
        whitespace.
    """
    # Remove URLs first: collapsing whitespace beforehand would leave a double
    # space wherever a URL used to sit between two words.
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
def extract_main_text_from_url(url):
    """Download a page and return the cleaned text of its ``<p>`` elements.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The text of all ``<p>`` elements joined with single spaces and passed
        through ``clean_text``. An empty string is returned (and the error is
        printed) when the request fails, the server answers with an HTTP
        error status, or parsing raises.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures so that error pages are not
        # parsed as if they were article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Drop scripts, styles and page chrome (header, footer, nav, forms)
        # before collecting paragraphs.
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form"]):
            tag.decompose()

        # Use the remaining <p> elements as the main text of the page.
        paragraphs = soup.find_all("p")
        text = " ".join(p.get_text() for p in paragraphs)
        return clean_text(text)

    except Exception as e:
        # Best effort: report the problem and let the caller skip this article.
        print(f"Ошибка при извлечении текста: {e}")
        return ""

In [5]:
def extract_topics(text, top_n=10):
    """Return the most frequent two-word phrases found in ``text``.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that appear in ``stop_words`` are dropped. Adjacent pairs
    of the remaining tokens are counted, so the two words of a phrase may have
    been separated by a removed token in the original text.

    Args:
        text: Text to analyze.
        top_n: Maximum number of phrases to return.

    Returns:
        Up to ``top_n`` phrases ("word1 word2"), most frequent first.
    """
    tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        tokens.append(token)

    # Count each pair of neighbouring tokens as one space-separated phrase.
    phrase_counts = Counter(
        f"{first} {second}" for first, second in zip(tokens, tokens[1:])
    )
    ranked = phrase_counts.most_common(top_n)
    return [phrase for phrase, _count in ranked]

In [6]:
# Process every "*_articles.json" file in INPUT_DIR: download each article's
# page, split its text into sentences, extract topic phrases, and write the
# results to "<prefix>_parsed.json" in OUTPUT_DIR.
# sorted() makes the processing order deterministic; os.listdir returns
# entries in arbitrary order.
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith("_articles.json"):
        continue

    with open(os.path.join(INPUT_DIR, filename), encoding="utf-8") as f:
        articles = json.load(f)

    parsed = []
    for article in articles:
        url = article.get("url")
        if not url:
            continue

        full_text = extract_main_text_from_url(url)
        # Skip failed downloads ("" is returned on error) and pages with
        # fewer than 300 characters of paragraph text.
        if len(full_text) < 300:
            continue

        sentences = sent_tokenize(full_text)
        topics = extract_topics(full_text)

        parsed.append({
            "title": article.get("title"),
            # `or {}` also covers an explicit `"source": null` in the JSON,
            # which would otherwise raise AttributeError on `.get`.
            "source": (article.get("source") or {}).get("name"),
            "url": url,
            "publishedAt": article.get("publishedAt"),
            "sentences": sentences,
            "topics": topics,
        })

    output_path = os.path.join(OUTPUT_DIR, filename.replace("_articles.json", "_parsed.json"))
    with open(output_path, "w", encoding="utf-8") as f_out:
        json.dump(parsed, f_out, indent=2, ensure_ascii=False)

    print(f"Сохранено: {output_path}")


Сохранено: parsed_articles\au_parsed.json
Сохранено: parsed_articles\ca_parsed.json
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', port=443): Read timed out. (read timeout=10)
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', port=443): Read timed out. (read timeout=10)
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', port=443): Read timed out. (read timeout=10)
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', port=443): Read timed out. (read timeout=10)
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', port=443): Read timed out. (read timeout=10)
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', port=443): Read timed out. (read timeout=10)
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', port=443): Read timed out. (read timeout=10)
Ошибка при извлечении текста: HTTPSConnectionPool(host='www.swissinfo.ch', p