In [1]:
import os
import json
import re
import nltk
from textblob import TextBlob
from collections import Counter

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Олег\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Каталог с JSON-файлами
DATA_DIR = "./"
THEMES = {
    "finance": ["bank", "finance", "investment", "debt", "loan", "currency", "development"],
    "geopolitics": ["un", "security council", "usa", "russia", "china", "diplomacy", "alliances", "nato"],
    "technology": ["blockchain", "ai", "robotics", "technology", "5g", "nanotech", "innovation", "infrastructure"],
    "trade": ["exports", "imports", "trade", "sanctions", "tariffs", "market", "deal"],
    "energy": ["oil", "gas", "energy", "hydrocarbons", "solar", "renewable", "electricity"],
    "military": ["military", "army", "defense", "arms", "weapons", "nuclear"]
}

In [4]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [5]:
def classify_themes(text):
    text_lower = text.lower()
    found_themes = []
    for theme, keywords in THEMES.items():
        if any(word in text_lower for word in keywords):
            found_themes.append(theme)
    return found_themes if found_themes else ["other"]

In [6]:
def analyze_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0.1:
        return "positive"
    elif polarity < -0.1:
        return "negative"
    else:
        return "neutral"

In [7]:
def process_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        articles = json.load(f)

    results = []
    for article in articles:
        full_text = article.get("full_content") or article.get("content") or ""
        if not full_text:
            continue

        clean = clean_text(full_text)
        sentences = nltk.sent_tokenize(clean)
        text_joined = " ".join(sentences)

        sentiment = analyze_sentiment(text_joined)
        themes = classify_themes(text_joined)

        results.append({
            "title": article.get("title"),
            "url": article.get("url"),
            "sentiment": sentiment,
            "themes": themes
        })

    return results

In [8]:
# Обработка всех файлов в каталоге
all_results = []
for filename in os.listdir(DATA_DIR):
    if filename.endswith("_articles.json"):
        filepath = os.path.join(DATA_DIR, filename)
        country = filename.split("_")[0]
        print(f"Processing: {country}")
        res = process_file(filepath)
        for r in res:
            r["country"] = country
        all_results.extend(res)

# Сохраняем результаты
with open("classified_articles.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=4, ensure_ascii=False)


Processing: au
Processing: ca
Processing: ch
Processing: gb
Processing: ie
Processing: in
Processing: ph
Processing: pk
Processing: sg
Processing: us
