In [1]:
import os
import json
import re

import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize() needs the Punkt sentence model at call time. Older NLTK
# releases load the pickled "punkt" resource; newer releases load "punkt_tab"
# instead and raise LookupError when only "punkt" is installed (see the NLTK
# release notes for the exact version of the switch). Fetching both keeps the
# notebook runnable on a fresh kernel with either generation of NLTK.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Олег\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Directory that is scanned for "*_parsed.json" article files.
input_dir = "parsed_articles"
# Directory that receives the cleaned JSON files.
output_dir = "cleaned_articles"

# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
def clean_text(text):
    """Strip HTML tags and URLs from ``text`` and normalise its whitespace.

    Args:
        text: Raw article text. ``None`` or an empty string yields ``""``.

    Returns:
        The text with ``<...>`` tags and ``http://`` / ``https://`` URLs
        removed, every run of whitespace collapsed to a single space, and no
        leading or trailing whitespace.
    """
    if not text:
        return ""
    # Tags are removed first: a URL inside an attribute (href="http://...") is
    # glued to the link text and closing tag with no whitespace in between, so
    # stripping URLs first would swallow `">link</a>` and leave a dangling
    # `<a href="` behind. `[^>]*` (unlike `.*?`) also matches tags that wrap
    # across a line break.
    text = re.sub(r"<[^>]*>", "", text)
    # Require the scheme separator so ordinary words that merely start with
    # "http" (e.g. "httpd") are kept.
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
def filter_sentences(sentences, min_length=20):
    """Keep only sentences that are long enough and contain a Latin letter.

    Args:
        sentences: Iterable of sentence strings.
        min_length: A sentence is kept only when its stripped length is
            strictly greater than this value. Defaults to 20.

    Returns:
        A list of the surviving sentences, each stripped of leading and
        trailing whitespace, in their original order.
    """
    stripped = (sentence.strip() for sentence in sentences)
    return [
        sentence
        for sentence in stripped
        # The letter check drops fragments made only of digits, punctuation
        # or non-Latin characters.
        if len(sentence) > min_length and re.search(r"[a-zA-Z]", sentence)
    ]

In [None]:
# Only "<name>_parsed.json" files are processed; each one is written back out
# as "<name>_cleaned.json".
parsed_suffix = "_parsed.json"
cleaned_suffix = "_cleaned.json"

# sorted() makes the processing (and log) order reproducible; os.listdir()
# returns directory entries in arbitrary order.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(parsed_suffix):
        continue

    input_path = os.path.join(input_dir, filename)
    try:
        with open(input_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        # One unreadable or malformed file must not abort the whole batch.
        print(f"Skipped (could not read): {input_path} ({exc})")
        continue

    if not isinstance(data, dict):
        # The .get() lookups below need a JSON object at the top level.
        print(f"Skipped (not a JSON object): {input_path}")
        continue

    translated_text = data.get("translated_text_en", "")
    # Articles without usable translated text produce no output file.
    if not isinstance(translated_text, str) or not translated_text:
        continue

    cleaned = clean_text(translated_text)
    sentences = sent_tokenize(cleaned)
    filtered_sentences = filter_sentences(sentences)

    output_data = {
        "title": data.get("title"),
        "published_date": data.get("published_date"),
        "url": data.get("url"),
        "original_language": data.get("original_language"),
        "cleaned_sentences": filtered_sentences
    }

    # Swap only the trailing suffix; str.replace() would also rewrite any
    # earlier "_parsed" occurring inside the file name.
    base_name = filename[:-len(parsed_suffix)]
    output_path = os.path.join(output_dir, f"{base_name}{cleaned_suffix}")

    with open(output_path, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False writes non-ASCII characters as-is rather than
        # as \uXXXX escapes.
        json.dump(output_data, out_file, ensure_ascii=False, indent=2)

    print(f"Cleaned and saved: {output_path}")

Cleaned and saved: cleaned_articles\au_articles_0_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_10_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_11_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_12_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_13_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_14_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_15_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_16_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_17_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_18_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_19_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_1_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_20_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_21_cleaned.json
Cleaned and saved: cleaned_articles\au_articles_22_cleaned.json
Cleaned and saved: cleaned_articles\au_art