In [1]:
import fitz   # PyMuPDF
import re
import json

# Newspaper issue to process
pdf_path = "01.01.2021.pdf"

# Load the issue through PyMuPDF
doc = fitz.open(pdf_path)

# Lookup table: CID placeholder -> Kazakh character it stands for.
# (The table may need adjusting per PDF; these are the common entries.)
cid_to_char = {
    'cid:19': 'Ө', 'cid:25': 'ө',  # likely mappings from analysis
    'cid:4': 'Ә', 'cid:8': 'ә',
    'cid:5': 'і',
    # ... (include other mappings as identified)
}

# Issue-level metadata, filled in while scanning the first page:
# journal name, issue date, issue number.
journal_name = issue_date = issue_number = ""

# Utility function to clean and fix extracted text
def fix_text(text, mapping=None):
    """Replace CID placeholders and Latin stand-ins with correct Kazakh characters.

    Args:
        text: Raw text extracted from the PDF.
        mapping: Optional dict of ``"cid:NN"`` -> replacement character.
            When omitted, the module-level ``cid_to_char`` table is used.

    Returns:
        The text with every known placeholder substituted and Latin schwa
        letters converted to their Cyrillic look-alikes.
    """
    if mapping is None:
        mapping = cid_to_char

    for cid, char in mapping.items():
        # Three spellings are handled for each placeholder:
        #   "(cid:NN)"  - PDFMiner style
        #   "cid:NN)"   - opening parenthesis lost
        #   "cid:NN "   - followed by a space (the space is consumed)
        for placeholder in (f"({cid})", f"{cid})", f"{cid} "):
            text = text.replace(placeholder, char)

    # Latin schwa is a common mis-mapping for the Kazakh letter; convert both
    # cases (U+0259 -> U+04D9, U+018F -> U+04D8) in a single pass.
    return text.translate({0x0259: 0x04D9, 0x018F: 0x04D8})

# Function to split text into sentences and return the first N sentences as abstract
def get_abstract(text, num_sentences=10):
    """Return the leading ``num_sentences`` sentences of ``text`` joined by single spaces.

    A sentence boundary is any whitespace run that directly follows
    ``.``, ``!`` or ``?``.
    """
    leading = re.split(r'(?<=[\.!?])\s+', text)[:num_sentences]
    return " ".join(leading)

# Spans larger than this size (pt) are treated as headline text.
# Simple heuristic; adjust to the typical font sizes of the newspaper.
TITLE_FONT_SIZE = 15


def _block_text(block):
    """Return all span texts of a text block concatenated and stripped."""
    # NOTE(review): lines are joined without a separator, so the last word of
    # one line may fuse with the first word of the next - confirm against the PDF.
    return "".join(
        span["text"] for line in block["lines"] for span in line["spans"]
    ).strip()


def _max_font_size(block):
    """Return the largest span font size in a text block (0 when it has no spans)."""
    return max(
        (span["size"] for line in block["lines"] for span in line["spans"]),
        default=0,
    )


def _reading_order_key(block):
    """Sort key: top-to-bottom, then left-to-right; non-text blocks go last."""
    if block["type"] != 0:
        return (float('inf'), float('inf'))
    bbox = block["lines"][0]["bbox"]
    return (round(bbox[1]), bbox[0])


def _split_title_block(block, max_size):
    """Classify the lines of a title block into (title_lines, topic, author, body_lines)."""
    title_lines, body_lines = [], []
    topic_line = author_line = ""
    for line in block["lines"]:
        spans = line["spans"]
        if not spans:
            continue
        first = spans[0]
        text_line = "".join(span["text"] for span in spans).strip()
        if first["size"] == max_size:
            # Lines set in the block's largest size form the headline
            title_lines.append(text_line)
        elif text_line.isupper():
            # An all-caps line is taken to be the topic/rubric
            topic_line = text_line
        elif "Italic" in first["font"].split('+')[-1] or first.get("flags", 0) & 2:
            # Italic font name, or span flag value 2 (taken to mean italic), marks the author
            author_line = text_line
        else:
            body_lines.append(text_line)
    return title_lines, topic_line, author_line, body_lines


articles = []  # collected article dictionaries, in extraction order

# Walk the pages and extract one article per title block
for page_index, page in enumerate(doc):
    blocks = page.get_text("dict")["blocks"]  # text blocks with span details
    blocks.sort(key=_reading_order_key)

    for block_index, block in enumerate(blocks):
        if block["type"] != 0:  # skip non-text blocks (images, etc.)
            continue
        block_text = _block_text(block)
        if not block_text:
            continue

        # Header info lives on the first page. Each field is searched
        # independently and only until it has been found, so a missing journal
        # name (e.g. a logo image) neither blocks nor keeps overwriting the rest.
        if page_index == 0:
            if not journal_name:
                match_journal = re.search(r"Егемен\s*Қазақстан|Egemen\s*Qazaqstan", block_text)
                if match_journal:
                    journal_name = match_journal.group(0)
            if not issue_number:
                # Issue number, e.g. "№84"
                match_number = re.search(r"№\s*\d+", block_text)
                if match_number:
                    issue_number = match_number.group(0)
            if not issue_date:
                # Date, e.g. "1 МАМЫР 2020" or "1 Мамыр 2020"
                match_date = re.search(r"\d{1,2}\s*[^0-9\s]{3,}\s*\d{4}", block_text)
                if match_date:
                    issue_date = match_date.group(0)

        max_size = _max_font_size(block)
        if not max_size > TITLE_FONT_SIZE:
            continue  # only a block containing large text starts an article

        title_lines, topic_line, author_line, body_lines = _split_title_block(block, max_size)

        # Body text: the non-title lines of this block, then every following
        # text block up to (not including) the next title block. The current
        # position comes from enumerate(), so identical-looking blocks cannot
        # be confused with one another.
        text_parts = [" ".join(body_lines)]
        for next_block in blocks[block_index + 1:]:
            if next_block["type"] != 0:
                continue
            if _max_font_size(next_block) > TITLE_FONT_SIZE:
                break
            text_parts.append(_block_text(next_block))
        text_content = fix_text(" ".join(text_parts))

        articles.append({
            "title": fix_text(" ".join(title_lines)),
            "topic": fix_text(topic_line),
            "author": fix_text(author_line),
            # Abstract: first 10 sentences of the text
            "abstract": get_abstract(text_content, 10),
            "text": text_content,
            "journal": journal_name,
            "date": issue_date,
            "number": issue_number,
        })

article_count = len(articles)

# Issue-level metadata applies to every article, including those extracted
# before the header block was reached, so fill it in once scanning is done.
for article_data in articles:
    article_data["journal"] = journal_name
    article_data["date"] = issue_date
    article_data["number"] = issue_number

# Save one JSON file per article, named by its 1-based index
for number, article_data in enumerate(articles, start=1):
    filename = f"{number}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(article_data, f, ensure_ascii=False, indent=4)

In [2]:
#pip install pymupdf

In [3]:
import fitz  # PyMuPDF
import re
import json

# Input issue and the single JSON file all extracted articles go to
pdf_path = "01.01.2021.pdf"
output_json = "test_01.01.2021.json"

doc = fitz.open(pdf_path)

# CID placeholder -> Kazakh character
cid_to_char = {
    'cid:19': 'Ө',
    'cid:25': 'ө',
    'cid:4': 'Ә',
    'cid:8': 'ә',
    'cid:5': 'і',
}

def fix_text(text, mapping=None):
    """Replace CID placeholders and Latin schwa letters with Kazakh characters.

    Args:
        text: Raw text extracted from the PDF.
        mapping: Optional dict of ``"cid:NN"`` -> replacement character.
            When omitted, the module-level ``cid_to_char`` table is used.

    Returns:
        The text with every known placeholder substituted and Latin schwa
        letters converted to their Cyrillic look-alikes.
    """
    if mapping is None:
        mapping = cid_to_char

    for cid, char in mapping.items():
        # Placeholder spellings: "(cid:NN)", "cid:NN)" (opening parenthesis
        # lost) and "cid:NN " (trailing space, which is consumed).
        for placeholder in (f"({cid})", f"{cid})", f"{cid} "):
            text = text.replace(placeholder, char)

    # Convert both cases of the Latin schwa in one pass
    # (U+0259 -> U+04D9, U+018F -> U+04D8).
    return text.translate({0x0259: 0x04D9, 0x018F: 0x04D8})

def get_abstract(text, num_sentences=10):
    """Return the first ``num_sentences`` sentences of ``text``, space-joined.

    Sentences are separated at whitespace that follows ``.``, ``!`` or ``?``.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    leading = pieces[:num_sentences]
    return " ".join(leading)

# Spans larger than this size (pt) mark a block as the start of an article.
TITLE_FONT_SIZE = 15


def _block_text(block):
    """Return all span texts of a text block concatenated and stripped."""
    # NOTE(review): lines are joined without a separator, so words at line
    # ends may fuse with the next line - confirm against the PDF.
    return "".join(
        span["text"] for line in block["lines"] for span in line["spans"]
    ).strip()


def _max_font_size(block):
    """Return the largest span font size in a text block (0 when it has no spans)."""
    return max(
        (span["size"] for line in block["lines"] for span in line["spans"]),
        default=0,
    )


def _reading_order_key(block):
    """Sort key: top-to-bottom, then left-to-right; non-text blocks go last."""
    if block["type"] != 0:
        return (float('inf'), float('inf'))
    bbox = block["lines"][0]["bbox"]
    return (round(bbox[1]), bbox[0])


def _split_title_block(block, max_size):
    """Classify the lines of a title block into (title_lines, topic, author, body_lines)."""
    title_lines, body_lines = [], []
    topic_line = author_line = ""
    for line in block["lines"]:
        spans = line["spans"]
        if not spans:
            continue
        first = spans[0]
        text_line = "".join(span["text"] for span in spans).strip()
        if first["size"] == max_size:
            # Lines set in the block's largest size form the headline
            title_lines.append(text_line)
        elif text_line.isupper():
            # An all-caps line is taken to be the category
            topic_line = text_line
        elif "Italic" in first["font"].split('+')[-1] or first.get("flags", 0) & 2:
            # Italic font name, or span flag value 2 (taken to mean italic), marks the author
            author_line = text_line
        else:
            body_lines.append(text_line)
    return title_lines, topic_line, author_line, body_lines


articles = []
journal_name = ""
issue_date = ""
issue_number = ""

for page_index, page in enumerate(doc):
    blocks = page.get_text("dict")["blocks"]
    blocks.sort(key=_reading_order_key)

    for block_index, block in enumerate(blocks):
        if block["type"] != 0:
            continue
        block_text = _block_text(block)
        if not block_text:
            continue

        # Header fields are looked for on the first page only, each one until
        # its first hit; every pattern is evaluated once per block.
        if page_index == 0:
            if not journal_name and re.search(r"Егемен\s*Қазақстан|Egemen\s*Qazaqstan", block_text):
                journal_name = "Egemen Qazaqstan"
            if not issue_number:
                number_match = re.search(r"№\s*\d+", block_text)
                if number_match:
                    issue_number = number_match.group(0)
            if not issue_date:
                date_match = re.search(r"\d{1,2}\s*[^0-9\s]{3,}\s*\d{4}", block_text)
                if date_match:
                    issue_date = date_match.group(0)

        max_size = _max_font_size(block)
        if not max_size > TITLE_FONT_SIZE:
            continue  # only a block containing large text starts an article

        title_lines, topic_line, author_line, body_lines = _split_title_block(block, max_size)

        # Body text: the non-title lines of this block, then every following
        # text block up to (not including) the next title block. The position
        # comes from enumerate(), avoiding a list rescan that could land on an
        # earlier block with identical content.
        text_parts = [" ".join(body_lines)]
        for next_block in blocks[block_index + 1:]:
            if next_block["type"] != 0:
                continue
            if _max_font_size(next_block) > TITLE_FONT_SIZE:
                break
            text_parts.append(_block_text(next_block))
        text_content = fix_text(" ".join(text_parts))

        articles.append({
            "url": "",
            "title": fix_text(" ".join(title_lines)),
            "date": issue_date,
            "author": fix_text(author_line),
            "abstract": get_abstract(text_content),
            "text": text_content,
            "journal": journal_name,
            "category": fix_text(topic_line),
        })

article_count = len(articles)

# Issue-level metadata applies to every article, including those extracted
# before the header block was reached, so fill it in once scanning is done.
for article_data in articles:
    article_data["date"] = issue_date
    article_data["journal"] = journal_name

# Save all articles into a single JSON file
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(articles, f, ensure_ascii=False, indent=4)

print(f"✅ Извлечено {len(articles)} статей. Сохранено в {output_json}")


✅ Извлечено 76 статей. Сохранено в test_01.01.2021.json


In [5]:
#pip install sentence-transformers scikit-learn


In [6]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Settings
threshold = 0.85  # a pair counts as a match when its similarity is >= this value
alpha = 0.5       # weight of precision in the holistic score (TSS gets 1 - alpha)
tags = [
    "title",
    "author",
    "date",
    "abstract",
    "text",
    "journal",
    "category",
]
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Files to compare, as (Test, Train) pairs - one pair per issue date
file_pairs = [
    (f"Test_{stamp}.json", f"Train_{stamp}.json")
    for stamp in ("01_01_2021", "20_05_2021", "23_01_2020")
]

# Compute the per-tag metrics for one pair of article files
def evaluate_pair(test_file, train_file):
    """Compare two JSON article lists tag by tag and print the metrics.

    Each file must hold a JSON list of article dicts; the i-th entries of the
    two lists are compared with each other. For every tag in ``tags`` the
    values are embedded with ``model`` and compared by cosine similarity.

    Printed (and returned) per tag:
        precision: share of pairs whose similarity is >= ``threshold``
        tss:       mean similarity over all pairs
        holistic:  ``alpha * precision + (1 - alpha) * tss``

    Args:
        test_file: Path of the JSON file loaded as the reference side.
        train_file: Path of the JSON file loaded as the predicted side.

    Returns:
        Dict mapping each tag to ``{"precision", "tss", "holistic"}`` floats;
        empty when the files contain no articles.

    Raises:
        AssertionError: If the two files hold a different number of articles.
    """
    with open(test_file, "r", encoding="utf-8") as f1, open(train_file, "r", encoding="utf-8") as f2:
        ref_data = json.load(f1)
        pred_data = json.load(f2)

    # Raised explicitly (not via `assert`) so the check survives `python -O`;
    # without it zip-style pairing would silently drop the surplus articles.
    if len(ref_data) != len(pred_data):
        raise AssertionError(f"Размеры файлов {test_file} и {train_file} не совпадают!")

    results = {}
    # With no articles there is nothing to average, so no rows are produced
    if ref_data:
        for tag in tags:
            ref_vals = [ref.get(tag, "") for ref in ref_data]
            pred_vals = [pred.get(tag, "") for pred in pred_data]
            # Embed each side in one batch instead of one call per value
            emb_ref = model.encode(ref_vals)
            emb_pred = model.encode(pred_vals)
            # The diagonal holds the similarity of each article with its counterpart
            sims = cosine_similarity(emb_ref, emb_pred).diagonal()

            tss_avg = float(np.mean(sims))
            precision = float(np.mean(sims >= threshold))
            results[tag] = {
                "precision": precision,
                "tss": tss_avg,
                "holistic": alpha * precision + (1 - alpha) * tss_avg,
            }

    print(f"\n=== Результаты для пары: {test_file} vs {train_file} ===")
    for tag, scores in results.items():
        precision = scores["precision"]
        tss_avg = scores["tss"]
        holistic = scores["holistic"]
        print(f"{tag:10s} | Precision: {precision:.3f} | TSS: {tss_avg:.3f} | Holistic: {holistic:.3f}")
    return results

# Run the evaluation for all three pairs
for pair in file_pairs:
    test_file, train_file = pair
    evaluate_pair(test_file, train_file)



=== Результаты для пары: Test_01_01_2021.json vs Train_01_01_2021.json ===
title      | Precision: 0.676 | TSS: 0.871 | Holistic: 0.774
author     | Precision: 0.735 | TSS: 0.866 | Holistic: 0.801
date       | Precision: 1.000 | TSS: 1.000 | Holistic: 1.000
abstract   | Precision: 1.000 | TSS: 1.000 | Holistic: 1.000
text       | Precision: 0.882 | TSS: 0.927 | Holistic: 0.905
journal    | Precision: 1.000 | TSS: 1.000 | Holistic: 1.000
category   | Precision: 1.000 | TSS: 1.000 | Holistic: 1.000

=== Результаты для пары: Test_20_05_2021.json vs Train_20_05_2021.json ===
title      | Precision: 0.800 | TSS: 0.928 | Holistic: 0.864
author     | Precision: 1.000 | TSS: 1.000 | Holistic: 1.000
date       | Precision: 1.000 | TSS: 1.000 | Holistic: 1.000
abstract   | Precision: 0.980 | TSS: 0.959 | Holistic: 0.970
text       | Precision: 0.940 | TSS: 0.928 | Holistic: 0.934
journal    | Precision: 1.000 | TSS: 1.000 | Holistic: 1.000
category   | Precision: 1.000 | TSS: 1.000 | Holistic: 