In [None]:
import json
import wikipediaapi
import time
from tqdm import tqdm


In [21]:
# --- 1. إعداد المسارات بدقة ---
INPUT_FILE = 'output/agriculture_pages_optimized.json'  # <- updated to pages JSON
OUTPUT_FILE = 'output/Final_agriculture_pages_ar.jsonl'  # <- updated output file

# --- 2. إعداد الاتصال بويكيبيديا ---
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent='MyAgriResearch/2.0 (research@example.com)',
    language='ar',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    timeout=15  # زيادة الوقت لتجنب الانقطاع
)


In [3]:
# 1. قراءة ملف الصفحات
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    unique_titles = json.load(f)
print(f"Loaded {len(unique_titles)} unique page titles.")

Loaded 2308 unique page titles.


In [None]:
import random
from urllib.parse import unquote

def get_article_details(title):
    """جلب تفاصيل المقالة من ويكيبيديا"""
    try:
        page = wiki_wiki.page(title)
        
        if not page.exists():
            return None
        
        return {
            "article_id": str(page.pageid),
            "title": page.title,
            "url": unquote(page.fullurl),
            "summary": page.summary.replace('\n', ' ').strip(),
            "full_text": page.text.replace('\n', ' ').strip()
        }
    except Exception as e:
        return None

# example
article = get_article_details(random.choice(unique_titles))
article

# Cleaning the full text

In [None]:
from IPython.display import display, HTML

display(HTML(f"<div style='white-space: pre-wrap; word-wrap: break-word;'>{article['full_text']}</div>"))


In [None]:
import re

def clean_wiki_text(text):
    """تنظيف النصوص من علامات الويكي والمراجع وحذف الأقسام المحددة بالكامل"""
    if not text:
        return ""
    
    # الأقسام المراد حذفها مع نصها بالكامل
    end_patterns = [
        r'==\s*المراجع\s*==',
        r'==\s*وصلات خارجية\s*==',
        r'==\s*انظر أيضًا\s*==',
        r'==\s*الهوامش\s*=='
    ]
    
    # البحث عن أول ظهور لأي قسم وحذف كل شيء بعده
    first_cut = len(text)
    for pattern in end_patterns:
        match = re.search(pattern, text)
        if match:
            first_cut = min(first_cut, match.start())
    
    text = text[:first_cut]
    
    # إزالة المراجع الرقمية [1], [2], ...
    text = re.sub(r'\[\d+\]', '', text)
    
    # إزالة أي وسوم wiki مثل [عدل]
    text = re.sub(r'\[.*?]', '', text)
    
    # إزالة المسافات الزائدة
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def is_valid_article(page_obj, cleaned_text):
    # استبعاد صفحات التوضيح
    if "توضيح" in page_obj.categories:
        return False, 0
    
    # استبعاد المقالات القصيرة جدًا
    word_count = len(cleaned_text.split())
    if word_count < 150:
        return False, word_count
        
    return True, word_count

display(HTML(f"<div style='white-space: pre-wrap; word-wrap: break-word;'>{clean_wiki_text(article['full_text'])}</div>"))

# getting the full dataset in jsonl

In [None]:
from urllib.parse import unquote


# Process all articles, clean text, validate, and save to output JSONL file
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
    for title in tqdm(unique_titles, desc="Processing articles"):
        try:
            # Get article details
            page = wiki_wiki.page(title)
            if not page.exists():
                continue

            # Clean the full text
            cleaned_text = clean_wiki_text(page.text.replace('\n', ' ').strip())

            # Validate the article
            is_valid = is_valid_article(page, cleaned_text)

            if is_valid[0]:  # assuming is_valid returns (True, word_count)
                # Prepare article data
                article_data = {
                    "article_id": str(page.pageid),
                    "title": page.title,
                    "url": unquote(page.fullurl),
                    "summary": page.summary.replace('\n', ' ').strip(),
                    "full_text": cleaned_text,
                    "word_count": is_valid[1]
                }
                # Write as one JSON line
                f_out.write(json.dumps(article_data, ensure_ascii=False) + "\n")
                f_out.flush()  # optional, ensures data is saved immediately

            time.sleep(0.1)  # polite delay to avoid hitting Wikipedia too fast

        except Exception as e:
            print(f"Error processing article: {title} error {e}")
            continue

print(f"Processed and saved {len(unique_titles)} articles to {OUTPUT_FILE} (JSONL format)")


Processing articles: 100%|██████████| 2308/2308 [39:13<00:00,  1.02s/it] 

Processed and saved 2308 articles to output/Final_agriculture_pages_ar.jsonl (JSONL format)



