## Mining from OSCAR

In [None]:
!pip install datasets
from datasets import load_dataset
import json
import os

# --- Configuration ---------------------------------------------------------
TARGET_SIZE_MB = 150                # stop writing once this many MiB were written
SENTENCE_MIN_LEN = 30               # shortest accepted text, in characters
SENTENCE_MAX_LEN = 300              # longest accepted text, in characters
OUTPUT_FILE = "oscar_arabic.json"
EST_MB_PER_SAMPLE = 7.3 / 100_000   # estimated MB per sample (7.3 MB per 100,000)
EST_SAMPLES = int(TARGET_SIZE_MB / EST_MB_PER_SAMPLE)

print(f"Downloading ~{EST_SAMPLES:,} samples to reach ~{TARGET_SIZE_MB}MB")
# Stream the corpus instead of slicing the split: a slice such as
# "train[:N]" makes `load_dataset` download and prepare every shard of the
# dataset before the first N rows can be selected.
dataset = load_dataset(
    "oscar", "unshuffled_deduplicated_ar", split="train", streaming=True
).take(EST_SAMPLES)

# Keep only texts whose stripped length lies within the configured bounds.
filtered_sentences = []
for item in dataset:
    sentence = item["text"].strip()
    if SENTENCE_MIN_LEN <= len(sentence) <= SENTENCE_MAX_LEN:
        filtered_sentences.append(sentence)

# Write one JSON object per line and stop as soon as the size target is
# exceeded.  The size is tracked from the bytes actually written: asking the
# filesystem for the size of a file that is still open and buffered costs a
# stat() call per line and does not include data still sitting in the buffer.
target_bytes = TARGET_SIZE_MB * 1024 * 1024
bytes_written = 0
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for sentence in filtered_sentences:
        line = json.dumps({"text": sentence}, ensure_ascii=False) + "\n"
        f.write(line)
        bytes_written += len(line.encode("utf-8"))
        if bytes_written > target_bytes:
            print(f"✅ Reached target file size: {TARGET_SIZE_MB}MB")
            break

print(f"✅ Done! Saved to: {OUTPUT_FILE}")

Downloading ~2,054,794 samples to reach ~150MB


Old caching folder /root/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_ar/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2 for dataset oscar exists but no data were found. Removing it. 


Downloading data files:   0%|          | 0/18 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/557M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/559M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/559M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/563M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/560M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/560M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/562M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/560M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/557M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/561M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/559M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/561M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/561M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/560M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/560M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/154M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9006977 [00:00<?, ? examples/s]

KeyboardInterrupt: 

## URL Mining

In [None]:
!pip install newspaper3k lxml[html_clean] tqdm scrapy
from scrapy import Spider
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse, unquote
from newspaper import Article
from scrapy.crawler import CrawlerProcess

class BetterArticleSpider(Spider):
    """Crawl the seed sites listed in a file and emit extracted articles.

    Every fetched page is run through newspaper's ``Article`` extractor
    (language ``'ar'``).  Pages whose extracted text is longer than 500
    characters are yielded as items; links on every page are followed as long
    as they stay on one of the seed domains.  The spider asks the engine to
    close once ``ARTICLE_LIMIT`` articles have been yielded.
    """

    name = "better_article_spider"
    ARTICLE_LIMIT = 100  # Set your desired limit here

    def __init__(self, urls_file=None, *args, **kwargs):
        """Read the seed URLs (one per line) from *urls_file*.

        Args:
            urls_file: Path of a UTF-8 text file with one URL per line.  Blank
                lines are skipped.  When omitted the spider has nothing to
                crawl.

        Raises:
            FileNotFoundError: If *urls_file* is given but does not exist.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = []
        self.allowed_domains = []
        self.article_count = 0
        # Built once: the extractor is stateless across responses, so there
        # is no need to construct a new one for every parsed page.
        self._link_extractor = LinkExtractor()

        if urls_file:
            try:
                with open(urls_file, 'r', encoding='utf-8') as f:
                    for line in f:
                        url = line.strip()
                        if not url:
                            continue
                        self.start_urls.append(url)
                        domain = urlparse(url).netloc
                        # Skip empty hosts and avoid listing a domain twice
                        # when several seeds share it.
                        if domain and domain not in self.allowed_domains:
                            self.allowed_domains.append(domain)
            except FileNotFoundError:
                self.logger.error(f"URLs file not found: {urls_file}")
                raise
        else:
            self.logger.warning("No URLs file provided. Spider will not start without start_urls.")

    def start_requests(self):
        """Yield one request per seed URL, or nothing when there are none."""
        if not self.start_urls:
            self.logger.info("No start URLs to crawl. Exiting.")
            return

        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response):
        """Extract an article from *response* and follow same-domain links.

        Yields:
            A dict with ``url``, ``title``, ``text`` and ``publish_date`` when
            the extracted text exceeds 500 characters, followed by requests
            for the links found on the page.
        """
        # Check the limit first so that pages arriving after the limit was
        # hit are not run through the (comparatively expensive) extractor.
        if self.article_count >= self.ARTICLE_LIMIT:
            self.crawler.engine.close_spider(self, 'Article limit reached')
            return

        article = Article(response.url, language='ar')
        # Reuse the body Scrapy already fetched instead of downloading again.
        article.download(input_html=response.text)
        article.parse()
        text = article.text.strip()

        if len(text) > 500:
            yield {
                # Percent-decoded so the stored URL is readable.
                "url": unquote(response.url),
                "title": article.title,
                "text": text,
                "publish_date": str(article.publish_date) if article.publish_date else None
            }
            self.article_count += 1

        for link in self._link_extractor.extract_links(response):
            link_domain = urlparse(link.url).netloc
            # With no seed domains recorded, every link is followed.
            if not self.allowed_domains or link_domain in self.allowed_domains:
                yield response.follow(link, callback=self.parse)

if __name__ == "__main__":
    output_file = "articles.json"
    urls_input_file = "urls.txt"

    # Mode 'x' creates the sample seed list only when no file exists yet, so
    # a list the user already prepared is never overwritten.
    try:
        with open(urls_input_file, 'x', encoding='utf-8') as f:
            f.write("https://mawdoo3.com/\n")
            f.write("https://www.aljazeera.net/\n")
            f.write("https://arabic.cnn.com/\n")
        print(f"Created a sample '{urls_input_file}' with example URLs.")
    except FileExistsError:
        print(f"'{urls_input_file}' already exists. Using existing file.")

    process = CrawlerProcess(settings={
        "FEEDS": {
            output_file: {
                "format": "json",
                "encoding": "utf8",
                "store_empty": False,
                "indent": 2,
                # Local-file feeds append by default; appending a second
                # JSON array to an existing file leaves it unparseable, so
                # each run replaces the previous export instead.
                "overwrite": True,
            },
        },
        "ROBOTSTXT_OBEY": True,
        "DOWNLOAD_DELAY": 1,
        "DEPTH_LIMIT": 10,
        "LOG_LEVEL": "INFO",
    })

    process.crawl(BetterArticleSpider, urls_file=urls_input_file)
    # Blocks until the crawl has finished.
    process.start()

## Preprocessing and Simplifying Data
# Note: After mining, manually add the text and double-check the data mined from articles.json and oscar_arabic.json.

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

!nohup ollama serve &

import time
# `ollama serve` was started in the background just above; pause briefly so
# the server has a chance to come up before the model is pulled.
time.sleep(5)
print("✅ Ollama server is starting in the background.")

!ollama pull gemma3:4b
!ollama list

!pip install -q ollama

import json
import re
from tqdm import tqdm
import ollama
import os

def clean_text(text):
    """Strip a raw string down to plain, de-vocalised Arabic words.

    The steps run in a fixed order:

    1. drop the combining marks U+064B..U+0655, map 'آ' to 'ا', drop '۩';
    2. drop anything enclosed between ornate parentheses;
    3. drop a leading "<digits>." enumeration prefix;
    4. turn the Arabic comma into a space;
    5. insert a space before "ال" and before a fixed list of keywords
       whenever they are not already preceded by whitespace;
    6. drop ASCII letters/digits, round brackets, hyphens and Arabic-Indic
       digits, then everything outside U+0600..U+06FF except whitespace and
       ``. ! ? ؟``;
    7. collapse whitespace and discard one-character tokens.

    NOTE(review): step 5 also fires in the middle of ordinary words that
    merely contain "ال", and a one-letter prefix split off this way is then
    discarded by step 7 — confirm this is intended.
    """
    glued_keywords = ['تاريخ','التسجيل','اسم','المستعمل','مزاج','عدد','رسائل','صفحة']
    keyword_splitter = r'(?<!\s)(' + '|'.join(glued_keywords) + r')'

    cleaned = re.sub(r"[\u064B-\u0655]", "", text)
    cleaned = cleaned.replace('آ', 'ا')
    cleaned = cleaned.replace('۩', '')
    cleaned = re.sub(r'[﴾﴿].*?[﴾﴿]', '', cleaned)
    cleaned = re.sub(r'^\s*\d+\.\s*', '', cleaned)
    cleaned = cleaned.replace('،', ' ')

    # The remaining steps are all regex substitutions; order matters.
    ordered_substitutions = (
        (r'(?<!\s)(ال)', r' \1'),
        (keyword_splitter, r' \1'),
        (r"[A-Za-z0-9\(\)\-]", ""),
        (r"[\u0660-\u0669]", ""),
        (r"[^\u0600-\u06FF\s\.\!\?\؟]", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in ordered_substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_words = []
    for word in cleaned.strip().split():
        if len(word) > 1:
            kept_words.append(word)
    return ' '.join(kept_words)

# --- Load and clean the input texts -----------------------------------------
input_path = '/content/5k_oscar.json'
try:
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    texts = [clean_text(item["text"]) for item in data if "text" in item and item["text"].strip()]
except FileNotFoundError:
    print(f"❌ Error: '{input_path}' not found. Please upload the file.")
    texts = []

output_file = "simple_pairs_ollama_corrected.json"


def _save_pairs(pairs, path):
    """Write *pairs* to *path* as JSON without ever leaving a partial file.

    The data goes to a temporary sibling file first and is then moved over
    *path*, so an interruption in the middle of a write cannot truncate the
    checkpoint that a later run resumes from.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(pairs, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)


# --- Resume from a previous run, if any --------------------------------------
processed_texts = set()
simplified_pairs = []
if os.path.exists(output_file):
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            simplified_pairs = json.load(f)
        processed_texts = {pair["complex"] for pair in simplified_pairs}
        print(f"Loaded {len(simplified_pairs)} previously processed texts.")
    except Exception as e:
        print(f"Error loading existing output file: {e}")
        # Start from scratch, but move the unreadable file aside first: the
        # first save below would otherwise overwrite it.
        simplified_pairs = []
        processed_texts = set()
        backup_file = f"{output_file}.bak"
        os.replace(output_file, backup_file)
        print(f"Moved unreadable output file to '{backup_file}'.")

# --- Correct and simplify every text not handled yet -------------------------
if texts:
    print(f"Found {len(texts)} texts to process with the improved Ollama prompt...")
    failed_count = 0
    for text in tqdm(texts, desc="Correcting & Simplifying"):
        # Skip texts that cleaned down to nothing and texts already done,
        # including duplicates seen earlier in this same run.
        if not text or text in processed_texts:
            continue

        try:
            response = ollama.chat(
                model='gemma3:4b',
                messages=[
                    {
                        'role': 'system',
                        'content': '''You are an expert in Arabic linguistics. Your task has two steps:
                        The text you receive may have grammatical or spelling errors, and its meaning might be unclear. First, understand the intended meaning and correct the text to be a grammatically sound and meaningful Arabic sentence.
                        After correcting the text, simplify this corrected version to be easy for a general audience to understand.
                        Please provide only the final, simplified text, without any introductions or explanations about the correction process. '''
                    },
                    {
                        'role': 'user',
                        'content': f'النص المطلوب معالجته: "{text}"',
                    },
                ]
            )
            simplified = response['message']['content'].strip()
        except Exception as e:
            # Do not record the failure as a pair: a stored "Error: ..."
            # entry would mark the text as processed and it would never be
            # retried on the next run.
            print(f"\nAn error occurred while processing text: {e}")
            failed_count += 1
            continue

        simplified_pairs.append({"complex": text, "simplification": simplified})
        processed_texts.add(text)

        # Checkpoint after every pair so an interrupted run can resume.
        try:
            _save_pairs(simplified_pairs, output_file)
        except OSError as e:
            # Keep going: the pair is still in memory and is written by the
            # next successful checkpoint.
            print(f"\nCould not write checkpoint: {e}")

    if failed_count:
        print(f"\n{failed_count} texts failed and will be retried on the next run.")
    print(f"\n✅ Saved to {output_file}")
else:
    print("No texts to process.")

>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
nohup: appending output to 'nohup.out'
✅ Ollama server is starting in the background.
Loaded 1420 previously processed texts.
Found 5000 texts to process with the improved Ollama prompt...


Correcting & Simplifying: 100%|██████████| 5000/5000 [46:40<00:00,  1.79it/s]


✅ Saved to simple_pairs_ollama_corrected.json





## Postprocessing

In [None]:
import json
import re
import os

input_file = "simple_pairs_ollama_corrected.json"
output_file = "simple_pairs_ollama_corrected_cleaned.json"

# Matches a single ASCII Latin letter, upper or lower case.
english_pattern = re.compile(r'[A-Za-z]')


def is_english(text):
    """Return True when *text* holds at least one ASCII Latin letter."""
    latin_hit = english_pattern.search(text)
    return latin_hit is not None

def read_json_with_fallback(file_path):
    """Load JSON from *file_path*, tolerating a BOM and stray invalid bytes.

    The file is first read as strict UTF-8 (a leading byte-order mark is
    accepted).  If that fails to decode, it is read again as UTF-8 with the
    undecodable bytes dropped.  A Latin-1 pass is deliberately not used as a
    fallback: Latin-1 decoding accepts any byte sequence, so it would never
    fail and would hand back UTF-8 text as mojibake.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or ``None`` when the file cannot be read or
        does not contain valid JSON.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 too; it only strips a leading BOM,
        # which the JSON parser would otherwise reject.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except UnicodeDecodeError:
        print("⚠️ UTF-8 decoding failed. Reading with errors='ignore'...")
    except Exception as e:
        print(f"❌ Failed to read JSON file: {e}")
        return None

    # Second attempt lives outside the handler above so that its own failures
    # are reported and turned into ``None`` instead of escaping to the caller.
    try:
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            return json.load(f)
    except Exception as e:
        print(f"❌ Failed to read JSON file: {e}")
        return None

# Stop early when there is nothing to clean.  SystemExit is raised directly
# because the `exit` name is a helper the `site` module provides for
# interactive sessions and is not meant for use in programs.
if not os.path.exists(input_file):
    print(f"❌ Error: '{input_file}' not found.")
    raise SystemExit(1)

data = read_json_with_fallback(input_file)
if data is None:
    print("❌ Unable to load JSON file. Exiting.")
    raise SystemExit(1)

initial_count = len(data)

# Keep only the pairs whose simplification has no Latin letters.  A missing
# simplification counts as an empty string; non-string values are checked
# through their string form.
filtered_data = [
    entry
    for entry in data
    if not is_english(str(entry.get("simplification", "")))
]

removed_count = initial_count - len(filtered_data)

try:
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)
    print(f"✅ Processed {initial_count} entries.")
    print(f"Removed {removed_count} entries containing English text.")
    print(f"Saved cleaned data to '{output_file}'.")
except (OSError, ValueError) as e:
    # OSError: the file could not be written; ValueError covers text that
    # cannot be encoded as UTF-8.
    print(f"❌ Error saving cleaned JSON file: {e}")

✅ Processed 4875 entries.
Removed 2708 entries containing English text.
Saved cleaned data to 'simple_pairs_ollama_corrected_cleaned.json'.
