In [1]:
from collections import Counter, defaultdict
from datasets import load_dataset
import unicodedata
import tiktoken
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import get_dataset_config_names, load_dataset
from transformers import AutoTokenizer
import re

In [2]:
# Two-letter language code -> English language name.
# Insertion order matters: it is the order in which languages are processed.
lang_name_map = dict(
    [
        ("en", "English"),
        ("ru", "Russian"),
        ("fr", "French"),
        ("de", "German"),
        ("zh", "Chinese"),
        ("he", "Hebrew"),
        ("it", "Italian"),
        ("es", "Spanish"),
        ("ar", "Arabic"),
        ("pt", "Portuguese"),
        ("ko", "Korean"),
        ("hu", "Hungarian"),
        ("sa", "Sanskrit"),
        ("la", "Latin"),
        ("ja", "Japanese"),
        ("el", "Greek"),
        ("sv", "Swedish"),
        ("nl", "Dutch"),
        ("pl", "Polish"),
        ("vi", "Vietnamese"),
        ("fa", "Persian"),
        ("no", "Norwegian"),
        ("tr", "Turkish"),
        ("fi", "Finnish"),
        ("cs", "Czech"),
        ("hy", "Armenian"),
        ("da", "Danish"),
        ("bn", "Bangla"),
        ("az", "Azerbaijani"),
        ("ka", "Georgian"),
        ("hi", "Hindi"),
        ("id", "Indonesian"),
    ]
)

# Language codes to process, in the same order as the map above.
languages = list(lang_name_map)

In [3]:
# Dataset identifiers passed to get_dataset_config_names() and load_dataset()
# in the cells below.
wikipedia_dataset_id = "wikimedia/wikipedia"
wikisource_dataset_id = "wikimedia/wikisource"

In [4]:
def save_list_to_file(data_list, filename):
    """Write the items of ``data_list`` to ``filename``, one item per line.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists. Each item is rendered through f-string formatting, so
    non-string items are accepted.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for entry in data_list:
            out.write(f"{entry}\n")

def load_list_from_file(filename):
    """Read ``filename`` as UTF-8 text and return its lines as a list of strings.

    Leading and trailing whitespace (including the line terminator) is
    stripped from every line; blank lines are returned as empty strings.
    """
    entries = []
    with open(filename, 'r', encoding='utf-8') as src:
        for raw_line in src:
            entries.append(raw_line.strip())
    return entries

def save_results_to_file(data_list, filename):
    """Write the ``str()`` representation of ``data_list`` to ``filename``.

    The file is opened in text mode with UTF-8 encoding and overwritten if it
    already exists. No trailing newline is added.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        # print() renders the object with str(); end='' suppresses the newline.
        print(data_list, end='', file=out)

In [5]:
from collections import Counter
import re
import itertools

# Number of most frequent n-grams kept per n-gram size.
top_n = 1000
# N-gram sizes to extract: 2, 3, 4 and 5 (the sizes also appear in the name of
# the results file, '_n-gramms-2-3-4-5.txt').
ngram_range = list(range(2, 6))

# Inclusive (first, last) character ranges that is_cjk() treats as CJK.
CJK_UNICODE_RANGES = [
    ('\u4e00', '\u9fff'),  # CJK Unified Ideographs
    ('\u3040', '\u309f'),  # Hiragana
    ('\u30a0', '\u30ff'),  # Katakana
    ('\uac00', '\ud7af'),  # Hangul Syllables
]

# Language codes whose text is always split into character-level n-grams.
_CHAR_LEVEL_LANGS = frozenset({"zh", "ja", "ko"})


class _WordCharTable(dict):
    """Lazily filled ``str.translate`` table that isolates word characters.

    Letters (Unicode general category ``L*``), combining marks (``M*``) and
    the ASCII apostrophe map to themselves; every other code point maps to a
    space. Each code point is classified once and then cached in the dict, so
    repeated translations only perform dictionary lookups.
    """

    def __missing__(self, codepoint):
        char = chr(codepoint)
        keep = char == "'" or unicodedata.category(char)[0] in ("L", "M")
        target = codepoint if keep else 0x20
        self[codepoint] = target
        return target


_WORD_CHAR_TABLE = _WordCharTable()


def is_cjk(char):
    """Return True if ``char`` lies inside one of ``CJK_UNICODE_RANGES``."""
    return any(start <= char <= end for start, end in CJK_UNICODE_RANGES)


def preprocess(text, lang):
    """Normalize ``text`` before n-gram extraction.

    Args:
        text: Raw text.
        lang: Language code; accepted for interface symmetry, currently unused.

    Returns:
        The text with typographic apostrophes and backticks replaced by the
        ASCII apostrophe, ASCII digits removed, and tab / carriage return /
        form feed / vertical tab replaced by a space. Newlines are kept.
    """
    # Unify apostrophe variants so contractions are counted consistently.
    text = text.replace("’", "'").replace("‘", "'").replace("`", "'")
    # Digits carry no language signal: drop them.
    text = re.sub(r'[0-9]', '', text)
    # Control whitespace becomes a plain space.
    text = re.sub(r'[\t\r\f\v]', ' ', text)
    return text


def extract_ngrams(text, lang, n):
    """Return the character n-grams of ``text`` in order of appearance.

    Two modes exist:

    * Character mode (``lang`` is ``"zh"``, ``"ja"`` or ``"ko"``): whitespace
      is removed and n-grams are taken over the remaining characters, so they
      may span word and punctuation boundaries.
    * Word mode (any other non-empty ``lang``): the text is split into words
      made of Unicode letters, combining marks and apostrophes, and n-grams
      never cross a word boundary. Apostrophes at the edges of a word are
      dropped; inner ones (``can't``) are kept. This covers every script, not
      only Latin and Cyrillic.

    When ``lang`` is empty or ``None`` the mode is guessed from the content:
    character mode if the text contains any character for which ``is_cjk``
    is true, word mode otherwise. An explicit ``lang`` always wins, so a
    stray CJK name inside e.g. an English article no longer switches the
    whole article to character mode.

    N-grams are counted in code points, so in scripts that use combining
    marks an n-gram may begin or end with a mark.

    Args:
        text: Raw text; it is passed through ``preprocess`` first.
        lang: Language code used to select the mode (see above).
        n: N-gram length, must be >= 1.

    Returns:
        A list of n-gram strings (possibly empty).

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    if lang in _CHAR_LEVEL_LANGS:
        char_level = True
    elif lang:
        char_level = False
    else:
        # No language hint: fall back to sniffing the content.
        char_level = any(is_cjk(c) for c in text)

    text = preprocess(text, lang)

    if char_level:
        compact = ''.join(c for c in text if not c.isspace())
        return [compact[i:i + n] for i in range(len(compact) - n + 1)]

    tokens = []
    # Every non-word character becomes a space, so split() yields the words.
    for word in text.translate(_WORD_CHAR_TABLE).split():
        word = word.strip("'")
        # range() is empty when the word is shorter than n.
        tokens.extend(word[i:i + n] for i in range(len(word) - n + 1))
    return tokens

In [None]:
# Fraction of each loaded dataset's rows that the extraction loop samples;
# applied to both the Wikipedia and the Wikisource split.
top_pct = 0.01 # 0.01 = 1% of the rows

In [13]:
# Config names of the Wikipedia dataset; the extraction loop picks, per
# language, the last one (in sorted order) that ends with ".<lang>".
configs_wikipedia = get_dataset_config_names(wikipedia_dataset_id)

In [14]:
# Config names of the Wikisource dataset; the extraction loop picks, per
# language, the last one (in sorted order) that ends with ".<lang>".
configs_wikisource = get_dataset_config_names(wikisource_dataset_id)

In [16]:
def _sample_texts(dataset_id, configs, lang, fraction, seed=42):
    """Return the 'text' field of a seeded random sample of one language's rows.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.
        configs: Config names of that dataset.
        lang: Language code; the last config (in sorted order) ending with
            ``.{lang}`` is loaded.
        fraction: Share of the 'train' split to sample; the sample size is
            ``int(len(split) * fraction)`` and may therefore be 0.
        seed: Shuffle seed.

    Raises:
        LookupError: if no config name ends with ``.{lang}``.
    """
    matching = sorted(c for c in configs if c.endswith(f".{lang}"))
    if not matching:
        raise LookupError(f"no config ending with '.{lang}' in {dataset_id}")
    ds = load_dataset(dataset_id, matching[-1], split="train")
    sample_size = int(len(ds) * fraction)
    ds_sub = ds.shuffle(seed=seed).select(range(sample_size))
    return [row['text'] for row in ds_sub]


# One entry per successfully processed language.
results = []

for lang in languages:
    try:
        # Start from an empty list for every language: otherwise a failed load
        # would silently reuse the texts of the previous language.
        texts = []

        try:
            texts += _sample_texts(wikipedia_dataset_id, configs_wikipedia, lang, top_pct)
        except Exception as e:
            # Best effort: a language may still be covered by Wikisource.
            print(f"Error in Wikipedia for {lang}: {e}")

        # Wikisource is skipped for 'ka'; the reason is not recorded in this
        # notebook -- TODO confirm whether the skip is still needed.
        if lang != 'ka':
            try:
                texts += _sample_texts(wikisource_dataset_id, configs_wikisource, lang, top_pct)
            except Exception as e:
                print(f"Error in Wikisource for {lang}: {e}")

        if not texts:
            # Nothing was loaded: do not record an empty result as a success.
            print(f"Error for language '{lang}': no texts could be loaded, skipped.")
            continue

        ngram_counters = {n: Counter() for n in ngram_range}
        for text in texts:
            for n in ngram_range:
                ngram_counters[n].update(extract_ngrams(text, lang, n))

        top_ngrams_by_n = {n: ngram_counters[n].most_common(top_n) for n in ngram_range}
        results.append({
            'lang': lang,
            'language_name': lang_name_map.get(lang, lang),
            'top_ngrams': top_ngrams_by_n
        })

        print(f"{lang}: top ngrams extracted.")
        # Rewrite the results file after every language so that an interrupted
        # run keeps what has been computed so far.
        save_results_to_file(results, '_n-gramms-2-3-4-5.txt')
    except Exception as e:
        print(f"Error for language '{lang}': {e}")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

en: top ngrams extracted.


Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

ru: top ngrams extracted.


Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

fr: top ngrams extracted.


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

de: top ngrams extracted.
zh: top ngrams extracted.
he: top ngrams extracted.
it: top ngrams extracted.
es: top ngrams extracted.
ar: top ngrams extracted.
pt: top ngrams extracted.
ko: top ngrams extracted.
hu: top ngrams extracted.
sa: top ngrams extracted.
la: top ngrams extracted.
ja: top ngrams extracted.
el: top ngrams extracted.
sv: top ngrams extracted.
nl: top ngrams extracted.
pl: top ngrams extracted.
vi: top ngrams extracted.
fa: top ngrams extracted.
no: top ngrams extracted.
tr: top ngrams extracted.
fi: top ngrams extracted.
cs: top ngrams extracted.
hy: top ngrams extracted.
da: top ngrams extracted.
bn: top ngrams extracted.
az: top ngrams extracted.
ka: top ngrams extracted.
hi: top ngrams extracted.
id: top ngrams extracted.
