# Multilingual TinyStories

This notebook builds the dataset published at [https://huggingface.co/datasets/Gabrui/multilingual\_TinyStories](https://huggingface.co/datasets/Gabrui/multilingual_TinyStories) from the source repositories listed below.

It is provided here for archival purposes only.


We perform data cleaning, standardization, and amalgamation on the following repositories:

- https://huggingface.co/datasets/roneneldan/TinyStories
- https://huggingface.co/datasets/robrenaud/multilingual_tinystories
- https://huggingface.co/datasets/52AI/TinyStoriesZh
- https://huggingface.co/datasets/umarigan/tinystories_tr
- https://huggingface.co/datasets/sboughorbel/tinystories_dataset_arabic
- https://huggingface.co/datasets/nampdn-ai/tinystories-vietnamese
- https://huggingface.co/datasets/marinowskiii/tiny-stories-aze
- https://huggingface.co/datasets/g0ster/TinyStories-Korean
- https://huggingface.co/datasets/SkySyrup/tinystories_german
- https://huggingface.co/datasets/taesiri/TinyStories-Farsi
- https://huggingface.co/datasets/vishnu2308/TinyStories-50k-Hindi
- https://huggingface.co/datasets/Norod78/TinyStoriesV2-GPT4-valid_heb-lineByLine-EoT

In [None]:
import io
import re
import json
import pickle
import tarfile
import unicodedata
from pathlib import Path
from itertools import groupby
from multiprocessing import Pool
from collections import Counter, defaultdict, deque

import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk

In [None]:
# Root directory for downloaded archives, caches and the cleaned per-language datasets.
data_path = Path('~/datasets/tinystories/').expanduser()
N_PROCS = 16 # Processes spawn for preprocessing steps
# Per-batch cap on how many story indices are remembered for each character.
# It must be at least as large as the biggest rarity threshold used in the
# language sections (410, German); otherwise indices of stories containing a
# rare character can fall out of the bounded deque and escape the exclusion.
MAX_LEN = 512

# Canonical character -> look-alike variants that are rewritten to it.
grouping = {
  "e": ["é"],
  "'": ["'", "‘", "’", "`", "´"],
  '"': ['"', '“', '”', '„', '«', '»'],
  '-': ["-", "–", "—", "−", "‑", "‐"],
  ' ': ["\xa0", "\t", "\u200b", "\u200a", "\u2009", "\u2028", "\u2029", "\ufeff", "\u3000"],
  '': ['\xad']
}
# Inverted view of `grouping` (variant -> canonical) in the form str.translate expects.
translation_table = str.maketrans({v: k for k, vv in grouping.items() for v in vv})
# Strings made only of ASCII letters, digits, whitespace and this punctuation count as "common".
common_patterns = r'^[a-zA-Z0-9\s!$%^&*()+\-=;:\'",.?/]*$'

def is_common(s):
  """Return True when `s` consists solely of characters allowed by `common_patterns`."""
  found = re.match(common_patterns, s)
  return found is not None

def load_dataset_from_tar_gz(tar_gz_file, file_numbers=None):
  """Load the numbered ``data<N>.json`` members of a gzipped tar archive into one Dataset.

  Args:
    tar_gz_file: Path of the ``.tar.gz`` archive to read.
    file_numbers: Optional container of ints; when given, only members whose
      number is in it are loaded.

  Returns:
    The per-file datasets concatenated in ascending file-number order.

  Raises:
    ValueError: If the archive contains no matching member.
  """
  pattern = re.compile(r'data(\d+)\.json')
  parts = {}
  with tarfile.open(tar_gz_file, 'r:gz') as tar:
    for member in tqdm(tar.getmembers()):
      if not member.isfile() or not member.name.endswith('.json'):
        continue
      # Match on the base name: str.lstrip('./') removes a *set of characters*,
      # not a prefix, and would leave any directory component in place.
      filename = Path(member.name).name
      match = pattern.match(filename)
      if match is None:
        # Some other .json file in the archive; skip it instead of crashing on .group().
        continue
      file_number = int(match.group(1))
      if file_numbers is not None and file_number not in file_numbers:
        continue
      with tar.extractfile(member) as file_obj:
        # Key by number first so ordering is numeric (data2 before data10).
        parts[(file_number, member.name)] = Dataset.from_list(json.load(file_obj))
  if not parts:
    raise ValueError(f'No data<N>.json members found in {tar_gz_file}')
  return concatenate_datasets([parts[key] for key in sorted(parts)])

def normalize(text):
  """Apply Unicode NFKC normalization, then map look-alike characters via `translation_table`."""
  composed = unicodedata.normalize('NFKC', text)
  return composed.translate(translation_table)

def update_counters(counter, idxs, idx, data_list):
  """Add the item counts of `data_list` to `counter` and log `idx` under every distinct item.

  Args:
    counter: Counter accumulating totals; updated in place.
    idxs: Mapping from item to an appendable container of indices; updated in place.
    idx: Index recorded once for each distinct item of `data_list`.
    data_list: Iterable of hashable items (a string yields its characters).
  """
  seen = Counter(data_list)
  counter.update(seen)
  for item in seen:
    idxs[item].append(idx)

def examine_charset(samples, ids):
  """Count the characters of a batch of stories and remember where each one occurs.

  Args:
    samples: Batch mapping whose 'story' entry is a sequence of story strings.
    ids: Indices paired positionally with the stories.

  Returns:
    A single-row batch: 'counter' holds the pickled character Counter and
    'idxs' the pickled dict of character -> deque of story indices (each deque
    keeps at most MAX_LEN entries).
  """
  counter = Counter()
  idxs = defaultdict(lambda: deque(maxlen=MAX_LEN))
  for story_id, raw_story in zip(ids, samples['story']):
    update_counters(counter, idxs, story_id, normalize(raw_story))
  # Convert to a plain dict first: the defaultdict's lambda factory cannot be pickled.
  plain_idxs = dict(idxs)
  return {'counter': [pickle.dumps(counter)], 'idxs': [pickle.dumps(plain_idxs)]}

def merge_results(dataset):
  """Combine the pickled per-batch results stored in `dataset` into global statistics.

  Args:
    dataset: Object whose 'counter' and 'idxs' entries are parallel sequences
      of pickled Counters and pickled index mappings.

  Returns:
    (counter, idxs): the summed Counter and a defaultdict(list) mapping each
    key to the concatenation of its per-batch index collections.
  """
  counter = Counter()
  idxs = defaultdict(list)
  for raw_counter, raw_idxs in zip(dataset['counter'], dataset['idxs']):
    # These blobs are only ever produced by this notebook; do not feed untrusted pickles here.
    batch_counter = pickle.loads(raw_counter)
    batch_idxs = pickle.loads(raw_idxs)
    counter.update(batch_counter)
    for key, positions in batch_idxs.items():
      idxs[key].extend(positions)
  return counter, idxs

def show_stories(char, char_cnt, char_whe, dataset):
  """Print the count of `char`, then up to ten stories recorded as containing it.

  Args:
    char: Character to inspect.
    char_cnt: Mapping from character to its count.
    char_whe: Mapping from character to a sliceable list of story indices.
    dataset: Indexable collection whose rows expose a 'story' entry.
  """
  print(char_cnt[char])
  separator = '\n\n-----------------\n\n'
  first_hits = char_whe[char][:10]
  for story_idx in first_hits:
    print(dataset[story_idx]['story'], end=separator)

def concatenate_sections(data):
  """Join the lines between '<|endoftext|>' markers into one story each.

  Args:
    data: Iterable of text lines in which '<|endoftext|>' separates stories.

  Returns:
    A Dataset with a single 'story' column; empty lines are dropped and the
    remaining lines of each story are joined with single spaces.
  """
  stories = []
  for is_separator, lines in groupby(data, lambda x: x == '<|endoftext|>'):
    if is_separator:
      continue
    stories.append(' '.join(line for line in lines if line))
  return Dataset.from_dict({'story': stories})

def frac_split(qtd):
  """Return the test fraction for a dataset of `qtd` rows: 15 * qtd**0.6 rows, as a share of qtd."""
  test_rows = 15 * qtd ** 0.6
  return test_rows / qtd

## English

In [None]:
# !wget https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz -P ~/datasets/tinystories/

In [None]:
# Reuse the copy saved by an earlier run; otherwise parse the tarball once and save it.
english_cache = data_path / 'english'
try:
  all_english_data = load_from_disk(english_cache)
except FileNotFoundError:
  all_english_data = load_dataset_from_tar_gz(data_path / 'TinyStories_all_data.tar.gz')
  all_english_data.save_to_disk(english_cache)

In [None]:
# Keep only the rows whose 'source' field equals 'GPT-4'.
gpt4_english = all_english_data.filter(
  lambda s: s['source']=='GPT-4',
  keep_in_memory=True,
  num_proc=N_PROCS,
)

In [None]:
# Character statistics of the GPT-4 English stories: total counts and the story indices per character.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(gpt4_english.select_columns('story').map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Display the character counts to choose a rarity threshold by eye.
char_cnt

In [None]:
# Inspect the count of one character and the first stories recorded for it,
# through the shared helper rather than a hand-copied loop.
c = '#'
show_stories(c, char_cnt, char_whe, gpt4_english)

# {k: unicodedata.name(k, '') for k in char_cnt.keys()}

In [None]:
# Indices of stories containing a character that occurs fewer than 200 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 200 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
english_clean = gpt4_english.select_columns('story').map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='english_clean')

In [None]:
# Recompute the character statistics on the cleaned data to check the result.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt2, char_whe2 = merge_results(english_clean.map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Display the post-cleaning character counts.
char_cnt2

In [None]:
# Remove exact duplicate stories; select_columns drops the index column from_pandas adds.
english_df = pd.DataFrame(english_clean).drop_duplicates(subset='story')
english_clean = Dataset.from_pandas(english_df).select_columns('story')
del english_df

In [None]:
# Persist the cleaned English stories; the aggregation section reloads 'english_clean' from data_path.
english_clean.save_to_disk(data_path / 'english_clean')

## Spanish

In [None]:
# Fetch the Spanish source from its auto-converted parquet revision.
spanish_data = load_dataset(
  'robrenaud/multilingual_tinystories',
  revision="refs/convert/parquet",
  cache_dir=str(data_path / 'cache'),
)

In [None]:
# Character statistics of the Spanish train split.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(spanish_data['train'].select_columns('story').map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Display the character counts; the commented variants show Unicode category or name instead.
char_cnt
# {k: unicodedata.category(k) for k in char_cnt.keys()} # {k: unicodedata.name(k, '') for k in char_cnt.keys()}

In [None]:
# Spot-check: print the count and first stories recorded for the Cyrillic letter 'л'.
show_stories('л', char_cnt, char_whe, spanish_data['train'])

In [None]:
# Indices of stories containing a character that occurs fewer than 200 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 200 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
spanish_clean = spanish_data['train'].select_columns('story').map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='spanish_clean')

In [None]:
# Persist the cleaned Spanish stories; the aggregation section reloads 'spanish_clean' from data_path.
spanish_clean.save_to_disk(data_path / 'spanish_clean')

In [None]:
# df = pd.DataFrame(spanish_clean)
# df_deduplicated = df.drop_duplicates(subset='story')
# assert len(df_deduplicated) == len(df)
# del df
# del df_deduplicated

## Chinese

In [None]:
!wget https://huggingface.co/datasets/52AI/TinyStoriesZh/resolve/main/TinyStories_all_data_zh_1M.tar.gz -P $data_path
!wget https://huggingface.co/datasets/52AI/TinyStoriesZh/resolve/main/TinyStories_all_data_zh_2M.tar.gz -P $data_path

In [None]:
# Reuse the copy saved by an earlier run (same pattern as the English section);
# only parse the two tarballs when no saved copy exists yet.
try:
  all_chinese_data = load_from_disk(data_path / 'chinese')
except FileNotFoundError:
  all_chinese_data = concatenate_datasets([load_dataset_from_tar_gz(data_path / 'TinyStories_all_data_zh_1M.tar.gz'),
                                           load_dataset_from_tar_gz(data_path / 'TinyStories_all_data_zh_2M.tar.gz')])
  all_chinese_data.save_to_disk(data_path / 'chinese')
  # Reload so later cells work on the saved copy, as the original cell did.
  all_chinese_data = load_from_disk(data_path / 'chinese')

In [None]:
# Character statistics of the Chinese stories.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(all_chinese_data.select_columns('story').map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# char_cnt.most_common()[-300:]

In [None]:
# Indices of stories containing a character that occurs fewer than 100 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 100 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
chinese_clean = all_chinese_data.select_columns('story').map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='chinese_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
chinese_df = pd.DataFrame(chinese_clean).drop_duplicates(subset='story')
print(len(chinese_clean) - len(chinese_df))
chinese_clean = Dataset.from_pandas(chinese_df).select_columns('story')
del chinese_df

In [None]:
# Persist the cleaned Chinese stories; the aggregation section reloads 'chinese_clean' from data_path.
chinese_clean.save_to_disk(data_path / 'chinese_clean')

## Turkish

In [None]:
# Fetch the Turkish source dataset into the shared cache directory.
turkish_data = load_dataset(
  'umarigan/tinystories_tr',
  cache_dir=str(data_path / 'cache'),
)

In [None]:
# Character statistics of the Turkish train split.
# select_columns("story") matches the other sections: the batched map returns one
# row per batch, which only works when no other column is carried along.
char_cnt, char_whe = merge_results(turkish_data['train'].rename_column("text", "story").select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Display the character counts to choose a rarity threshold by eye.
char_cnt

In [None]:
# Indices of stories containing a character that occurs fewer than 80 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 80 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# select_columns("story") matches the other sections: the map changes the number
# of rows, which only works when 'story' is the sole column.
turkish_clean = turkish_data['train'].rename_column("text", "story").select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='turkish_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
turkish_df = pd.DataFrame(turkish_clean).drop_duplicates(subset='story')
print(len(turkish_clean) - len(turkish_df))
turkish_clean = Dataset.from_pandas(turkish_df).select_columns('story')
del turkish_df

In [None]:
# Persist the cleaned Turkish stories; the aggregation section reloads 'turkish_clean' from data_path.
turkish_clean.save_to_disk(data_path / 'turkish_clean')

## Arabic

In [None]:
# Fetch the Arabic source from its auto-converted parquet revision.
arabic_data = load_dataset(
  'sboughorbel/tinystories_dataset_arabic',
  revision="refs/convert/parquet",
  cache_dir=str(data_path / 'cache'),
)

In [None]:
# Character statistics of the Arabic 'translation' column (renamed to 'story').
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(arabic_data['train'].rename_column("translation", "story").select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Per character, most frequent first: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()}

In [None]:
# Indices of stories containing a character that occurs fewer than 100 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 100 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
arabic_clean = arabic_data['train'].rename_column("translation", "story").select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='arabic_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
# drop_duplicates(inplace=True) returns None, so its result must not be assigned;
# the non-inplace form returns the deduplicated frame.
df = pd.DataFrame(arabic_clean).drop_duplicates(subset='story')
print(len(arabic_clean) - len(df))
arabic_clean = Dataset.from_pandas(df).select_columns('story')
# Only `df` exists in this cell; there is no df_deduplicated to delete.
del df

In [None]:
# Persist the cleaned Arabic stories; the aggregation section reloads 'arabic_clean' from data_path.
arabic_clean.save_to_disk(data_path / 'arabic_clean')

## Vietnamese

In [None]:
# Fetch the Vietnamese source dataset into the shared cache directory.
vietnamese_data = load_dataset(
  'nampdn-ai/tinystories-vietnamese',
  cache_dir=str(data_path / 'cache'),
)

In [None]:
# Character statistics of the Vietnamese 'vi' column (renamed to 'story').
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(vietnamese_data['train'].rename_column("vi", "story").select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Per character, most frequent first: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()}

In [None]:
# Indices of stories containing a character that occurs fewer than 100 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 100 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
vietnamese_clean = vietnamese_data['train'].rename_column("vi", "story").select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='vietnamese_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
vietnamese_df = pd.DataFrame(vietnamese_clean).drop_duplicates(subset='story')
print(len(vietnamese_clean) - len(vietnamese_df))
vietnamese_clean = Dataset.from_pandas(vietnamese_df).select_columns('story')
del vietnamese_df

In [None]:
# Persist the cleaned Vietnamese stories; the aggregation section reloads 'vietnamese_clean' from data_path.
vietnamese_clean.save_to_disk(data_path / 'vietnamese_clean')

## Azerbaijani

In [None]:
# Fetch the Azerbaijani source dataset into the shared cache directory.
azerbaijani_data = load_dataset(
  'marinowskiii/tiny-stories-aze',
  cache_dir=str(data_path / 'cache'),
)

In [None]:
# Character statistics of the Azerbaijani 'text' column (renamed to 'story').
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(azerbaijani_data['train'].rename_column("text", "story").select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Per character, most frequent first: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()}

In [None]:
# Indices of stories containing a character that occurs fewer than 100 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 100 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
azerbaijani_clean = azerbaijani_data['train'].rename_column("text", "story").select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='azerbaijani_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
azerbaijani_df = pd.DataFrame(azerbaijani_clean).drop_duplicates(subset='story')
print(len(azerbaijani_clean) - len(azerbaijani_df))
azerbaijani_clean = Dataset.from_pandas(azerbaijani_df).select_columns('story')
del azerbaijani_df

In [None]:
# Persist the cleaned Azerbaijani stories; the aggregation section reloads 'azerbaijani_clean' from data_path.
azerbaijani_clean.save_to_disk(data_path / 'azerbaijani_clean')

## Korean

In [None]:
# Fetch the Korean source from its auto-converted parquet revision.
korean_data = load_dataset(
  'g0ster/TinyStories-Korean',
  revision="refs/convert/parquet",
  cache_dir=str(data_path / 'cache'),
)

In [None]:
# Rebuild whole stories from the line-level 'text' rows of both splits, train first.
korean_sections = [concatenate_sections(korean_data[split]['text'])
                   for split in ('train', 'validation')]
korean_data = concatenate_datasets(korean_sections)

In [None]:
# Save the reassembled Korean stories so the next cell can reload them from disk.
korean_data.save_to_disk(data_path / 'korean')

In [None]:
# Reload the reassembled Korean stories from the copy saved under data_path / 'korean'.
korean_data = load_from_disk(data_path / 'korean')

In [None]:
# Character statistics of the Korean stories.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(korean_data.select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# The 900 least frequent characters: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()[-900:]}

In [None]:
# Indices of stories containing a character that occurs fewer than 55 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 55 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
korean_clean = korean_data.select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='korean_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
korean_df = pd.DataFrame(korean_clean).drop_duplicates(subset='story')
print(len(korean_clean) - len(korean_df))
korean_clean = Dataset.from_pandas(korean_df).select_columns('story')
del korean_df

In [None]:
# Persist the cleaned Korean stories; the aggregation section reloads 'korean_clean' from data_path.
korean_clean.save_to_disk(data_path / 'korean_clean')

## German

In [None]:
!wget https://huggingface.co/datasets/SkySyrup/tinystories_german/resolve/main/german_GEMINI_async-combined -P $data_path

In [None]:
# Fetch the German repository's auto-converted parquet revision.
german_data_trans = load_dataset(
  'SkySyrup/tinystories_german',
  revision="refs/convert/parquet",
  cache_dir=str(data_path / 'cache'),
)

In [None]:
# Read the downloaded text file line by line and rebuild one story per
# '<|endoftext|>'-delimited section. The encoding is given explicitly so the
# German text is decoded the same way regardless of the platform's default.
with (data_path/'german_GEMINI_async-combined').open('r', encoding='utf-8') as f:
  german_data = concatenate_sections([line.strip() for line in f])

In [None]:
# Append the stories rebuilt from the parquet 'train' split to those read from the text file.
# Note: german_data is reassigned from itself, so re-running this cell appends the split again.
german_data = concatenate_datasets([german_data, concatenate_sections(german_data_trans['train']['text'])])

In [None]:
# Character statistics of the combined German stories.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(german_data.select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Per character, most frequent first: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()}

In [None]:
# Indices of stories containing a character that occurs fewer than 410 times
# and is not matched by is_common.
# NOTE(review): char_whe only holds up to MAX_LEN indices per character per
# batch, so this threshold should not exceed MAX_LEN or some stories may be missed.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 410 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
german_clean = german_data.select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='german_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
german_df = pd.DataFrame(german_clean).drop_duplicates(subset='story')
print(len(german_clean) - len(german_df))
german_clean = Dataset.from_pandas(german_df).select_columns('story')
del german_df

In [None]:
# Persist the cleaned German stories; the aggregation section reloads 'german_clean' from data_path.
german_clean.save_to_disk(data_path / 'german_clean')

## Farsi

In [None]:
# Fetch the Farsi source, merge its train and validation splits, and keep only
# the 'Persian' column under the common name 'story'.
farsi_splits = load_dataset(
  'taesiri/TinyStories-Farsi',
  revision="refs/convert/parquet",
  cache_dir=str(data_path / 'cache'),
)
farsi_merged = concatenate_datasets([farsi_splits['train'], farsi_splits['validation']])
farsi_data = farsi_merged.select_columns('Persian').rename_column('Persian', 'story')

In [None]:
# Character statistics of the Farsi stories.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(farsi_data.select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Per character, most frequent first: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()}

In [None]:
# Indices of stories containing a character that occurs fewer than 49 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 49 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
farsi_clean = farsi_data.select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='farsi_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
farsi_df = pd.DataFrame(farsi_clean).drop_duplicates(subset='story')
print(len(farsi_clean) - len(farsi_df))
farsi_clean = Dataset.from_pandas(farsi_df).select_columns('story')
del farsi_df

In [None]:
# Persist the cleaned Farsi stories; the aggregation section reloads 'farsi_clean' from data_path.
farsi_clean.save_to_disk(data_path / 'farsi_clean')

## Hindi

In [None]:
# Fetch the Hindi source and expose its train split's 'hindi_text' column as 'story'.
hindi_raw = load_dataset(
  'vishnu2308/TinyStories-50k-Hindi',
  cache_dir=str(data_path / 'cache'),
)
hindi_data = hindi_raw['train'].rename_column('hindi_text', 'story')

In [None]:
# Character statistics of the Hindi stories.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(hindi_data.select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Per character, most frequent first: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()}

In [None]:
# Indices of stories containing a character that occurs fewer than 21 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 21 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
hindi_clean = hindi_data.select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='hindi_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
hindi_df = pd.DataFrame(hindi_clean).drop_duplicates(subset='story')
print(len(hindi_clean) - len(hindi_df))
hindi_clean = Dataset.from_pandas(hindi_df).select_columns('story')
del hindi_df

In [None]:
# Persist the cleaned Hindi stories; the aggregation section reloads 'hindi_clean' from data_path.
hindi_clean.save_to_disk(data_path / 'hindi_clean')

## Hebrew

In [None]:
# Fetch the Hebrew source and expose its validation split's 'text' column as 'story'.
# NOTE(review): the repository name suggests line-by-line text with end-of-text
# markers, yet unlike the Korean and German sections the rows are not passed
# through concatenate_sections -- confirm each row is a whole story.
hebrew_raw = load_dataset(
  'Norod78/TinyStoriesV2-GPT4-valid_heb-lineByLine-EoT',
  revision="refs/convert/parquet",
  cache_dir=str(data_path / 'cache'),
)
hebrew_data = hebrew_raw['validation'].rename_column('text', 'story')

In [None]:
# Character statistics of the Hebrew stories.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
char_cnt, char_whe = merge_results(hebrew_data.select_columns("story").map(examine_charset,
              with_indices=True, batched=True, batch_size=20_000, num_proc=N_PROCS, keep_in_memory=True, remove_columns='story'))

In [None]:
# Per character, most frequent first: (count, Unicode name, code point).
{k: (v, unicodedata.name(k, ''), ord(k)) for k, v in char_cnt.most_common()}

In [None]:
# Indices of stories containing a character that occurs fewer than 30 times
# and is not matched by is_common.
excluded_idx = {
  story_idx
  for char, ctn in char_cnt.items()
  if ctn < 30 and not is_common(char)
  for story_idx in char_whe[char]
}
len(excluded_idx)

In [None]:
# Drop the stories listed in excluded_idx and normalize the text of the rest.
# Uses the shared N_PROCS setting instead of a second hardcoded process count.
hebrew_clean = hebrew_data.select_columns("story").map(
              lambda stories, ids: {'story': [normalize(s) for s, id in zip(stories, ids) if id not in excluded_idx]},
              input_columns='story', with_indices=True, batched=True, num_proc=N_PROCS, new_fingerprint='hebrew_clean')

In [None]:
# Remove exact duplicate stories and report how many were dropped.
hebrew_df = pd.DataFrame(hebrew_clean).drop_duplicates(subset='story')
print(len(hebrew_clean) - len(hebrew_df))
hebrew_clean = Dataset.from_pandas(hebrew_df).select_columns('story')
del hebrew_df

In [None]:
# Persist the cleaned Hebrew stories; the aggregation section reloads 'hebrew_clean' from data_path.
hebrew_clean.save_to_disk(data_path / 'hebrew_clean')

## Dataset Aggregation

In [None]:
# Languages to publish, in upload order; each has a '<lang>_clean' directory under data_path.
languages = [
  'english', 'spanish', 'chinese', 'turkish', 'arabic', 'vietnamese',
  'azerbaijani', 'korean', 'german', 'farsi', 'hindi', 'hebrew',
]
datasets = []
for lang in languages:
  datasets.append(load_from_disk(data_path / f'{lang}_clean'))

In [None]:
# Split every language into train/test (test share given by frac_split) and
# upload it as its own configuration of the hub dataset.
# tqdm wraps the whole zip: wrapping only `datasets` meant zip stopped on
# `languages` first and never resumed the tqdm iterator after the last item,
# leaving the progress bar one step short of completion.
for lang, dset in tqdm(zip(languages, datasets), total=len(languages)):
  dset_split = dset.train_test_split(frac_split(len(dset)), shuffle=True, seed=42)
  dset_split.push_to_hub('Gabrui/multilingual_TinyStories', lang, commit_message=f'Add {lang} language')