# Unigram Models

In [None]:
from pathlib import Path

import numpy as np
from tqdm import tqdm
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
@tf.function
def count_tokens_dataset(ds, vocab_size):
  """Sum per-token occurrence counts of the 'story' feature over a dataset.

  Args:
    ds: Dataset (anything exposing `map` and `reduce` like `tf.data.Dataset`)
      whose elements are mappings with a 'story' entry of integer token ids.
      The entry may have any shape; it is flattened before counting.
    vocab_size: Number of count bins, i.e. the length of the returned vector.

  Returns:
    A 1-D int64 tensor in which entry `i` is the number of times token id `i`
    occurs across all elements of `ds`.

  Note:
    `minlength` only pads the histogram up to `vocab_size`; it does not cap it.
    NOTE(review): presumably every token id is < `vocab_size` -- otherwise an
    element's histogram would be longer than the accumulator. Verify against
    the tokenizer configuration.
  """

  def _histogram(example):
    # Flatten so the count ignores batch/sequence structure.
    token_ids = tf.reshape(example['story'], [-1])
    token_ids = tf.cast(token_ids, tf.int32)
    return tf.math.bincount(token_ids, minlength=vocab_size, dtype=tf.int64)

  def _accumulate(running_total, element_counts):
    return running_total + element_counts

  initial_counts = tf.zeros([vocab_size], dtype=tf.int64)
  return ds.map(_histogram).reduce(initial_counts, _accumulate)

In [None]:
# Directory that receives one unigram-count file per (language, kind) dataset.
Path('unigrams').mkdir(exist_ok=True)

languages = [
  'arabic', 'azerbaijani', 'chinese', 'english', 'farsi', 'german',
  'hebrew', 'hindi', 'korean', 'spanish', 'turkish', 'vietnamese',
]

# Vocabulary size used for each dataset kind; iteration order is
# 'vanilla' first, then 'multi'.
vocab_size_by_kind = {
  'vanilla': 15000,
  'multi': 15_000 * 2 * 3,
}

for lang in tqdm(languages):
  for kind, vocab_size in vocab_size_by_kind.items():
    name = f'tokenized/{lang}_{kind}:1.0.0'
    ds_builder = tfds.builder(name)
    # Read the training split in batches of 4096 examples.
    ds = ds_builder.as_dataset(split='train', batch_size=4096)
    counts = count_tokens_dataset(ds, vocab_size).numpy()
    # Written as plain integers, one count per line (line i <-> token id i).
    np.savetxt(f'unigrams/{lang}_{kind}.txt', counts, fmt='%d')