# Environment constants

Set the constants required to run this notebook here: a Hugging Face access token and the ClickHouse user, password and host.

In [None]:
import os

# Each setting is read from the environment variable of the same name, so that
# secrets do not have to be saved inside the notebook. To hard-code a value
# instead, replace the "" fallback (the previous default) with it.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")
CLICKHOUSE_USER = os.environ.get("CLICKHOUSE_USER", "")
CLICKHOUSE_PASSWORD = os.environ.get("CLICKHOUSE_PASSWORD", "")
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "")

# Install dependencies

In [None]:
# Packages are installed with the %pip magic, one package per line.
# datasets is pinned to 3.6.0; the recorded output shows it replacing a preinstalled 4.0.0.
# NOTE(review): presumably pinned because the corpus is loaded with trust_remote_code=True — confirm.
%pip install datasets==3.6.0
%pip install huggingface_hub
%pip install sentencepiece
# transformers is upgraded to the newest release rather than pinned.
%pip install --upgrade transformers
%pip install ipywidgets
%pip install lxml
%pip install clickhouse-connect
# NOTE(review): pymongo, tensorflow and torch are not imported by the cells visible here — TODO confirm they are still needed.
%pip install pymongo
%pip install tensorflow
%pip install torch

Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0
Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m164.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found exis

In [None]:
import random
import math
import numpy as np
from tqdm import tqdm

# RANDOM_SEED fixes the pseudo-random choice of which documents are written to the test split.
RANDOM_SEED = 42
random.seed(RANDOM_SEED) # for reproducibility

# ClickHouse



In [None]:
import clickhouse_connect

# Module-level ClickHouse client, built from the constants cell and used later
# in the notebook to run the table DDL and the batch inserts.
client = clickhouse_connect.get_client(
    host=CLICKHOUSE_HOST,
    user=CLICKHOUSE_USER,
    password=CLICKHOUSE_PASSWORD,
    secure=True,
)

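Create one table per n-gram order (unigrams, bigrams and trigrams) in the database, dropping any existing table with the same name first: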

In [None]:
# Maps the n-gram order to the name of the ClickHouse table that stores its counts.
grams_map = {
    1: 'unigrams',
    2: 'bigrams',
    3: 'trigrams'
}

def table_creation_command(gram: int) -> str:
  """Build the CREATE TABLE statement for the table holding `gram`-grams.

  The table has one UInt16 column per token position (t1..t<gram>) plus a
  UInt64 `count` column, uses the SummingMergeTree engine and is ordered by
  the token columns.

  Args:
    gram: n-gram order; must be a key of `grams_map`.

  Returns:
    The SQL statement as a string.

  Raises:
    KeyError: if `gram` is not a key of `grams_map`.
  """
  tokens = [f"t{i+1}" for i in range(gram)]
  # Both joins are done outside the f-string: a backslash inside an f-string
  # replacement field is a SyntaxError before Python 3.12.
  # NOTE(review): UInt16 assumes every token id is <= 65535 — confirm against the tokenizer vocabulary.
  column_defs = ',\n'.join(f"{t} UInt16" for t in tokens)
  order_by = ','.join(tokens)
  return f"""
  CREATE TABLE {grams_map[gram]} (
    {column_defs},
    count UInt64
  )
  ENGINE = SummingMergeTree()
  ORDER BY ({order_by});
  """

def reset_or_create_table(gram: int) -> str:
  """Drop the table for `gram`-grams if it exists, then create it again, empty.

  Returns whatever `client.query` returns for the CREATE TABLE statement.
  NOTE(review): the annotation says `str` — confirm what `client.query` actually returns here.
  """
  table_name = grams_map[gram]
  client.query(f"DROP TABLE IF EXISTS {table_name}")
  create_statement = table_creation_command(gram)
  return client.query(create_statement)

# Start from empty tables for every n-gram order.
for gram in grams_map.keys():
  reset_or_create_table(gram)


# Load dataset and tokenizer

In [None]:
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer

# Authenticate against the Hugging Face Hub with the token from the constants cell
login(token=HUGGINGFACE_TOKEN)

# Open the corpus in streaming mode (records are fetched while iterating)
ds_stream = load_dataset(
    "carolina-c4ai/corpus-carolina",
    split="corpus",
    streaming=True,
    trust_remote_code=True,
)

# Load the tokenizer published in the model repository
model_path = "TucanoBR/ViTucano-1b5-v1"
tokenizer = AutoTokenizer.from_pretrained(model_path)

README.md: 0.00B [00:00, ?B/s]

corpus-carolina.py: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

In [None]:
# In-memory n-gram counters: tuple of token ids -> occurrences counted since the last flush.
unigrams, bigrams, trigrams = {}, {}, {}

In [None]:
import ctypes
import gc

def release_ram():
    """
    Run a full garbage collection, then ask the C allocator to hand free memory back to the OS.

    Best effort: if libc cannot be loaded or malloc_trim cannot be called, the
    error is printed and otherwise ignored.
    """
    # Step 1: reclaim unreachable Python objects, including reference cycles
    gc.collect()

    # Step 2: call glibc's malloc_trim through ctypes.
    # Only works where "libc.so.6" can be loaded (Linux systems such as Google Colab).
    try:
        ctypes.CDLL("libc.so.6").malloc_trim(0)
        print("RAM released via malloc_trim.")
    except Exception as err:
        print(f"Could not run malloc_trim: {err}")

In [None]:
import time

# Flush threshold: once a counter dict holds at least this many distinct n-grams it is written to ClickHouse.
BATCH = 10_000_000

def time_computation(func):
  """Call `func()` once and return how long the call took, in seconds.

  Uses `time.perf_counter()`, a monotonic high-resolution clock, so the
  measurement cannot be distorted (or go negative) if the system wall clock
  is adjusted while `func` runs; `time.time()` gives no such guarantee.

  Args:
    func: zero-argument callable; its return value is discarded.

  Returns:
    Elapsed time in seconds as a float.
  """
  start_time = time.perf_counter()
  func()
  return time.perf_counter() - start_time

def flush(counts: dict[tuple[int, ...], int], gram: int, force_release_ram: bool = False):
  """Insert the accumulated n-gram counts into their ClickHouse table and empty the dict.

  Args:
    counts: maps a tuple of `gram` token ids to its count; cleared in place
      once the insert has returned. An empty dict is a no-op.
    gram: n-gram order; selects the target table through `grams_map` and
      determines the token column names t1..t<gram>.
    force_release_ram: when True, call `release_ram()` after the insert.

  Returns:
    None.
  """
  # Nothing to write. Without this guard an empty dict crashed with
  # IndexError when the column names were derived from `ops[0]`.
  if not counts:
    return

  table = grams_map[gram]
  # One row per n-gram: the token ids followed by the count.
  ops = [[*k, c] for k, c in counts.items()]
  column_names = [f"t{i+1}" for i in range(gram)] + ['count']

  # perf_counter is monotonic, so the reported duration is not affected by wall-clock adjustments.
  start_time = time.perf_counter()
  client.insert(table, ops, column_names=column_names)
  end_time = time.perf_counter()

  counts.clear()
  # Drop the row list as well; while it is still referenced its memory
  # cannot be reclaimed by release_ram().
  del ops

  if force_release_ram:
    release_ram()

  print(f"[{table.upper()}] Flushed in {end_time-start_time}")

In [None]:
import os

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the "data" directory was already present.
os.makedirs("data", exist_ok=True)

In [None]:
# Running totals for the corpus pass: documents skipped for containing a newline, and tokens counted.
with_newline = amount_tokens = 0

In [None]:
# Expected number of records in the stream; passed to tqdm as the progress-bar total.
TOTAL_SAMPLES = 2108999 # hard-coded because the dataset does not provide this metadata

In [None]:
paths = {
    'held_out': 'held_out',
    'test': 'test',
}

# Output file for each split: data/<name>.txt
files = {}

for split in paths:
  files[split] = '/'.join(["data", paths[split]+'.txt'])

print(files)

# Reset all locally accumulated state so that re-running this cell (for example
# after an interrupted pass) does not add to the previous run's totals.
# The ClickHouse tables are NOT reset here; re-run the table-creation cell first.
cnt = 0
with_newline = 0
amount_tokens = 0
for counts in (unigrams, bigrams, trigrams):
  counts.clear()

# Dedicated generator seeded with RANDOM_SEED: the split is identical on every
# run of this cell and does not depend on other users of the global `random` state.
split_rng = random.Random(RANDOM_SEED)

with open(files['test'], 'w', encoding = 'utf-8') as test_file:
    for record in tqdm(ds_stream, total = TOTAL_SAMPLES):
        text = record['text']

        # Documents containing a newline are counted and skipped entirely
        # (presumably because the test file stores one document per line).
        if '\n' in text:
            with_newline += 1
            continue

        p = split_rng.random()

        # Documents drawn with p <= 0.1 go to the test file and are excluded
        # from the n-gram counts.
        if p <= 0.1:
            test_file.write(text + '\n')
            continue
        # NOTE: the 'held_out' split is declared in `paths` but is not written yet.

        encoding = tokenizer.encode(text)
        amount_tokens += len(encoding)

        for t in encoding:
            unigrams[(t,)] = unigrams.get((t,), 0) + 1

        for t1,t2 in zip(encoding, encoding[1:]):
            bigrams[(t1,t2)] = bigrams.get((t1,t2),0) + 1

        for t1,t2,t3 in zip(encoding, encoding[1:], encoding[2:]):
            trigrams[(t1,t2,t3)] = trigrams.get((t1,t2,t3), 0) + 1

        # Write a counter to ClickHouse as soon as it holds BATCH distinct n-grams.
        for counts, gram in [(unigrams, 1), (bigrams, 2), (trigrams, 3)]:
            if len(counts) >= BATCH:
                flush(counts, gram)

        cnt += 1


# Write whatever is left; empty counters are skipped because there is nothing to insert for them.
for counts, gram in [(unigrams, 1), (bigrams, 2), (trigrams, 3)]:
  if counts:
    flush(counts, gram)

print(f"Found {with_newline} files that contained a newline character")

{'held_out': 'data/held_out.txt', 'test': 'data/test.txt'}


  0%|          | 0/2108999 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8359 > 2048). Running this sequence through the model will result in indexing errors
  1%|          | 18061/2108999 [02:05<78:00:05,  7.45it/s]

[TRIGRAMS] Flushed in 8.551223039627075


  2%|▏         | 31965/2108999 [04:15<491:54:20,  1.17it/s]

[TRIGRAMS] Flushed in 8.49529504776001


  3%|▎         | 59098/2108999 [06:21<480:20:13,  1.19it/s]

[TRIGRAMS] Flushed in 8.435988426208496


  5%|▍         | 102600/2108999 [08:35<410:05:29,  1.36it/s]

[TRIGRAMS] Flushed in 8.360456705093384


  7%|▋         | 142105/2108999 [10:56<122:48:07,  4.45it/s]

[TRIGRAMS] Flushed in 10.018821954727173


  8%|▊         | 177581/2108999 [13:10<95:37:50,  5.61it/s]

[TRIGRAMS] Flushed in 10.17020583152771


 11%|█         | 224806/2108999 [15:45<626:58:30,  1.20s/it]

[TRIGRAMS] Flushed in 24.577122449874878


 12%|█▏        | 246248/2108999 [17:55<533:06:53,  1.03s/it]

[TRIGRAMS] Flushed in 13.505202054977417


 14%|█▎        | 287663/2108999 [20:06<121:53:39,  4.15it/s]

[BIGRAMS] Flushed in 8.207112312316895


 14%|█▍        | 298141/2108999 [20:44<19:52:00, 25.32it/s]

[TRIGRAMS] Flushed in 11.006849527359009


 16%|█▌        | 332042/2108999 [23:15<534:51:39,  1.08s/it]

[TRIGRAMS] Flushed in 10.538053035736084


 17%|█▋        | 364227/2108999 [25:40<29:02:24, 16.69it/s]

[TRIGRAMS] Flushed in 13.31982159614563


 46%|████▌     | 965165/2108999 [31:56<17:04:31, 18.61it/s]

[TRIGRAMS] Flushed in 8.292788743972778


 52%|█████▏    | 1090294/2108999 [38:43<128:42:47,  2.20it/s]

[TRIGRAMS] Flushed in 9.35853624343872


 52%|█████▏    | 1106671/2108999 [45:08<298:32:10,  1.07s/it]

[TRIGRAMS] Flushed in 9.585653305053711


 53%|█████▎    | 1112291/2108999 [49:44<2432:41:52,  8.79s/it]

[TRIGRAMS] Flushed in 18.366932153701782


 53%|█████▎    | 1112488/2108999 [54:03<2691:45:52,  9.72s/it]

[BIGRAMS] Flushed in 11.711384296417236


 53%|█████▎    | 1112494/2108999 [54:41<3077:43:05, 11.12s/it]

[TRIGRAMS] Flushed in 8.912476301193237


 53%|█████▎    | 1112680/2108999 [59:04<1940:56:57,  7.01s/it]

[TRIGRAMS] Flushed in 11.198936700820923


 53%|█████▎    | 1112907/2108999 [1:03:22<2425:10:33,  8.76s/it]

[TRIGRAMS] Flushed in 9.768375158309937


 53%|█████▎    | 1113092/2108999 [1:07:55<1659:41:22,  6.00s/it]

[TRIGRAMS] Flushed in 9.814899444580078


 53%|█████▎    | 1113305/2108999 [1:12:16<1249:31:22,  4.52s/it]

[TRIGRAMS] Flushed in 15.457991361618042


 53%|█████▎    | 1113475/2108999 [1:16:21<1601:54:16,  5.79s/it]

[TRIGRAMS] Flushed in 8.6544930934906


 53%|█████▎    | 1113665/2108999 [1:20:37<1892:24:49,  6.84s/it]

[TRIGRAMS] Flushed in 9.421194314956665


 53%|█████▎    | 1113891/2108999 [1:25:01<1809:24:36,  6.55s/it]

[TRIGRAMS] Flushed in 10.23056674003601


 53%|█████▎    | 1113933/2108999 [1:25:52<328:12:34,  1.19s/it]HTTP Error 504 thrown while requesting GET https://huggingface.co/datasets/carolina-c4ai/corpus-carolina/resolve/55e63a519393c70a48dcfa14a558499c6bb0583b/corpus/legislative_branch/LEGbp.xml.gz
Retrying in 1s [Retry 1/5].
HTTP Error 504 thrown while requesting GET https://huggingface.co/datasets/carolina-c4ai/corpus-carolina/resolve/55e63a519393c70a48dcfa14a558499c6bb0583b/corpus/legislative_branch/LEGbp.xml.gz
Retrying in 2s [Retry 2/5].
 53%|█████▎    | 1114073/2108999 [1:29:41<2381:17:23,  8.62s/it]

[TRIGRAMS] Flushed in 12.902647018432617


 53%|█████▎    | 1114308/2108999 [1:34:19<2089:49:49,  7.56s/it]

[TRIGRAMS] Flushed in 10.96242380142212


 53%|█████▎    | 1114498/2108999 [1:38:36<2039:08:58,  7.38s/it]

[TRIGRAMS] Flushed in 7.642170429229736


 53%|█████▎    | 1114682/2108999 [1:42:51<1585:18:10,  5.74s/it]

[BIGRAMS] Flushed in 7.696758031845093


 53%|█████▎    | 1114686/2108999 [1:43:22<2374:57:16,  8.60s/it]

[TRIGRAMS] Flushed in 8.88029170036316


 53%|█████▎    | 1114857/2108999 [1:47:28<2442:44:33,  8.85s/it]

[TRIGRAMS] Flushed in 10.36749815940857


 53%|█████▎    | 1115051/2108999 [1:51:40<2290:50:31,  8.30s/it]

[TRIGRAMS] Flushed in 9.316243410110474


 53%|█████▎    | 1115287/2108999 [1:55:50<1766:21:22,  6.40s/it]

[TRIGRAMS] Flushed in 8.317858934402466


 53%|█████▎    | 1115480/2108999 [2:00:02<1981:20:07,  7.18s/it]

[TRIGRAMS] Flushed in 12.553831100463867


 53%|█████▎    | 1115674/2108999 [2:04:15<1326:52:45,  4.81s/it]

[TRIGRAMS] Flushed in 7.917999505996704


 53%|█████▎    | 1115890/2108999 [2:08:37<2615:54:33,  9.48s/it]

[TRIGRAMS] Flushed in 13.679538249969482


 53%|█████▎    | 1116057/2108999 [2:12:37<1836:39:02,  6.66s/it]

[TRIGRAMS] Flushed in 9.244387865066528


 53%|█████▎    | 1122915/2108999 [2:16:12<65:59:12,  4.15it/s]

[TRIGRAMS] Flushed in 13.92835521697998


 54%|█████▍    | 1139259/2108999 [2:17:06<46:37, 346.60it/s]HTTP Error 504 thrown while requesting GET https://huggingface.co/datasets/carolina-c4ai/corpus-carolina/resolve/55e63a519393c70a48dcfa14a558499c6bb0583b/corpus/university_domains/UNIa.xml.gz
Retrying in 1s [Retry 1/5].
 55%|█████▌    | 1161548/2108999 [2:18:44<109:44:44,  2.40it/s]

[BIGRAMS] Flushed in 10.924925804138184


 55%|█████▌    | 1162167/2108999 [2:19:12<74:00:39,  3.55it/s] 

[TRIGRAMS] Flushed in 10.304357051849365


 57%|█████▋    | 1195573/2108999 [2:21:10<56:25:06,  4.50it/s]

[TRIGRAMS] Flushed in 11.689499616622925


 58%|█████▊    | 1226081/2108999 [2:23:16<72:34:09,  3.38it/s]

[TRIGRAMS] Flushed in 21.775282859802246


 59%|█████▉    | 1252085/2108999 [2:25:13<42:41:56,  5.57it/s]

[TRIGRAMS] Flushed in 10.103196144104004


 61%|██████▏   | 1292264/2108999 [2:27:19<46:48:13,  4.85it/s]

[TRIGRAMS] Flushed in 9.250010967254639


 63%|██████▎   | 1324136/2108999 [2:29:21<48:20:56,  4.51it/s]

[TRIGRAMS] Flushed in 13.173235654830933


 64%|██████▎   | 1340283/2108999 [2:30:34<67:54:37,  3.14it/s]

[BIGRAMS] Flushed in 7.458563327789307


 64%|██████▍   | 1356367/2108999 [2:31:58<60:51:31,  3.44it/s]

[TRIGRAMS] Flushed in 10.745183229446411


 66%|██████▌   | 1382878/2108999 [2:33:49<53:01:36,  3.80it/s]

[TRIGRAMS] Flushed in 10.392333269119263


 67%|██████▋   | 1413608/2108999 [2:35:49<36:35:45,  5.28it/s]

[TRIGRAMS] Flushed in 9.519314289093018


 68%|██████▊   | 1439752/2108999 [2:37:46<58:32:44,  3.18it/s]

[TRIGRAMS] Flushed in 9.539661169052124


 70%|██████▉   | 1473495/2108999 [2:39:48<39:06:39,  4.51it/s]

[TRIGRAMS] Flushed in 8.27842402458191


 71%|███████▏  | 1504072/2108999 [2:41:43<78:17:55,  2.15it/s]

[BIGRAMS] Flushed in 7.033601522445679


 71%|███████▏  | 1507907/2108999 [2:42:23<29:11:55,  5.72it/s]

[TRIGRAMS] Flushed in 11.125389337539673


 73%|███████▎  | 1547353/2108999 [2:44:46<41:13:34,  3.78it/s]

[TRIGRAMS] Flushed in 24.181519746780396


 75%|███████▍  | 1579645/2108999 [2:47:05<23:24:11,  6.28it/s]

[TRIGRAMS] Flushed in 13.150054931640625


 77%|███████▋  | 1613541/2108999 [2:49:22<36:25:17,  3.78it/s]

[TRIGRAMS] Flushed in 9.928622007369995


 78%|███████▊  | 1644424/2108999 [2:51:39<45:58:47,  2.81it/s]

[TRIGRAMS] Flushed in 12.385156869888306


 79%|███████▉  | 1670616/2108999 [2:53:56<32:10:34,  3.78it/s]

[TRIGRAMS] Flushed in 13.619656562805176


 79%|███████▉  | 1676193/2108999 [2:55:06<68:35:48,  1.75it/s]

[BIGRAMS] Flushed in 9.163840770721436


 80%|████████  | 1690504/2108999 [2:56:33<23:28:37,  4.95it/s]

[TRIGRAMS] Flushed in 9.235555410385132


 81%|████████▏ | 1716396/2108999 [2:58:43<25:18:55,  4.31it/s]

[TRIGRAMS] Flushed in 9.35489821434021


 83%|████████▎ | 1754138/2108999 [3:01:07<21:37:49,  4.56it/s]

[TRIGRAMS] Flushed in 11.821866750717163


 85%|████████▍ | 1783355/2108999 [3:03:27<18:40:55,  4.84it/s]

[TRIGRAMS] Flushed in 10.117682456970215


 86%|████████▌ | 1816276/2108999 [3:05:50<23:48:21,  3.42it/s]

[TRIGRAMS] Flushed in 9.677614450454712


 87%|████████▋ | 1838445/2108999 [3:07:49<21:32:53,  3.49it/s]

[BIGRAMS] Flushed in 8.880490779876709


 87%|████████▋ | 1842126/2108999 [3:08:36<15:42:57,  4.72it/s]

[TRIGRAMS] Flushed in 9.770225048065186


 89%|████████▊ | 1868086/2108999 [3:10:50<10:32:47,  6.35it/s]

[TRIGRAMS] Flushed in 11.759564399719238


 90%|█████████ | 1903350/2108999 [3:13:05<21:45:27,  2.63it/s]

[TRIGRAMS] Flushed in 8.060620069503784


 92%|█████████▏| 1931044/2108999 [3:15:23<15:17:22,  3.23it/s]

[TRIGRAMS] Flushed in 11.244827508926392


 93%|█████████▎| 1966697/2108999 [3:17:47<28:16:05,  1.40it/s]

[TRIGRAMS] Flushed in 9.584753036499023


 95%|█████████▍| 1995573/2108999 [3:20:10<7:32:52,  4.17it/s]

[TRIGRAMS] Flushed in 11.404606103897095


 95%|█████████▌| 2004421/2108999 [3:21:18<9:20:57,  3.11it/s] 

[BIGRAMS] Flushed in 12.300870418548584


 96%|█████████▌| 2027998/2108999 [3:23:04<2:19:20,  9.69it/s]

[TRIGRAMS] Flushed in 10.489047288894653


 98%|█████████▊| 2071294/2108999 [3:25:39<2:46:31,  3.77it/s]

[TRIGRAMS] Flushed in 9.163848876953125


100%|█████████▉| 2098570/2108999 [3:28:14<2:00:39,  1.44it/s]

[TRIGRAMS] Flushed in 24.835835933685303


100%|██████████| 2108999/2108999 [3:28:54<00:00, 168.25it/s]


[UNIGRAMS] Flushed in 2.0995469093322754
[BIGRAMS] Flushed in 8.120527744293213
[TRIGRAMS] Flushed in 3.4668264389038086
Found 72 files that contained a newline character
