# Environment constants

Set the constants required to run this notebook (the Hugging Face token and the ClickHouse credentials) in the cell below.

In [None]:
import os

# Credentials are read from environment variables so that secrets do not have
# to be typed into (and saved with) the notebook. A variable that is not set
# falls back to the empty string, which was the previous placeholder value.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")
CLICKHOUSE_USER = os.environ.get("CLICKHOUSE_USER", "")
CLICKHOUSE_PASSWORD = os.environ.get("CLICKHOUSE_PASSWORD", "")
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "")

# Install dependencies

In [None]:
%pip install datasets==3.6.0
%pip install huggingface_hub
%pip install sentencepiece
%pip install --upgrade transformers
%pip install ipywidgets
%pip install lxml
%pip install clickhouse-connect
%pip install pymongo
%pip install tensorflow
%pip install torch

In [None]:
import random
import math
import numpy as np
from tqdm import tqdm

# Fixed seed for Python's `random` module, so that every run draws the same
# sequence of random numbers.
RANDOM_SEED = 42
random.seed(a=RANDOM_SEED)

# ClickHouse



In [None]:
import clickhouse_connect

# Shared ClickHouse client used by every cell that talks to the database.
# `secure=True` asks clickhouse-connect for an encrypted connection.
client = clickhouse_connect.get_client(
    secure=True,
    host=CLICKHOUSE_HOST,
    user=CLICKHOUSE_USER,
    password=CLICKHOUSE_PASSWORD,
)

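Create the n-gram tables in the database. Any existing table with the same name is dropped first, so running this cell discards previously stored counts: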

In [None]:
# n-gram order (1, 2, 3) -> name of the ClickHouse table that stores its counts.
grams_map = dict(enumerate(('unigrams', 'bigrams', 'trigrams'), start=1))

def table_creation_command(gram: int) -> str:
  """Build the `CREATE TABLE` statement for the table that stores `gram`-grams.

  The table gets one `UInt16` column per token position (`t1` .. `t<gram>`)
  plus a `UInt64` `count` column. It uses the `SummingMergeTree` engine and is
  ordered by the token columns.

  Args:
    gram: n-gram order; must be a key of `grams_map`.

  Returns:
    The SQL statement as a string.

  Raises:
    KeyError: if `gram` is not a key of `grams_map`.
  """
  tokens = [f"t{i+1}" for i in range(gram)]
  # Joined outside the f-string: a backslash inside an f-string replacement
  # field is a SyntaxError on Python versions older than 3.12.
  column_definitions = ',\n'.join(f"{t} UInt16" for t in tokens)
  order_by = ','.join(tokens)
  return f"""
  CREATE TABLE {grams_map[gram]} (
    {column_definitions},
    count UInt64
  )
  ENGINE = SummingMergeTree()
  ORDER BY ({order_by});
  """

def reset_or_create_table(gram: int) -> str:
  """Drop the table for `gram`-grams if it exists, then create it again, empty.

  Both statements are sent through `client.query`. The value returned is
  whatever `client.query` returns for the `CREATE TABLE` statement.
  """
  table_name = grams_map[gram]
  client.query(f"DROP TABLE IF EXISTS {table_name}")
  create_statement = table_creation_command(gram)
  return client.query(create_statement)

# Start from empty tables: every table listed in `grams_map` is dropped and
# re-created, so counts stored by a previous run are discarded.
for gram in grams_map.keys():
  reset_or_create_table(gram)


# Load dataset and tokenizer

In [None]:
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer

# Authenticate against the Hugging Face Hub with the token from the constants cell.
login(token=HUGGINGFACE_TOKEN)

# Stream the corpus instead of materialising it locally.
# NOTE: `trust_remote_code=True` lets the dataset's own loading script run on
# this machine; only keep it for repositories you trust.
ds_stream = load_dataset(
    "carolina-c4ai/corpus-carolina",
    split="corpus",
    streaming=True,
    trust_remote_code=True,
)

# Tokenizer published with the checkpoint named in `model_path`.
model_path = "TucanoBR/ViTucano-1b5-v1"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# In-memory n-gram buffers. Each maps a tuple of token ids to the number of
# times that tuple has been seen since the buffer was last flushed.
unigrams, bigrams, trigrams = {}, {}, {}

In [None]:
import ctypes
import gc

def release_ram():
    """
    Run the garbage collector, then ask glibc to return free heap memory to the OS.

    The second step loads `libc.so.6` and calls `malloc_trim(0)`. If either fails
    (for example on a platform without that library), the error is printed and
    not raised.
    """
    # Step 1: reclaim objects kept alive only by reference cycles.
    gc.collect()

    # Step 2: trim the C allocator's free memory (Linux/glibc only).
    try:
        ctypes.CDLL("libc.so.6").malloc_trim(0)
        print("RAM released via malloc_trim.")
    except Exception as trim_error:
        print(f"Could not run malloc_trim: {trim_error}")

In [None]:
import time

# Number of distinct n-grams a buffer may hold before it is flushed to ClickHouse.
BATCH = 10 * 1_000_000

def time_computation(func):
  """Call `func` with no arguments and return how long the call took, in seconds.

  Uses `time.perf_counter()`, a monotonic high-resolution clock, so the result
  cannot be negative or distorted by system clock adjustments, as it could
  with the wall-clock `time.time()`.

  Args:
    func: zero-argument callable to time. Its return value is discarded.

  Returns:
    Elapsed time in seconds as a float.
  """
  start_time = time.perf_counter()
  func()
  return time.perf_counter() - start_time

def flush(counts: dict[tuple[int, ...], int], gram: int, force_release_ram: bool = False):
  """Insert the buffered n-gram counts into their ClickHouse table and empty the buffer.

  Each entry of `counts` becomes one row `[t1, ..., t<gram>, count]`. The
  buffer is cleared only after the insert has returned, so a failed insert
  leaves `counts` untouched. An empty buffer is a no-op.

  Args:
    counts: maps a tuple of `gram` token ids to its count; cleared in place.
    gram: n-gram order; selects the target table through `grams_map`.
    force_release_ram: if true, call `release_ram()` after the buffer and the
      temporary row list have been dropped.

  Raises:
    KeyError: if `counts` is non-empty and `gram` is not a key of `grams_map`.
  """
  # Nothing buffered: skip the insert instead of failing on an empty row list.
  if not counts:
    return

  table = grams_map[gram]
  column_names = [f"t{i+1}" for i in range(gram)] + ['count']
  ops = [[*k, c] for k, c in counts.items()]

  # perf_counter is monotonic, unlike the wall-clock time.time().
  start_time = time.perf_counter()
  client.insert(table, ops, column_names=column_names)
  elapsed = time.perf_counter() - start_time

  counts.clear()
  # Drop the row copy too; otherwise it is still referenced while
  # release_ram() runs and its memory cannot be reclaimed.
  del ops

  if force_release_ram:
    release_ram()

  print(f"[{table.upper()}] Flushed in {elapsed}")

In [None]:
import os
# `exist_ok=True` keeps this cell re-runnable: a plain `os.mkdir` raises
# FileExistsError when the directory is left over from a previous run.
os.makedirs("data", exist_ok=True)

In [None]:
# Running totals updated by the corpus pass: records skipped because their
# text contains a newline, and tokens produced by the tokenizer.
with_newline, amount_tokens = 0, 0

In [None]:
# Number of records in the corpus, used as the progress-bar total.
# Hard-coded because the dataset does not provide this metadata.
TOTAL_SAMPLES = 2_108_999

In [None]:
# Split name -> file stem (without extension).
paths = dict(held_out='held_out', test='test')

# Split name -> path of the split's text file inside the `data` directory.
files = {}
for split in paths:
  files[split] = "/".join(("data", f"{paths[split]}.txt"))

print(files)

# Single pass over the streamed corpus. Each record is either skipped, written
# to the test file, or tokenized and added to the n-gram buffers.

# Number of records that reached the n-gram counting step below.
cnt = 0

with open(files['test'], 'w', encoding = 'utf-8') as test_file:
    for record in tqdm(ds_stream, total = TOTAL_SAMPLES):
        text = record['text']

        # Records containing a newline are dropped entirely: the test file
        # stores one record per line, and they are not counted either.
        if '\n' in text:
          with_newline += 1
          continue

        # One random draw per kept record decides its split. The draw order is
        # what makes the split reproducible for a given seed.
        p = random.random()

        # About 10% of the kept records go to the test file and are excluded
        # from the n-gram counts.
        if p <= 0.1:
          test_file.write(text + '\n')
          continue
        # TODO: held-out split is disabled; no `held_out_file` is opened here.
        # elif p <= 0.2:
        #   held_out_file.write(text + '\n')
        #   continue

        # Presumably a list of integer token ids (may include special tokens
        # added by the tokenizer — TODO confirm).
        encoding = tokenizer.encode(text)
        amount_tokens += len(encoding)

        # Keys are tuples, so all three buffers have the same shape.
        for t in encoding:
          unigrams[(t,)] = unigrams.get((t,), 0) + 1

        # Adjacent pairs within this record only; n-grams never span records.
        for t1,t2 in zip(encoding, encoding[1:]):
            bigrams[(t1,t2)] = bigrams.get((t1,t2),0) + 1

        # Adjacent triples within this record.
        for t1,t2,t3 in zip(encoding, encoding[1:], encoding[2:]):
            trigrams[(t1,t2,t3)] = trigrams.get((t1,t2,t3), 0) + 1

        # Flush any buffer holding at least BATCH distinct n-grams; `flush`
        # empties the buffer after inserting it.
        for counts, gram in [(unigrams, 1), (bigrams, 2), (trigrams, 3)]:
          if len(counts) >= BATCH:
            flush(counts, gram)

        cnt += 1


# Write whatever is still buffered after the last record. Empty buffers are
# skipped so that no empty insert is attempted (a buffer is empty if it was
# flushed on the final record or never received an n-gram).
for counts, gram in [(unigrams, 1), (bigrams, 2), (trigrams, 3)]:
  if counts:
    flush(counts, gram)

print(f"Found {with_newline} files that contained a newline character")