In [2]:
# Setup cell: load the project's WordPiece tokenizer and bring in the libraries
# used by the statistics cells further down.
from src.textclf_transformer.tokenizer.wordpiece_tokenizer_wrapper import WordPieceTokenizerWrapper
# `tok` is a notebook-global tokenizer; token_stats() calls tok.encode() on it.
tok = WordPieceTokenizerWrapper()
# Path is relative, so the notebook presumably has to be run from the repository root — TODO confirm.
tok.load(tokenizer_dir="src/textclf_transformer/tokenizer/BERT_original")
from datasets import load_dataset
import torch

In [None]:

def token_stats(dataset: str, max_len: int) -> dict:
    """Tokenize the text of each kept split of a dataset and summarise token counts.

    For every split returned by ``load_dataset(dataset)`` (except the splits
    listed in ``skipped_splits`` below) the ``'text'`` column is encoded with
    the notebook-global tokenizer ``tok`` and the number of non-zero token ids
    per example is counted.

    Args:
        dataset: Name passed to ``datasets.load_dataset``.
        max_len: Value forwarded to ``tok.encode`` as ``max_length``; examples
            whose token count equals it are reported as having reached the cap.

    Returns:
        ``{split: stats}`` where ``stats`` has the keys ``avg_tokens``,
        ``std_tokens``, ``median_tokens`` (floats) and ``min_tokens``,
        ``max_tokens`` (ints). Empty splits are reported and left out.

    Side effects:
        Prints, per split, how many examples reach ``max_len`` and what share
        of the split that is.
    """
    # Splits that are never analysed, keyed by dataset name.
    # NOTE(review): the reason for skipping them is not visible here
    # (presumably size for arxiv 'train' and missing labels for imdb
    # 'unsupervised') — confirm.
    skipped_splits = {
        'ccdv/arxiv-classification': {'train'},
        'imdb': {'unsupervised'},
    }

    ds = load_dataset(dataset)
    result = {}
    for split in list(ds.keys()):
        if split in skipped_splits.get(dataset, ()):
            continue

        data = list(ds[split]['text'])
        if not data:
            # min()/max() of an empty tensor raise, and the share below would
            # divide by zero, so report the empty split and move on.
            print(f"Split: {split}\nNo examples in this split, skipped")
            continue

        tokenized = tok.encode(data, max_length=max_len)
        # Assumes tokenized[:][0] is a 2-D tensor of token ids (one row per
        # example) in which id 0 is padding — TODO confirm against the wrapper.
        real_tokens = tokenized[:][0] != 0

        # Count once as integers; the float copy is only needed for mean/std.
        token_counts = real_tokens.sum(dim=1)
        token_lengths = token_counts.to(torch.float32)

        # An example whose count equals max_len hit the cap: it was either
        # truncated or happens to be exactly max_len tokens long.
        max_len_achieved = int((token_counts == max_len).sum())
        # Plain Python arithmetic avoids float32 noise such as 7.28000020980835.
        share = 100 * max_len_achieved / len(data)
        print(f"Split: {split}\nNumber of examples that reach max_len tokens (likely truncated):\n{max_len_achieved}\n{share}% of total data")

        stats = {
            "avg_tokens": float(token_lengths.mean()),
            # Tensor.std() applies Bessel's correction by default.
            "std_tokens": float(token_lengths.std()),
            # torch.median returns the lower of the two middle values when the
            # number of examples is even.
            "median_tokens": float(torch.median(token_lengths)),
            "min_tokens": int(token_lengths.min()),
            "max_tokens": int(token_lengths.max()),
        }
        result[split] = stats
    return result

In [9]:
# Token-count statistics for the 'imdb' dataset, tokenized with max_length=4000.
# NOTE(review): the recorded output of this cell lists an 'unsupervised' split,
# but token_stats skips that split for 'imdb' — the output looks stale
# (produced by an earlier version of the function); re-run to refresh it.
imdb = token_stats('imdb', 4000)

[INFO] input is treated as a list of input texts
Split: train
Number of examples that are longer that max_len:
0
0.0% of total data
[INFO] input is treated as a list of input texts
Split: test
Number of examples that are longer that max_len:
0
0.0% of total data
[INFO] input is treated as a list of input texts
Split: unsupervised
Number of examples that are longer that max_len:
0
0.0% of total data


In [10]:
# Display the per-split statistics dict returned by token_stats for 'imdb'.
imdb

{'train': {'avg_tokens': 313.8713073730469,
  'std_tokens': 234.29586791992188,
  'median_tokens': 233.0,
  'min_tokens': 13,
  'max_tokens': 3127},
 'test': {'avg_tokens': 306.77099609375,
  'std_tokens': 227.89404296875,
  'median_tokens': 230.0,
  'min_tokens': 10,
  'max_tokens': 3157},
 'unsupervised': {'avg_tokens': 314.8410339355469,
  'std_tokens': 234.513671875,
  'median_tokens': 234.0,
  'min_tokens': 13,
  'max_tokens': 3446}}

In [5]:
# Token-count statistics for 'ccdv/arxiv-classification', tokenized with
# max_length=32768 (token_stats skips this dataset's 'train' split).
# Examples that reach the cap are counted at max_len, so the reported
# max_tokens cannot exceed 32768.
arxiv = token_stats('ccdv/arxiv-classification', 32768)

[INFO] input is treated as a list of input texts
Split: validation
Number of examples that are longer that max_len:
182
7.28000020980835% of total data
[INFO] input is treated as a list of input texts
Split: test
Number of examples that are longer that max_len:
164
6.559999942779541% of total data


In [6]:
# Display the per-split statistics dict returned by token_stats for arxiv.
arxiv

{'validation': {'avg_tokens': 15012.7451171875,
  'std_tokens': 8385.609375,
  'median_tokens': 12885.0,
  'min_tokens': 1373,
  'max_tokens': 32768},
 'test': {'avg_tokens': 14745.8623046875,
  'std_tokens': 8257.8232421875,
  'median_tokens': 12631.0,
  'min_tokens': 1268,
  'max_tokens': 32768}}

In [13]:
import torch

# Token-count statistics for the pre-tokenized Wikipedia training file,
# reported separately for the rows before and after SEGMENT_BOUNDARY.
# NOTE(review): the recorded output shows max_tokens of 128 for the first range
# and 512 for the second, which suggests the two ranges were tokenized with
# different length caps — confirm against the preprocessing script.
SEGMENT_BOUNDARY = 450000

# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects; only load files produced by this project.
tokenized = torch.load("data/tokenized/wikipedia_train.pt", weights_only=False)

for rows in (slice(None, SEGMENT_BOUNDARY), slice(SEGMENT_BOUNDARY, None)):
    # Assumes tokenized[rows][0] is a 2-D tensor of token ids (one row per
    # example) with 0 as the padding id — TODO confirm.
    real_tokens = tokenized[rows][0] != 0
    token_lengths = real_tokens.sum(dim=1, dtype=torch.float32)

    stats = {
        "avg_tokens": float(token_lengths.mean()),
        "std_tokens": float(token_lengths.std()),
        # torch.median returns the lower middle value for an even row count.
        "median_tokens": float(torch.median(token_lengths)),
        "min_tokens": int(token_lengths.min()),
        "max_tokens": int(token_lengths.max()),
    }
    result = stats
    print(result)

{'avg_tokens': 127.41764068603516, 'std_tokens': 5.4616780281066895, 'median_tokens': 128.0, 'min_tokens': 19, 'max_tokens': 128}
{'avg_tokens': 452.4222412109375, 'std_tokens': 111.8324203491211, 'median_tokens': 512.0, 'min_tokens': 20, 'max_tokens': 512}


In [8]:
import pickle
import numpy as np

# Length statistics for the raw (untokenized) Wikipedia texts.
# The "*_tokens" keys here count whitespace-separated words, not WordPiece tokens.
path = "data/raw/wikipedia.pkl"  # or .pickle

# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open(path, "rb") as f:
    sample = pickle.load(f)

# Single pass over the texts, collecting both measurements per text.
word_counts = []
char_counts = []
for text in sample:
    word_counts.append(len(text.split()))
    char_counts.append(len(text))

token_lengths = np.array(word_counts)
char_lengths = np.array(char_counts)

# ndarray.std() defaults to ddof=0 (population standard deviation).
result = dict(
    avg_tokens=float(token_lengths.mean()),
    std_tokens=float(token_lengths.std()),
    median_tokens=float(np.median(token_lengths)),
    min_tokens=int(token_lengths.min()),
    max_tokens=int(token_lengths.max()),
    avg_chars=float(char_lengths.mean()),
    std_chars=float(char_lengths.std()),
)
print(result)

{'avg_tokens': 481.087155, 'std_tokens': 162.2585449491212, 'median_tokens': 600.0, 'min_tokens': 11, 'max_tokens': 600, 'avg_chars': 3003.112855, 'std_chars': 1003.5419222012681}


In [15]:
import torch

# Token-count statistics for the pre-tokenized arxiv splits.
# One loop replaces three copies of the same load/count/summarise code.
split_files = {
    'train': "data/tokenized/arxiv_train.pt",
    'test': "data/tokenized/arxiv_test.pt",
    'val': "data/tokenized/arxiv_val.pt",
}

result = {}
for split, pt_path in split_files.items():
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # Python objects; only load files produced by this project.
    tokenized = torch.load(pt_path, weights_only=False)
    # Assumes tokenized[:][0] is a 2-D tensor of token ids (one row per
    # example) with 0 as the padding id — TODO confirm.
    real_tokens = tokenized[:][0] != 0
    token_lengths = real_tokens.sum(dim=1, dtype=torch.float32)

    stats = {
        "avg_tokens": float(token_lengths.mean()),
        "std_tokens": float(token_lengths.std()),
        # torch.median returns the lower middle value for an even row count.
        "median_tokens": float(torch.median(token_lengths)),
        "min_tokens": int(token_lengths.min()),
        "max_tokens": int(token_lengths.max()),
    }
    result[split] = stats

print('arxiv')
print(result)


arxiv
{'train': {'avg_tokens': 12094.607421875, 'std_tokens': 4260.59130859375, 'median_tokens': 12836.0, 'min_tokens': 910, 'max_tokens': 16384}, 'test': {'avg_tokens': 12062.6240234375, 'std_tokens': 4342.11865234375, 'median_tokens': 12863.0, 'min_tokens': 1009, 'max_tokens': 16384}, 'val': {'avg_tokens': 12055.3427734375, 'std_tokens': 4320.77392578125, 'median_tokens': 12795.0, 'min_tokens': 650, 'max_tokens': 16384}}


In [16]:
import torch

# Token-count statistics for the pre-tokenized imdb splits.
# One loop replaces three copies of the same load/count/summarise code.
split_files = {
    'train': "data/tokenized/imdb_train.pt",
    'test': "data/tokenized/imdb_test.pt",
    'val': "data/tokenized/imdb_val.pt",
}

result = {}
for split, pt_path in split_files.items():
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # Python objects; only load files produced by this project.
    tokenized = torch.load(pt_path, weights_only=False)
    # Assumes tokenized[:][0] is a 2-D tensor of token ids (one row per
    # example) with 0 as the padding id — TODO confirm.
    real_tokens = tokenized[:][0] != 0
    token_lengths = real_tokens.sum(dim=1, dtype=torch.float32)

    stats = {
        "avg_tokens": float(token_lengths.mean()),
        "std_tokens": float(token_lengths.std()),
        # torch.median returns the lower middle value for an even row count.
        "median_tokens": float(torch.median(token_lengths)),
        "min_tokens": int(token_lengths.min()),
        "max_tokens": int(token_lengths.max()),
    }
    result[split] = stats

print('imdb')
print(result)


imdb
{'train': {'avg_tokens': 262.88775634765625, 'std_tokens': 137.1915740966797, 'median_tokens': 220.0, 'min_tokens': 10, 'max_tokens': 512}, 'test': {'avg_tokens': 262.44219970703125, 'std_tokens': 137.07003784179688, 'median_tokens': 221.0, 'min_tokens': 13, 'max_tokens': 512}, 'val': {'avg_tokens': 263.4132080078125, 'std_tokens': 137.46031188964844, 'median_tokens': 220.0, 'min_tokens': 18, 'max_tokens': 512}}


In [17]:
import torch

# Token-count statistics for the pre-tokenized hyperpartisan splits.
# One loop replaces three copies of the same load/count/summarise code.
split_files = {
    'train': "data/tokenized/hyperpartisan_train.pt",
    'test': "data/tokenized/hyperpartisan_test.pt",
    'val': "data/tokenized/hyperpartisan_val.pt",
}

result = {}
for split, pt_path in split_files.items():
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # Python objects; only load files produced by this project.
    tokenized = torch.load(pt_path, weights_only=False)
    # Assumes tokenized[:][0] is a 2-D tensor of token ids (one row per
    # example) with 0 as the padding id — TODO confirm.
    real_tokens = tokenized[:][0] != 0
    token_lengths = real_tokens.sum(dim=1, dtype=torch.float32)

    stats = {
        "avg_tokens": float(token_lengths.mean()),
        "std_tokens": float(token_lengths.std()),
        # torch.median returns the lower middle value for an even row count.
        "median_tokens": float(torch.median(token_lengths)),
        "min_tokens": int(token_lengths.min()),
        "max_tokens": int(token_lengths.max()),
    }
    result[split] = stats

print('hyperpartisan')
print(result)


hyperpartisan
{'train': {'avg_tokens': 2422.914306640625, 'std_tokens': 807.1834106445312, 'median_tokens': 2150.0, 'min_tokens': 100, 'max_tokens': 4096}, 'test': {'avg_tokens': 2504.314453125, 'std_tokens': 898.734375, 'median_tokens': 2181.0, 'min_tokens': 1296, 'max_tokens': 4096}, 'val': {'avg_tokens': 2503.489501953125, 'std_tokens': 896.273681640625, 'median_tokens': 2197.0, 'min_tokens': 1255, 'max_tokens': 4096}}
