#### Imports

In [1]:
import re
import os
import numpy as np
from src.data_utils import load_texts, clean_text, save_texts, tokenize, save_tokenized, load_tokenized, is_ascii

  from .autonotebook import tqdm as notebook_tqdm


#### Load the cleaned dataset from disk, or clean the raw dataset and save it

In [2]:
processed_path = 'data/dataset_processed.txt'

# Cache guard: reuse the processed file when it already exists so the
# cleaning step is not re-run on every execution of the notebook.
if os.path.exists(processed_path):
  cleaned_ascii_texts = load_texts(processed_path)
  print(f'Loaded from file: {len(cleaned_ascii_texts)} lines')
else:
  texts = load_texts('data/tweets.txt')
  cleaned_texts = [clean_text(t) for t in texts]
  # Keep only the cleaned texts that is_ascii accepts.
  cleaned_ascii_texts = [t for t in cleaned_texts if is_ascii(t)]

  # Write to the same path the guard above checks, so the cache location is
  # defined in exactly one place.
  save_texts(cleaned_ascii_texts, processed_path)
  print(f'Dataset cleaned and saved: {len(cleaned_ascii_texts)} lines')

Dataset cleaned and saved: 1596158 lines


#### Load the tokenized dataset from disk, or tokenize the cleaned dataset and save it

In [3]:
tokenized_path = 'data/dataset_tokenized.json'

# No tokenized file on disk yet: tokenize the cleaned texts and persist the
# result; otherwise read the previously saved tokens back in.
if not os.path.exists(tokenized_path):
  tokenized = tokenize(cleaned_ascii_texts)
  save_tokenized(tokenized, tokenized_path)
  print(f'Tokenized and saved: {len(tokenized)} samples')
else:
  tokenized = load_tokenized(tokenized_path)
  print(f'Loaded from file: {len(tokenized)} samples')

Tokenized and saved: 1596158 samples


#### Analyze lengths of samples

In [4]:
# Number of tokens in every sample; kept as a plain list for later cells.
lengths = [len(t) for t in tokenized]

# Convert to an array once instead of letting every NumPy call below
# re-convert the whole Python list.
lengths_arr = np.asarray(lengths)
percentiles = [50, 75, 90, 95, 99]

print(f'Min: {lengths_arr.min()}, Max: {lengths_arr.max()}, Mean: {lengths_arr.mean():.2f}')
# A single np.percentile call evaluates all requested percentiles together.
for p, value in zip(percentiles, np.percentile(lengths_arr, percentiles)):
  # int() truncates the interpolated percentile, as in the original output.
  print(f'P{p}: {int(value)}')

Min: 1, Max: 94, Mean: 16.47
P50: 15
P75: 23
P90: 29
P95: 32
P99: 37
