In [1]:
import re
import string
import numpy as np
import spacy
import torch
import gzip
import json
from nltk.tokenize import word_tokenize, TweetTokenizer
from polyglot.text import Text
import itertools

In [2]:
with open(r'contents\spacy_lg_reduced_embeddings.npy', 'rb') as f:
    embeddings = np.load(f)
nlp = spacy.load('en_core_web_lg', disable=['tok2vec','tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
nlp.max_length = len(' '.join(list(nlp.vocab.strings)))+1
all_vocab_doc = nlp(' '.join(list(nlp.vocab.strings)))
all_vocab_str = [f'{t}' for t in all_vocab_doc]
embeddings = torch.from_numpy(embeddings)
embeddings = (embeddings - torch.min(embeddings)) / (torch.max(embeddings)-torch.min(embeddings))
token_vocab_dict = dict(zip(all_vocab_str, embeddings))
token_vocab_dict['<n>'] = token_vocab_dict['newline']

In [3]:
ttokenizer = TweetTokenizer()

In [4]:
sample_text = 'This is sample sentents<n>and this is another' + '\x01'
sample_tokens = ttokenizer.tokenize(sample_text)
print(sample_tokens)

['This', 'is', 'sample', 'sentents', '<n>', 'and', 'this', 'is', 'another', '\x01']


In [5]:
with gzip.open(rf'contents\web_contents.json.gz', 'r') as file:
    web_contents = json.load(file)

In [6]:
chars_to_replace = string.punctuation + ' \n\t'
allowed_chars = string.ascii_letters + string.digits + string.punctuation
all_chars = set(allowed_chars)
def break_stucked_together_words(doc):
    if len(doc) < 3:
        return
    ptext = Text(doc)
    ptext.language = "en"
    Text(doc)
    tokens = [str(word) for word in ptext.morphemes]
    tokens = [t for t in tokens if t in token_vocab_dict]
    return tokens
    
    
def remove_residuals(s, tokenizer):
    s = s.replace('\n', '<n>')
    
    # Define regex patterns for better handling of redundant symbols and characters
    redundant_symbols_pattern = rf'\s([{chars_to_replace}]){{2,}}'
    non_allowed_chars_pattern = rf'[^{re.escape(allowed_chars)}]'
    excessive_short_words_pattern = r'((\b\w{1,2}\b) ){5,}'  # Adjusted threshold to prevent over-removal
    excessive_repetition_pattern = r'(.)\1{2,}'  # Keep a max of 2 consecutive repetitions to preserve real words
    excessive_alpha_repetition_pattern = r'([a-zA-Z])\1{3,}'  # Remove only extreme cases of repetition
    
    for i in range(2):
        s = re.sub('\s([' + chars_to_replace + ']){2}[' + chars_to_replace + ']+', r'\1', s)
        # s = re.sub(r'([{}])\1+'.format(re.escape(string.punctuation + ' ')), r'\1', s)
        s = re.sub(f'[^{re.escape(allowed_chars)}]', ' ', s)
        # s = re.sub(r'([0-9!\"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~ \t\n\r]{9,})', '', s)
        s = re.sub( r'([a-zA-Z])\1{3,}', r'\1', s)
        s = re.sub(r'((\b\w{1,2}\b) ){4,}', '', s)
        s = re.sub(r'(.)\1+', r'\1 ', s)
        
    tokens = tokenizer(s)
    # print(f'1: {tokens}')
    # tokens = [t if t in token_vocab_dict else break_stucked_together_words(t) for t in tokens ]
    # print(f'2: {tokens}')
    # tokens = list(itertools.chain.from_iterable(tokens))
    tokens = [t for t in tokens if t in token_vocab_dict]
    # print(f'3: {tokens}')
    s = ' '.join(tokens)
    s = re.sub(r'([{}])\1+'.format(re.escape(string.punctuation + ' ')), r'\1', s)
    s = re.sub(f'[^{re.escape(allowed_chars)}]', ' ', s)
    s = re.sub(r'([0-9!\"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~ \t\n\r]{9,})', '', s)
    s = re.sub( r'([a-zA-Z])\1{3,}', r'\1', s)
    s = re.sub(r'(.)\1+', r'\1', s)
    s = re.sub(r'\s(.\s){3,}', ' ', s)
        
    return s

In [7]:
import re
import string

def remove_residuals(s, tokenizer):
    # Temporary placeholder for newlines
    s = s.replace('\n', '<n>')

    # Define regex patterns for better handling of redundant symbols and characters
    redundant_symbols_pattern = rf'\s([{chars_to_replace}]){{2,}}'
    non_allowed_chars_pattern = rf'[^{re.escape(allowed_chars)}]'
    excessive_short_words_pattern = r'((\b\w{1,2}\b) ){5,}'  # Adjusted threshold to prevent over-removal
    excessive_repetition_pattern = r'(.)\1{2,}'  # Keep a max of 2 consecutive repetitions to preserve real words
    excessive_alpha_repetition_pattern = r'([a-zA-Z])\1{3,}'  # Remove only extreme cases of repetition

    # Initial cleanup loop
    for i in range(2):
        s = re.sub(redundant_symbols_pattern, r'\1', s)
        s = re.sub(non_allowed_chars_pattern, ' ', s)
        s = re.sub(excessive_alpha_repetition_pattern, r'\1', s)
        s = re.sub(excessive_short_words_pattern, '', s)
        s = re.sub(excessive_repetition_pattern, r'\1', s)  # Allow two consecutive characters max

    # Tokenize and filter tokens based on vocabulary but keep tokens that match typical word structures
    tokens = tokenizer(s)
    refined_tokens = []
    for t in tokens:
        if t in token_vocab_dict or re.match(r'\b\w{3,}\b', t):  # Keep tokens if in vocab or likely valid
            refined_tokens.append(t)

    # Reassemble the text from the cleaned tokens
    s = ' '.join(refined_tokens)

    # Final cleanup to remove unnecessary punctuation or spaces
    s = re.sub(r'([{}])\1+'.format(re.escape(string.punctuation + ' ')), r'\1', s)
    s = re.sub(non_allowed_chars_pattern, ' ', s)
    s = re.sub(excessive_alpha_repetition_pattern, r'\1', s)
    s = re.sub(excessive_repetition_pattern, r'\1', s)
    s = re.sub(r'\s(.\s){3,}', ' ', s)

    # Restore newlines
    s = s.replace('<n>', '\n')

    return s


In [8]:
text_lengths = [len(v) for k,v in web_contents.items()]


In [12]:
text_keys[np.argmax(text_lengths)]

'27473'

In [9]:

text_keys = [k for k,v in web_contents.items()]
len(web_contents[text_keys[np.argmax(text_lengths)]])

14630099

In [10]:
# target_text = 5
# text = web_contents[f'{target_text}']
text = web_contents[text_keys[np.argmax(text_lengths)]][:10000]
# print(len(web_contents[f'{target_text}']))
print(text.replace('\n', ''))
print(len(ttokenizer.tokenize(text)))
print(ttokenizer.tokenize(text))

%PDF-1.4%����2275 0 obj<>endobjxref2275 270000000016 00000 n0000003037 00000 n0000003188 00000 n0000003660 00000 n0000003775 00000 n0000003892 00000 n0000004590 00000 n0000005176 00000 n0000005816 00000 n0000006454 00000 n0000007074 00000 n0000007680 00000 n0000008285 00000 n0000008381 00000 n0000009086 00000 n0000009764 00000 n0000010381 00000 n0000010952 00000 n0000016755 00000 n0000016834 00000 n0000017111 00000 n0000020183 00000 n0000023255 00000 n0000039748 00000 n0000214862 00000 n0000002818 00000 n0000000851 00000 ntrailer<]/Prev 15555179/XRefStm 2818>>startxref0%%EOF2301 0 obj<>streamh��VkP�W>ߗ{DIA!�PVB,�b u�p�����q���� �	CR��(�jo`�d�A�8rME���k�R�e����=	�;�s��7��9�����   �� @�7p.o��D� :J,j�@e (�X:� Ԉ�� f1��C�I����@~� �<źg8����$>,��`>�7�|ߎ%d�w�\<�B����:t��ߏ��זʈs��&ia��M��w�o�d����z�����v�d6�f��LԼ3Q��h��/��U�!Z���w�N���2�jO��Zl�UN�NMgA登��M{�t��&RQ�KvUJP@��}ۨ��H�H��/X��%��̳�� �z��Q��}>�J?e��4��I	Ij�Eé9�1��ߦO�/�U�]�yl�r����ȕ�O�v˟��=H��Ԣ~+

In [11]:
reduced_text = remove_residuals(text, ttokenizer.tokenize)
print(len(reduced_text))
print(reduced_text)
print(len(ttokenizer.tokenize(reduced_text)))
print(ttokenizer.tokenize(reduced_text))

3471
% PDF -1.4 
 % 
 2275 0 obj 
 < > 
 endobj 
 xref 
 2275 27 
 016 0 n 
 03037 0 n 
 03188 0 n 
 03660 0 n 
 03775 0 n 
 03892 0 n 
 04590 0 n 
 05176 0 n 
 05816 0 n 
 06454 0 n 
 07074 0 n 
 07680 0 n 
 08285 0 n 
 08381 0 n 
 09086 0 n 
 09764 0 n 
 010381 0 n 
 010952 0 n 
 016755 0 n 
 016834 0 n 
 0171 0 n 
 020183 0 n 
 023255 0 n 
 039748 0 n 
 0214862 0 n 
 02818 0 n 
 0851 0 n 
 trailer 
 Prev 15179 / XRefStm 2818 > > 
 startxref 
 0 
 % % EOF 
 2301 0 obj 
 < > stream 
 h VkP W { DIH A PVB jo 8rME k R 
 e 
 
 7p 
 @ ~ g8 
 
 s & ia M 
 L 3Q 
 2 
 U N NMgA RQ KvUJP 
 
 
 
 yl r 
 
 n3 KiUi 
 
 C5G WB P is Q6 \ { QT 
 + d= a6 SZ 
 N 
 Kl D 
 Hr { yj 
 zJH $ 
 
 i_in 
 vk 0 
 bs 
 tKk I 
 
 Wg z da ` 5 TX Hj_ Ic / + 7o Q N 
 V LNT 
 
 s 4 
 ( 83 
 A 
 H 16 
 
 
 
 w 
 
 r 
 O3g 
 F 
 endstream 
 endobj 
 2300 0 obj 
 Filter / FlateDecode / Index [ 423 1852 ] / Length 65 / Size 2275 / Type / XRef stream 
 
 Kc sx 9 < 
 sx 9 < 
 
 ! E 
 endstream 
 endobj 
 2276 0 obj 
 Metad

In [318]:
for i in range(10):
    reduced_text2 = remove_residuals(text, ttokenizer.tokenize)

In [320]:
with gzip.open(rf'contents\web_contents.json.gz', 'rt') as file:
    web_contents = json.load(file)

# doc_lengths = np.array([len(web_contents[i]) for i in web_contents])
web_contents = {int(k): remove_residuals(v, ttokenizer.tokenize) for k, v in web_contents.items()}
    
# sstrip = lambda doc: re.sub(' +', ' ', doc)
# web_contents2 = {i:sstrip(web_contents[i]) for i in web_contents}
with gzip.open(rf'contents\web_contents_reduced.json.gz', 'wt') as file:
    json.dump(web_contents, file)