In [1]:
from pathlib import Path
from tqdm import tqdm
import os
import spacy
# Shared spaCy pipeline used by every cell below; loaded once because loading is expensive.
nlp = spacy.load('en_core_web_lg')
# Upper bound (in characters) on the text handed to a single nlp() call.
# remove_junk() compares each file against this value and feeds longer texts
# to the pipeline in pieces no longer than this.
nlp.max_length = 1000000

In [2]:
# Tokens whose coarse POS tag or dependency label is in these tuples are dropped.
_JUNK_POS = ('SPACE', 'PUNCT')
_JUNK_DEP = ('punct', 'nummod')


def _iter_chunks(text: str, limit: int):
    """Yield consecutive slices of ``text``, each at most ``limit`` characters long.

    When a slice would end before the end of the text, the cut is moved back to
    just after the last space/newline/tab inside the window so that a word is
    not split across two parser calls. If the window contains no usable
    whitespace, the slice is cut at exactly ``limit`` characters. Concatenating
    the yielded slices reproduces ``text``.
    """
    if limit <= 0:
        raise ValueError('limit must be a positive number of characters')
    total = len(text)
    start = 0
    while start < total:
        end = min(start + limit, total)
        if end < total:
            cut = max(text.rfind(ws, start, end) for ws in (' ', '\n', '\t'))
            if cut > start:
                end = cut + 1
        yield text[start:end]
        start = end


def _clean_sentence(sentence) -> str:
    """Return the sentence's tokens joined by single spaces, without junk tokens."""
    return " ".join(
        str(token)
        for token in sentence
        if token.pos_ not in _JUNK_POS and token.dep_ not in _JUNK_DEP
    )


def remove_junk(file: Path):
    """Write a cleaned, one-sentence-per-line copy of ``file``.

    The output path is ``file`` with its last suffix replaced by
    ``.sentences.txt``. If that path already exists the function returns
    without doing anything. Otherwise the text is parsed with the module-level
    ``nlp`` pipeline and, for every detected sentence, the tokens that are not
    whitespace, punctuation or numeric modifiers are written space-separated on
    one line.

    Texts longer than ``nlp.max_length`` are parsed in several pieces (cut at
    whitespace where possible).

    Args:
        file: Path of the text file to process.
    """
    output_file = file.with_suffix('.sentences.txt')
    if output_file.exists():
        return

    # Undecodable bytes are silently dropped rather than aborting the whole run.
    with open(file, 'r', errors='ignore') as f:
        data = f.read()

    if len(data) > nlp.max_length:
        print(f'{file} is too long, using chunks...')

    # Write to a temporary sibling and rename only on success: the early return
    # above treats an existing output file as "done", so a half-written file
    # left by an interrupted run must never appear under the final name.
    partial_file = output_file.with_name(output_file.name + '.tmp')
    with open(partial_file, 'w') as out:
        for chunk in _iter_chunks(data, nlp.max_length):
            for sentence in nlp(chunk).sents:
                out.write(_clean_sentence(sentence) + '\n')
    partial_file.replace(output_file)


In [3]:
# Collect every parsed document under the data directory, smallest first.
root = Path('data')
file_list = sorted(root.rglob('*.parsed.txt'), key=os.path.getsize)
file_count = len(file_list)

# Clean each document in turn; remove_junk() skips files whose output already exists.
for file in tqdm(file_list, total=file_count):
    remove_junk(file)

  0%|                                                                                                                                                                                                                                                            | 0/1400 [00:00<?, ?it/s]

data/Doc3704154726/04_TT203001_Toronto_EIR_BlackCreekSanitaryTrunkSewer_9Mar21.parsed.txt is too long, using chunks...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1400/1400 [04:06<00:00,  5.67it/s]
