# Chunking

This code was taken from Saric (https://github.com/SanjaSaric/HSA-Topics/blob/master/Chunking.ipynb)

In [1]:
from pathlib import Path
import os 
import re

In [2]:
data = 'C:/Users/elisa/DH-MA/data' 

## Laden und sortieren

In [3]:
path_to_corpus = Path(data, '1808-1865') # Ausgangspfad (wird nicht überschrieben)

In [4]:
sorted(os.listdir(path=path_to_corpus))

['fpn-ball-ball.txt',
 'fpn-brownw-brown.txt',
 'fpn-grandy-grandy.txt',
 'fpn-hortonlife-horton.txt',
 'fpn-hortonpoem-hortonpoem.txt',
 'fpn-jackson-jackson.txt',
 'fpn-jacobs-jacobs.txt',
 'fpn-jones-jones.txt',
 'fpn-northup-northup.txt',
 'fpn-roper-roper.txt',
 'fpn-steward-steward.txt',
 'fpn-williams-williams.txt',
 'neh-aaron-aaron.txt',
 'neh-adamsh-adamsh.txt',
 'neh-aga-aga.txt',
 'neh-allen-allen.txt',
 'neh-allinson-allinson.txt',
 'neh-anderson-anderson.txt',
 'neh-andersonw-andersonw.txt',
 'neh-armistead-armistead.txt',
 'neh-auntjudy-auntjudy.txt',
 'neh-ballslavery-ball.txt',
 'neh-baquaqua-baquaqua.txt',
 'neh-barber-barber.txt',
 'neh-barrett-barrett.txt',
 'neh-bayley-bayley.txt',
 'neh-beard63-beard63.txt',
 'neh-beardj-beard.txt',
 'neh-bethune-bethune.txt',
 'neh-bibb-bibb.txt',
 'neh-black-black.txt',
 'neh-boen-boen.txt',
 'neh-boxbrown-boxbrown.txt',
 'neh-brinch-brinch.txt',
 'neh-brown47-brown47.txt',
 'neh-brown52-brown52.txt',
 'neh-brown55-brown55.txt',

In [6]:
filenames = [os.path.join(path_to_corpus, fn) for fn in sorted(os.listdir(path_to_corpus))]
filenames

['C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-ball-ball.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-brownw-brown.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-grandy-grandy.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-hortonlife-horton.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-hortonpoem-hortonpoem.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-jackson-jackson.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-jacobs-jacobs.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-jones-jones.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-northup-northup.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-roper-roper.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-steward-steward.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\fpn-williams-williams.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\neh-aaron-aaron.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808-1865\\neh-adamsh-adamsh.txt',
 'C:\\Users\\elisa\\DH-MA\\data\\1808

## Dokumente in chunks teilen

In [7]:
def split_text(filename, n_words):
    """Split a text into chunks approximately `n_words` words in length."""
    input = open(filename, 'r', encoding="utf-8")
    words = " ".join(re.sub(',|\.|\;|\:|\(|\)|\-','',input.read()).split()).split(' ') # remove special charachters and normalize space
    input.close()
    chunks = []
    current_chunk_words = []
    current_chunk_word_count = 0
    for word in words:
        current_chunk_words.append(word)
        current_chunk_word_count += 1
        if current_chunk_word_count == n_words:
            chunks.append(' '.join(current_chunk_words))
            current_chunk_words = []
            current_chunk_word_count = 0
    chunks.append(' '.join(current_chunk_words) )
    return chunks

In [8]:
filenames.sort()

In [9]:
chunk_length = 10000
chunks = []

for filename in filenames:
    chunk_counter = 0
    texts = split_text(filename, chunk_length)
    for text in texts:
        chunk = {'text': text, 'number': chunk_counter, 'filename': filename} # make dictionary with content and information
        chunks.append(chunk)
        chunk_counter += 1
        

Anzahl der Originaldateien:

In [10]:
len(filenames)

141

Anzahl der erzeugten chunks:

In [11]:
len(chunks)

588

Prüfen, ob eine Originaldatei zu kurz war, um sie im Topic Modeling überhaupt zu verwenden. 

In [12]:
i = 0
for chunk in chunks:
    l_chunk = len(chunk['text'].split(' '))
    if l_chunk < 23000 and chunk['number'] == 0:
        i+=1
        print(l_chunk, chunk['filename'],chunk['number'])
print('Anzahl der Dateien, die du entfernen solltest: ', i)

10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-ball-ball.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-brownw-brown.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-grandy-grandy.txt 0
3450 C:\Users\elisa\DH-MA\data\1808-1865\fpn-hortonlife-horton.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-hortonpoem-hortonpoem.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-jackson-jackson.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-jacobs-jacobs.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-jones-jones.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-northup-northup.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-roper-roper.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-steward-steward.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\fpn-williams-williams.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-aaron-aaron.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-adamsh-adamsh.txt 0
8787 C:\Users\elisa\DH-MA\data\1808-1865\neh-aga-aga.txt 0
10000 C

10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-twelvetr-twelvetr.txt 0
2723 C:\Users\elisa\DH-MA\data\1808-1865\neh-upham-upham.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-vale-vale.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-wards-ward.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-warner-warner.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-watkin52-watkin52.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-watkins-watkins.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-watson-watson.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-weld-weld.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-wheatley-wheatley.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-white-white.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-whitegeo-whitegeo.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-wilkerson-wilkerson.txt 0
10000 C:\Users\elisa\DH-MA\data\1808-1865\neh-williamsjames-williams.txt 0
Anzahl der Dateien, die du entfernen solltest:  141


Hatte eine Datei beispielsweise 110 Tokens, werden 2 chunks produziert: <br>
1) mit 100 Tokens <br>
2) mit 10 Tokens. <br>
Wir möchten diese kurzen chunks ihren vorhergehenden Geschwisterdateien hinzufügen.  

In [13]:
i = 0
for chunk in chunks:
    index = chunks.index(chunk)
    l_chunk = len(chunk['text'].split(' '))
    if l_chunk < 3000 and chunk['number'] != 0:
        i+=1
        chunks[index-1]['text'] = chunks[index-1]['text'] + ' ' + chunk['text']
        print('Chunk ' + chunk['filename'] + str(chunk['number']-1) + ' erweitert mit chunk ' + str(chunk['number']) + ' auf dem Index ' + str(index))
        
print('Anzahl der erweiterten chunks: ' + str(i))

Chunk C:\Users\elisa\DH-MA\data\1808-1865\fpn-northup-northup.txt7 erweitert mit chunk 8 auf dem Index 41
Chunk C:\Users\elisa\DH-MA\data\1808-1865\fpn-steward-steward.txt7 erweitert mit chunk 8 auf dem Index 52
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-adamsh-adamsh.txt7 erweitert mit chunk 8 auf dem Index 67
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-allen-allen.txt1 erweitert mit chunk 2 auf dem Index 71
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-andersonw-andersonw.txt1 erweitert mit chunk 2 auf dem Index 76
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-barber-barber.txt1 erweitert mit chunk 2 auf dem Index 117
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-bayley-bayley.txt0 erweitert mit chunk 1 auf dem Index 120
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-beard63-beard63.txt11 erweitert mit chunk 12 auf dem Index 133
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-beardj-beard.txt11 erweitert mit chunk 12 auf dem Index 146
Chunk C:\Users\elisa\DH-MA\data\1808-1865\neh-b

In [14]:
#chunks

Nun können diejenigen chunks, die bereits zu ihren Geschwisterdateien kopiert wurden, sowie diejenigen chunks, die sehr kurz waren und keine Geschwister hatten (= kurze Originalfiles) gelöscht werden.

In [15]:
i = 0
for chunk in chunks:
    index = chunks.index(chunk)
    l_chunk = len(chunk['text'].split(' '))
    if l_chunk < 3000:
        i+=1
        chunks.remove(chunk)
        
print('Gelöschte chunks: ' + str(i))

Gelöschte chunks: 53


In [17]:
print('Übriggebliebene: ' + str(len(chunks)))

Übriggebliebene: 535


## chunks zu Textdateien speichern

In [18]:
output_dir = 'C:/Users/elisa/DH-MA/data/1808-1865-chunks'

In [19]:
""" Quelle für Code: DARIAH-DE (https://liferay.de.dariah.eu/tatom/index.html)
    for chunk in chunks:
    basename = os.path.basename(chunk['filename'])
    fn = os.path.join(output_dir, "{}{:04d}".format(basename, chunk['number']))
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(chunk['text'])
"""
# umgeändert, sodass valide txt-Dateien als output kommen
for chunk in chunks:
    basename = os.path.basename(chunk['filename'])
    fn_base, fn_ext = os.path.splitext(basename)
    fn = os.path.join(output_dir, "{}_{:04d}{}".format(fn_base, chunk['number'], fn_ext))
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(chunk['text'])

Testen, ob kurze Dateien übriggeblieben sind:

In [20]:
# Test if short files remained
i = 0
for chunkfile in Path(data, output_dir).glob('*.txt'):
    with open(chunkfile, encoding='utf-8') as f:
        text = f.read().split(' ')
        #print(len(text))
        if len(text) < 3000:
            i+=1
            print(chunkfile.name)
print('Übriggebliebene kurze Files: ', i)

neh-bethune-bethune_0000.txt
neh-gallaudet-gallaudet_0000.txt
Übriggebliebene kurze Files:  2


Wenn ja, dann sollten diese auch manuell entfernt oder zu Geschwisterdateien hinzugefügt werden.
