In [9]:
import os, subprocess, shutil, re, tarfile
import pandas as pd
from nltk.corpus import stopwords

In [10]:
# Locations inside the GraphSeg checkout, relative to the notebook's working directory.
GRAPHSEG_DIR = '../GraphSeg'
DATA_DIR = f'{GRAPHSEG_DIR}/data'
BINARY_DIR = f'{GRAPHSEG_DIR}/binary'

# Working folders: untouched copies -> cleaned documents fed to the jar -> jar results.
ORIG_DIR = f'{DATA_DIR}/input_orig'
INPUT_DIR = f'{DATA_DIR}/input'
OUTPUT_DIR = f'{DATA_DIR}/output'

# Segmenter jars, one per language.
SEG_EN = f'{BINARY_DIR}/graphseg_en.jar'
SEG_NL = f'{BINARY_DIR}/graphseg_nl.jar'

In [17]:
def copy_data(location: str, n=5, wiki=False):
    """
    Copy up to n files from location (searched recursively) to the input_orig directory.

    Files are stored flat in ORIG_DIR under their base name. A file whose name
    already exists in ORIG_DIR is not overwritten, but it still counts towards n.

    Args:
        location: Directory that is walked for source files.
        n: Maximum number of files to consider.
        wiki: If True, strip the wiki wrapper from each copied file: every line
            starting with '<doc', and every empty line that is directly followed
            by a line starting with '</doc' (together with that line).
    """
    # shutil.copy treats a non-existing destination as a *file name*; make sure
    # ORIG_DIR is a real directory so documents never end up clobbering one file.
    os.makedirs(ORIG_DIR, exist_ok=True)
    count = 0
    for root, _, files in os.walk(location):
        for file in files:
            if count >= n:
                return
            target = os.path.join(ORIG_DIR, file)
            if not os.path.exists(target):
                shutil.copy(os.path.join(root, file), target)
                if wiki:
                    # Explicit UTF-8 so the rewrite does not depend on the platform's locale encoding.
                    with open(target, 'r+', encoding='utf-8') as orig:
                        data = orig.read()
                        # Rewrite in place: rewind, write the stripped text, cut off the leftover tail.
                        orig.seek(0)
                        orig.write(re.sub(r'^(?:(<doc)|(\n<\/doc)).*\n', '', data, flags=re.MULTILINE))
                        orig.truncate()
            count += 1

def clean_data():
    """
    Clean up docs, place them in input directory.

    Every file found (recursively) under ORIG_DIR is written to INPUT_DIR under
    its base name, with each line that starts with '=' removed together with the
    newline(s) that follow it. INPUT_DIR is created if it does not exist yet.
    """
    # Opening a file for writing inside a missing directory raises FileNotFoundError.
    os.makedirs(INPUT_DIR, exist_ok=True)
    for root, _, files in os.walk(ORIG_DIR):
        for file in files:
            # Explicit UTF-8 on both sides so the result does not depend on the platform's locale encoding.
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                doc = f.read()
            with open(os.path.join(INPUT_DIR, file), 'w', encoding='utf-8') as f2:
                f2.write(re.sub(r'^=+.*\n+', '', doc, flags=re.MULTILINE))

def reset_data_folder():
    """
    Reset the input_orig, input and output directories.

    Every entry (file, symlink or sub-directory) inside the three folders is
    deleted; the folders themselves are kept. A folder that does not exist yet
    is created empty. Entries that cannot be deleted are reported on stdout and
    skipped.
    """
    for folder in [ORIG_DIR, INPUT_DIR, OUTPUT_DIR]:
        # A missing folder would make os.listdir raise and abort the reset partway;
        # creating it leaves the same end state as emptying an existing one.
        os.makedirs(folder, exist_ok=True)
        for file in os.listdir(folder):
            path = os.path.join(folder, file)
            try:
                if os.path.isfile(path) or os.path.islink(path):
                    os.unlink(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)
            except OSError as e:
                print(f'Failed to delete {path}. Reason: {e}')

#### ENWiki tests

In [None]:
# Empty the working folders, copy (at most) 5 files from ../ENWiki/data into ORIG_DIR,
# then write versions without '='-prefixed lines into INPUT_DIR.
reset_data_folder()
copy_data('../ENWiki/data', n=5)
clean_data()

In [None]:
# Call the jar (and 🙏)
# The two trailing arguments are presumably GraphSeg's relatedness threshold and
# minimal segment size — TODO confirm against the GraphSeg README.
output = subprocess.run(['java', '-jar', SEG_EN, INPUT_DIR, OUTPUT_DIR, '0.25', '2'], capture_output=True, text=True)
print(output.stdout)
if output.returncode != 0:
    # stderr was captured as well; show it before failing so the cause is visible in the notebook
    # instead of the cell silently "succeeding" (same failure behaviour as the NLWiki cell).
    print(output.stderr)
    output.check_returncode()
# TODO: Possibly change graphseg_en.jar to include the new quiet option so live-printing works nicer
# with subprocess.Popen(['java', '-jar', SEG_EN, INPUT_DIR, OUTPUT_DIR, '0.25', '2'], stdout=subprocess.PIPE, universal_newlines=True) as popen:
#     for line in popen.stdout:
#         print(line)
# if popen.returncode != 0:
#     raise subprocess.CalledProcessError(popen.returncode, popen.args)

#### NLWiki tests
Dutch alternatives for the English resources used by GraphSeg:
* https://github.com/dcferreira/multilingual-joint-embeddings for word embeddings (it is based on what's used by the English version)
* http://crr.ugent.be/programs-data/subtitle-frequencies/subtlex-nl/downloading for word frequencies
* `nltk.corpus.stopwords.words('dutch')` for stopwords

To compile the .jar:
```
cd text_segmentation/GraphSeg/source/
mvn package
```
The resulting .jar file can be found in the `GraphSeg/binary` folder.

##### Creating replacements for the English resources

In [7]:
# Extract correct word embeddings
# Only archive members whose name ends in '.nl' are unpacked into the GraphSeg resource folder.
# The context manager closes the archive again once extraction is done.
with tarfile.open('../GraphSeg/backups/multilingual_embeddings.tar.gz') as tar:
    names = tar.getnames()
    nl_file = [tar.getmember(name=n) for n in names if n.endswith('.nl')]
    # NOTE(review): extractall trusts the member paths stored in the archive; only use it on a trusted download.
    tar.extractall(path='../GraphSeg/source/res', members=nl_file)

In [5]:
# Transform word frequency csv into correct .txt format
df = pd.read_csv('../GraphSeg/backups/SUBTLEX-NL.cd-above2.txt', sep='\t', header=0)
# Only the first two columns are exported (presumably word and frequency — TODO confirm against the SUBTLEX-NL layout).
# NOTE(review): read_csv's default NA parsing turns literal entries such as 'null' or 'nan' into NaN — confirm the word list has none.
out_str = df.iloc[:, :2].to_string(header=False, index=False, justify='left')
spaces_replaced = re.sub(r'[ \t]+', ' ', out_str, flags=re.MULTILINE)   # Replace multi-spaces/tabs with single space
# read_csv decoded the input as UTF-8 (its default), so write UTF-8 back rather than the
# platform's locale encoding; otherwise accented words can change bytes or fail to encode.
with open('../GraphSeg/source/res/freqs.txt', 'w', encoding='utf-8') as out:
    out.write(re.sub(r'^[ ]', '', spaces_replaced, flags=re.MULTILINE))     # Remove space at start

In [45]:
# Dump NLTK's Dutch stopword list into the GraphSeg resource folder, one word per line
nl_sw = stopwords.words('dutch')
with open('../GraphSeg/source/res/stopwords.txt', 'w') as out:
    # sep/end reproduce a plain newline-join: a newline between words, none after the last one
    print(*nl_sw, sep='\n', end='', file=out)

##### Testing

In [20]:
# Empty the working folders, copy files from ../NLWiki/data (n keeps its default of 5) and,
# because wiki=True, strip the <doc ...> / </doc> wrapper lines from the copies;
# then write versions without '='-prefixed lines into INPUT_DIR.
reset_data_folder()
copy_data('../NLWiki/data', wiki=True)
clean_data()

In [None]:
# Call the jar (and 🙏)
# stdout is streamed line by line so progress shows up while the jar is still running.
with subprocess.Popen(['java', '-jar', SEG_NL, INPUT_DIR, OUTPUT_DIR, '0.25', '2', '-q'], stdout=subprocess.PIPE, universal_newlines=True) as popen:
    for line in popen.stdout:
        # Each line still carries its own newline; end='' avoids printing a blank line after every line.
        print(line, end='')
# Leaving the with-block waits for the process, so returncode is set at this point.
if popen.returncode != 0:
    raise subprocess.CalledProcessError(popen.returncode, popen.args)