# Data preparation



The data in [NLP-abstracts] consist of approximately 2,000 abstracts of PhD theses and journal articles in the NLP domain, written in French and paired with their English translation (or vice versa). These texts have been downloaded from public sources, manually curated, and aligned at the sentence level. They are redistributed under the terms of the [CC-BY Licence](https://creativecommons.org/licenses/by/4.0/).

This data has been used in the following paper, published at the 2024 edition of the TALN conference: 

>Ziqian Peng, Rachel Bawden, and François Yvon. 2024. À propos des difficultés de traduire automatiquement de longs documents. In Actes de la 31ème Conférence sur le Traitement Automatique des Langues Naturelles, volume 1 : articles longs et prises de position, pages 2–21, Toulouse, France. ATALA and AFPC.



In [1]:
# from xml.etree import ElementTree as ET
import os
import re
import sys
import unicodedata
import warnings
import xml.etree.ElementTree as ET
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

print(ET.VERSION)

1.3.0


In [2]:
# Input roots: where the TMX files of each corpus live.
# Change these paths to match the corresponding folders on your machine.
tmx_path_dict = dict(
    THE='TAL/corpora/tmx/theses.fr',
    rTAL='TAL/corpora/tmx/rTAL_abstract',
    ISTEX='TAL/corpora/tmx/abstract_m_trankit',
)

# Output roots: where the extracted text files of each corpus are written.
store_path_dict = dict(
    THE='TAL/corpora/NLP_abstracts/THE_abstracts',
    rTAL='TAL/corpora/NLP_abstracts/rTAL_abstracts',
    ISTEX='TAL/corpora/NLP_abstracts/ISTEX_abstracts',
)


In [3]:
def tmx2txt_file(fpath):
    """Extract the aligned English/French segments from one TMX file.

    The ``<header>`` is searched for a ``<docid>`` element, and every
    ``<seg>`` found under ``<body>`` is stored under the ``xml:lang`` value
    of the ``<tuv>`` that precedes it in document order.

    Args:
        fpath: Path of the TMX file to read.

    Returns:
        A ``(doc_id, sents_dict)`` tuple. ``doc_id`` is the stripped text of
        the last non-empty ``<docid>`` in the header, or the placeholder
        ``'docid'`` when there is none. ``sents_dict`` maps ``'EN'`` and
        ``'FR'`` to two equally long lists of segment strings; an empty
        ``<seg>`` contributes ``''``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML
            (an empty file, for instance).
        OSError: If the file cannot be opened.
        KeyError: If a ``<tuv>`` has no ``xml:lang`` attribute, or if a
            segment's language is neither ``'EN'`` nor ``'FR'``.
        ValueError: If the two languages do not have the same number of
            segments.
    """
    try:
        tree = ET.parse(fpath, ET.XMLParser(encoding='utf-8'))
    except (ET.ParseError, OSError) as err:
        # Without a tree there is nothing to extract: name the offending
        # file, then let the original error propagate instead of failing
        # later on an undefined variable.
        warnings.warn(
            f"Cannot parse file {fpath} as tree, potentially no text: {err}"
        )
        raise

    root = tree.getroot()
    sents_dict = {'EN': [], 'FR': []}

    doc_id = 'docid'
    for header in root.iter('header'):
        for el in header.iter('docid'):
            # An empty <docid/> has no text; keep the previous value then.
            if el.text is not None:
                doc_id = el.text.strip()

    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
    lang = ''
    # TODO: check for empty alignments.
    for body in root.iter('body'):
        for el in body.iter():
            if el.tag == 'tu':
                # Reset the language for each new pair of sentences.
                lang = ''
            elif el.tag == 'tuv':
                lang = el.attrib[xml_lang]
            elif el.tag == 'seg':
                # NOTE: no Unicode normalization is applied to the text here.
                sents_dict[lang].append(el.text if el.text is not None else '')

    n_en, n_fr = len(sents_dict['EN']), len(sents_dict['FR'])
    if n_en != n_fr:
        raise ValueError(
            f"Unbalanced alignment in {fpath}: {n_en} EN vs {n_fr} FR segments"
        )
    return doc_id, sents_dict



# process functions adapted from the initial version made by Maxime Bouthor
# to process the documents in THE and rTAL

# Single-character replacements applied before the spacing rules.
# NOTE(review): in the reviewed text the last three source patterns were
# empty strings. An empty regex matches at every position, so they would
# interleave '*' and ',' through the whole sentence. The original literals
# were presumably non-printable characters lost in export; the code points
# below (cp1252-style bullet, Symbol-font bullet, low quote) are a best
# guess - confirm against the original notebook.
_PROCESS_CHAR_MAP = str.maketrans({
    "’": "'",
    "‘": "'",
    "\x95": "*",
    "\uf0b7": "*",
    "\x82": ",",
})

# Ordered (pattern, replacement) rules. The order matters: the final rule
# collapses the runs of spaces left behind by the earlier ones.
# The hyphen rule (' *-' -> '-') of the initial version stays disabled.
_PROCESS_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r' +\.', '.'),
        (r' *,', ','),
        (r' *\?', '?'),
        (r' *!', '!'),
        (r' *:', ':'),
        (r' *\)', ')'),
        (r'\( *', '('),
        (r' *\]', ']'),
        (r'\[ *', '['),
        (r' @@', ''),
        (r' *»', '»'),
        (r'« *', '«'),
        (r' +', ' '),
    )
)


def process(sentence):
    """Normalize quotes and punctuation spacing in a sentence.

    Curly single quotes become ``'``; spaces are removed before
    ``. , ? ! : ) ] »`` and after ``( [ «``; every `` @@`` marker is
    deleted; runs of spaces are collapsed to one.

    Args:
        sentence: A string, or a list of tokens that is first joined with
            single spaces.

    Returns:
        The normalized string, stripped of leading and trailing whitespace.
    """
    if isinstance(sentence, list):
        # Token-list form: rebuild the sentence first.
        sentence = ' '.join(sentence)

    sentence = sentence.translate(_PROCESS_CHAR_MAP)
    for pattern, replacement in _PROCESS_RULES:
        sentence = pattern.sub(replacement, sentence)

    return sentence.strip()

def _process_to_french(sentence):
    sentence = re.sub('\?', ' ?', sentence)
    sentence = re.sub('!', ' !', sentence)
    sentence = re.sub(' *\? +!', '?!', sentence)
    sentence = re.sub(' *: *', ' : ', sentence)
    sentence = re.sub(' *; +', ' ; ', sentence)
    sentence = re.sub(' *» *', ' » ', sentence)
    sentence = re.sub(' *« *', ' « ', sentence)
    sentence = re.sub('&lt ;', '&lt;', sentence)
    sentence = re.sub('&gt ;', '&gt;', sentence)
    sentence = re.sub('&amp ;', '&amp;', sentence)
    
    return sentence.strip()


def process_to_french(sentences):
    """Apply ``_process_to_french`` to every sentence and return a new list."""
    return [_process_to_french(sentence) for sentence in sentences]

In [4]:



def docid2dataset_sent(data_path, lst_path, store_path_prefix, task = 'train', tmx_fname = 'MaTOS_THE_aligned.tmx'):
    """Build a sentence-level parallel corpus from a list of documents.

    Each document id listed in ``lst_path`` is read from
    ``data_path/<docid>/<tmx_fname>``. The English sentences go through
    ``process``; the French ones through ``process`` and then
    ``process_to_french``. Three UTF-8 files are written:
    ``<store_path_prefix>.en``, ``<store_path_prefix>.fr`` and
    ``<store_path_prefix>.idx``.

    Args:
        data_path: Folder holding one sub-folder per document id.
        lst_path: Text file with one document id per line; blank lines are
            ignored.
        store_path_prefix: Output path without extension, such as
            ``.../.../train-these_sent``.
        task: Unused; kept so that existing calls keep working.
        tmx_fname: Name of the TMX file inside each document folder.
    """
    with open(lst_path, 'r', encoding='utf-8') as f:
        id_list = [line.strip() for line in f.read().split('\n')]
    # A blank line would otherwise be joined into a path with no document id.
    id_list = [docid for docid in id_list if docid]

    data_en = []
    data_fr = []
    data_idx = []
    for order, docid in enumerate(id_list):
        # Read the parallel sentences of this document.
        _, sents_dict = tmx2txt_file(os.path.join(data_path, docid, tmx_fname))

        pairs = zip(sents_dict['EN'], sents_dict['FR'])
        for sent_no, (sent_en, sent_fr) in enumerate(pairs, start=1):
            data_en.append(sent_en)
            data_fr.append(sent_fr)
            # Index format: 0-based document position, 1-based sentence number.
            data_idx.append(f'{order}.{sent_no}')

    data_en = [process(sent) for sent in data_en]
    data_fr = process_to_french([process(sent) for sent in data_fr])

    # NOTE: the joined .en/.fr text is stripped as a whole, so empty
    # sentences at the very start or end of the corpus leave no blank line
    # there, whereas the .idx file still lists them.
    with open(f'{store_path_prefix}.en', 'w', encoding='utf-8') as f:
        f.write('\n'.join(data_en).strip())

    with open(f'{store_path_prefix}.fr', 'w', encoding='utf-8') as f:
        f.write('\n'.join(data_fr).strip())

    with open(f'{store_path_prefix}.idx', 'w', encoding='utf-8') as f:
        f.write('\n'.join(data_idx))

## Sentence-level corpus

In [5]:
# nb_dev = 101
# nb_test=100
# store_path = f'TAL/corpora/NLP_abstracts/these_data-1500-{nb_dev}-{nb_test}'

# Sentence-level export of the THE corpus, one set of files per split.
key = 'THE'
data_path = tmx_path_dict[key]

store_path = store_path_dict[key]


# Output prefix of each split, relative to store_path and without extension.
fpath_dict = {
    'train' : 'txt_train/sents/THE_sent_train',
    'dev' : 'txt_dev/sents/dev',
    'test': 'txt_test/sents/THE_sent'
}


for task in ['train', 'dev', 'test']:
    print(task)
    store_dir = os.path.join(store_path, os.path.dirname(fpath_dict[task]))
    print(store_dir)
    Path(store_dir).mkdir(parents=True, exist_ok=True)

    store_path_prefix = os.path.join(store_path, fpath_dict[task])
    # Each split reads its own document list; reading 'train.lst' for every
    # split would make the dev and test files copies of the training data.
    docid2dataset_sent(
        data_path,
        os.path.join(data_path, f'{task}.lst'),
        store_path_prefix,
        task=task,
    )



train
TAL/corpora/NLP_abstracts/THE_abstracts/txt_train/sents
dev
TAL/corpora/NLP_abstracts/THE_abstracts/txt_dev/sents
test
TAL/corpora/NLP_abstracts/THE_abstracts/txt_test/sents


In [6]:


# Sentence-level export of the rTAL corpus (test split only).
key = 'rTAL'

data_path = tmx_path_dict[key]
store_path = f"{store_path_dict[key]}/txt_test"

task = 'test'
print(task)

# Sentence-level files go into a 'sents' sub-folder of the split.
store_dir = os.path.join(store_path, 'sents')
print(store_dir)
Path(store_dir).mkdir(parents=True, exist_ok=True)

store_path_prefix = os.path.join(store_dir, f'{key}_sent')
docid2dataset_sent(
    data_path,
    os.path.join(data_path, 'rTAL_doc.lst'),
    store_path_prefix,
    task=task,
    tmx_fname=f'MaTOS_{key}_aligned.tmx',
)
    



test
TAL/corpora/NLP_abstracts/rTAL_abstracts/txt_test/sents


In [7]:



# Sentence-level export of the ISTEX corpus (train split only).
key = 'ISTEX'
data_path = tmx_path_dict[key]
store_path = f"{store_path_dict[key]}/txt_train"

Path(store_path).mkdir(parents=True, exist_ok=True)

task = 'train'

# Sentence-level files go into a 'sents' sub-folder of the split.
store_dir = os.path.join(store_path, 'sents')
print(store_dir)
Path(store_dir).mkdir(parents=True, exist_ok=True)

store_path_prefix = os.path.join(store_dir, f'{key}_sent_train')
docid2dataset_sent(
    data_path,
    os.path.join(data_path, 'train.lst'),
    store_path_prefix,
    task=task,
    tmx_fname=f'MaTOS_{key}_aligned.tmx',
)



TAL/corpora/NLP_abstracts/ISTEX_abstracts/txt_train/sents


## Document-level corpus

In [8]:
def docid2dataset_doc(data_path, lst_path, src_store_path, tgt_store_path, tmx_fname = 'MaTOS_THE_aligned.tmx', sep_tag = '</eos>'):
    """Build a document-level parallel corpus, one document per line.

    Each document id listed in ``lst_path`` is read from
    ``data_path/<docid>/<tmx_fname>``; its sentences are joined with
    ``sep_tag`` followed by a space. The English documents go through
    ``process``; the French ones through ``process`` and then
    ``process_to_french``. Both files are written as UTF-8.

    Args:
        data_path: Folder holding one sub-folder per document id.
        lst_path: Text file with one document id per line; blank lines are
            ignored.
        src_store_path: Output file for the English documents.
        tgt_store_path: Output file for the French documents.
        tmx_fname: Name of the TMX file inside each document folder.
        sep_tag: Marker put between consecutive sentences; with ``''`` the
            sentences are joined by a single space.
    """
    with open(lst_path, 'r', encoding='utf-8') as f:
        # Ids are stripped as in the sentence-level export, so stray
        # whitespace or '\r' never ends up in a path.
        id_list = [line.strip() for line in f.read().split('\n')]
    id_list = [docid for docid in id_list if docid]

    joiner = f'{sep_tag} '
    data_en = []
    data_fr = []
    for docid in id_list:
        # Read the parallel sentences of this document pair.
        _, sents_dict = tmx2txt_file(os.path.join(data_path, docid, tmx_fname))

        data_en.append(joiner.join(sents_dict['EN']))
        data_fr.append(joiner.join(sents_dict['FR']))

    data_en = [process(doc) for doc in data_en]
    data_fr = process_to_french([process(doc) for doc in data_fr])

    with open(src_store_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(data_en))

    with open(tgt_store_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(data_fr))

In [9]:



# Document-level export of the THE corpus, sentences separated by '<sep>'.
key = 'THE'
data_path = tmx_path_dict[key]
store_path = store_path_dict[key]

# Output prefix of each split, relative to store_path and without extension.
fpath_dict = {
    'train' : 'txt_train/doc_with_sep/THE_doc_sep_train',
    'dev' : 'txt_dev/doc_with_sep/dev',
    'test': 'txt_test/doc_with_sep/THE_doc_sep'
}

for task in ('train', 'dev', 'test'):
    print(task)
    store_dir = os.path.join(store_path, os.path.dirname(fpath_dict[task]))
    print(store_dir)
    Path(store_dir).mkdir(parents=True, exist_ok=True)

    # English is the source side, French the target side.
    src_store_path, tgt_store_path = (
        f'{store_path}/{fpath_dict[task]}.{ext}' for ext in ('en', 'fr')
    )
    docid2dataset_doc(
        data_path,
        os.path.join(data_path, f'{task}.lst'),
        src_store_path,
        tgt_store_path,
        tmx_fname=f'MaTOS_{key}_aligned.tmx',
        sep_tag='<sep>',
    )


train
TAL/corpora/NLP_abstracts/THE_abstracts/txt_train/doc_with_sep
dev
TAL/corpora/NLP_abstracts/THE_abstracts/txt_dev/doc_with_sep
test
TAL/corpora/NLP_abstracts/THE_abstracts/txt_test/doc_with_sep


In [10]:


# Document-level export of the THE corpus, without a sentence separator.
key = 'THE'
data_path = tmx_path_dict[key]
store_path = store_path_dict[key]

# Output prefix of each split, relative to store_path and without extension.
fpath_dict = {
    'train' : 'txt_train/THE_doc_train',
    'dev' : 'txt_dev/dev',
    'test': 'txt_test/THE_doc'
}

for task in ('train', 'dev', 'test'):
    print(task)
    store_dir = os.path.join(store_path, os.path.dirname(fpath_dict[task]))
    print(store_dir)
    Path(store_dir).mkdir(parents=True, exist_ok=True)

    # English is the source side, French the target side.
    src_store_path, tgt_store_path = (
        f'{store_path}/{fpath_dict[task]}.{ext}' for ext in ('en', 'fr')
    )
    # An empty sep_tag joins the sentences of a document with single spaces.
    docid2dataset_doc(
        data_path,
        os.path.join(data_path, f'{task}.lst'),
        src_store_path,
        tgt_store_path,
        tmx_fname=f'MaTOS_{key}_aligned.tmx',
        sep_tag='',
    )

train
TAL/corpora/NLP_abstracts/THE_abstracts/txt_train
dev
TAL/corpora/NLP_abstracts/THE_abstracts/txt_dev
test
TAL/corpora/NLP_abstracts/THE_abstracts/txt_test


In [11]:



# Document-level export of the rTAL corpus (test split only), written once
# with '<sep>' between sentences and once without any separator.
key = 'rTAL'

data_path = tmx_path_dict[key]
store_path = f"{store_path_dict[key]}/txt_test"

task = 'test'

store_dir = os.path.join(store_path, 'doc_with_sep')
print(store_dir)
Path(store_dir).mkdir(parents=True, exist_ok=True)

# (output prefix, separator): the separated variant is written first.
for out_prefix, doc_sep in (
    (f'{store_dir}/{key}_doc_sep', '<sep>'),
    (f'{store_path}/{key}_doc', ''),
):
    src_store_path = f'{out_prefix}.en'
    tgt_store_path = f'{out_prefix}.fr'
    docid2dataset_doc(
        data_path,
        os.path.join(data_path, 'rTAL_doc.lst'),
        src_store_path,
        tgt_store_path,
        tmx_fname=f'MaTOS_{key}_aligned.tmx',
        sep_tag=doc_sep,
    )

TAL/corpora/NLP_abstracts/rTAL_abstracts/txt_test/doc_with_sep


In [12]:
# ISTEX collected by Mathilde
# Document-level export (train split only), written once with '<sep>'
# between sentences and once without any separator.
key = 'ISTEX'

data_path = tmx_path_dict[key]
store_path = f"{store_path_dict[key]}/txt_train"

task = 'train'

store_dir = os.path.join(store_path, 'doc_with_sep')
print(store_dir)
Path(store_dir).mkdir(parents=True, exist_ok=True)

# (output prefix, separator): the separated variant is written first.
for out_prefix, doc_sep in (
    (f'{store_dir}/{key}_doc_sep_{task}', '<sep>'),
    (f'{store_path}/{key}_doc_{task}', ''),
):
    src_store_path = f'{out_prefix}.en'
    tgt_store_path = f'{out_prefix}.fr'
    docid2dataset_doc(
        data_path,
        os.path.join(data_path, 'train.lst'),
        src_store_path,
        tgt_store_path,
        tmx_fname=f'MaTOS_{key}_aligned.tmx',
        sep_tag=doc_sep,
    )

TAL/corpora/NLP_abstracts/ISTEX_abstracts/txt_train/doc_with_sep


## TAL-D and TAL-S

In [15]:
import os
import re

# Merge the THE and ISTEX training files into the TAL-D (document-level)
# and TAL-S (sentence-level) training sets.
DATA_DIR = 'TAL/corpora/NLP_abstracts'
THE_path = f'{DATA_DIR}/THE_abstracts/txt_train'
ISTEX_path = f'{DATA_DIR}/ISTEX_abstracts/txt_train'

# Input prefixes, without the language extension.
train_paths_doc = [
    f'{THE_path}/THE_doc_train',
    f'{ISTEX_path}/ISTEX_doc_train',
]

train_paths_sent = [
    f'{THE_path}/sents/THE_sent_train',
    f'{ISTEX_path}/sents/ISTEX_sent_train',
]

FTdata_path = f'{DATA_DIR}/NLP_abstracts_txt_all/txt_train'
Path( f"{FTdata_path}/TAL-D" ).mkdir(parents=True, exist_ok=True)
Path( f"{FTdata_path}/TAL-S" ).mkdir(parents=True, exist_ok=True)


for lang in ['en', 'fr']:
    # Read every input as UTF-8 through a context manager so that no file
    # handle is left open.
    to_write = []
    for fpath in train_paths_doc:
        with open(f"{fpath}.{lang}", encoding='utf-8') as f:
            to_write.append(f.read().strip())

    with open(os.path.join(FTdata_path, 'TAL-D', f'train.{lang}'), 'w', encoding='utf-8') as f:
        # Any '<sep>' marker still present in the inputs is removed.
        f.write(re.sub('<sep>', '', '\n'.join(to_write)).strip())

    to_write = []
    for fpath in train_paths_sent:
        with open(f"{fpath}.{lang}", encoding='utf-8') as f:
            to_write.append(f.read().strip())

    with open(os.path.join(FTdata_path, 'TAL-S', f'train.{lang}'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(to_write).strip())