In [1]:
import os
import json
import re
from datetime import timezone, datetime

In [31]:
# Create a file list
def list_files(dir, corp_dirs):
    """Recursively collect the files under ``dir`` and tag each with its corpus.

    Args:
        dir: Root directory to walk. The parameter name shadows the built-in
            ``dir`` inside this function only; it is kept so that existing
            keyword calls continue to work.
        corp_dirs: Iterable of corpus names. A file is assigned to the first
            name (in iteration order) that occurs as a substring of its path.

    Returns:
        A list of ``(file_path, corpus)`` tuples. Directories and file names
        are visited in sorted order so the result -- and any numbering derived
        from it -- is reproducible across runs and file systems. Files whose
        path contains none of the corpus names (for example a stray README in
        the root directory) are skipped instead of raising ``IndexError``.
    """
    tagged_files = []
    for root, dirs, files in os.walk(dir):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for name in sorted(files):
            file_path = os.path.join(root, name)
            corpus = next((corp for corp in corp_dirs if corp in file_path), None)
            if corpus is None:
                # Not part of any known corpus; nothing to tag it with.
                continue
            tagged_files.append((file_path, corpus))
    return tagged_files

def cleaner(text):
    """Collapse runs of repeated whitespace characters in ``text``.

    Consecutive spaces become one space, consecutive newlines become one
    newline, and consecutive tabs become one tab. Each character class is
    handled separately and in that order, so mixed runs such as a space
    followed by a newline are left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    collapse_rules = (
        (' +', ' '),
        ('\n+', '\n'),
        ('\t+', '\t'),
    )
    for pattern, single in collapse_rules:
        text = re.sub(pattern, single, text)
    return text

In [43]:
# Registry of the corpus sources, keyed by corpus name.
# Each entry holds:
#   'info'     - human-readable description of the source
#   'encoding' - name of the text encoding recorded for that corpus
#   'counter'  - running count of files, starting at 0
corpora = {
    '2b_articles': {
        'info': "The To Be Healthy Corpus contains articles from the medical website To Be Healthy (L'Hiyot Bari, 2b-bari)",
        'encoding': 'utf-8',
        'counter': 0
    },
    '2b_forum': {
        'info': "The To Be Healthy Corpus contains forum discussions from the medical website To Be Healthy (L'Hiyot Bari, 2b-bari)",
        'encoding': 'utf-8',
        'counter': 0
    },
    'a7_articles': {
        'info': "News and articles from the Arutz 7 news website, 2001-2006.",
        'encoding': 'cp1255',
        'counter': 0
    },
    'doctors': {
        'info': "Articles from the Doctors medical website.",
        'encoding': 'utf-8',
        'counter': 0
    },
    'haaretz_txt': {
        'info': "News and articles from the HaAretz news website, 1990-1991.",
        'encoding': 'utf-8',
        'counter': 0
    },
    'infomed_06': {
        'info': "Question and answer discussions from the Infomed website's medical forum, from 2006",
        'encoding': 'utf-8',
        'counter': 0
    },
    'infomed_07': {
        'info': "Question and answer discussions from the Infomed website's medical forum, from 2007",
        'encoding': 'utf-8',
        'counter': 0
    },
    'knesset_16': {
        'info': "Session protocols of the Knesset (Israeli Parliament) during January 2004 - November 2005.",
        'encoding': 'cp1255',
        'counter': 0
    },
    'knesset_17': {
        # Leading space/tab that had slipped into this description was removed.
        # NOTE(review): the text repeats knesset_16's date range verbatim --
        # confirm the correct period for this corpus.
        'info': "Session protocols of the Knesset (Israeli Parliament) during January 2004 - November 2005.",
        'encoding': 'cp1255',
        'counter': 0
    },
    'tapuz_text1': {
        'info': "Forum discussions from the Tapuz People website, on a variety of subjects.",
        'encoding': 'utf-8',
        'counter': 0
    },
    'tapuz_text2': {
        'info': "Forum discussions from the Tapuz People website, on a variety of subjects.",
        'encoding': 'utf-8',
        'counter': 0
    },
    'themarker': {
        'info': "Articles from the TheMarker financial newspaper, May - October 2002.",
        'encoding': 'cp1255',
        'counter': 0
    },
    'wallasport': {
        'info': "Articles from Walla Sport website, 2014-2015.",
        'encoding': 'utf-8',
        'counter': 0
    }
}

In [44]:
# Parse every corpus file into a record that holds the cleaned text together
# with its metadata. Records are stored in `extracted_corpus`, keyed by
# '<corpus>_<running number>'.
data_dir = 'Data'  # named `data_dir` so the built-in `dir` is not shadowed
file_list = list_files(data_dir, corpora)
extracted_corpus = {}

# Restart the per-corpus numbering on every execution. Without this, running
# the cell a second time would continue from the previous counts, producing
# shifted ids and inflated totals in `corpora`.
for corpus_meta in corpora.values():
    corpus_meta['counter'] = 0

for file_path, corp in file_list:
    try:
        with open(file_path, "r", encoding='utf-8') as f:
            raw_text = f.read()
    except UnicodeDecodeError:
        # The file is not valid UTF-8: retry with the encoding recorded for
        # its corpus instead of aborting the whole run.
        with open(file_path, "r", encoding=corpora[corp]['encoding']) as f:
            raw_text = f.read()
    text = cleaner(raw_text)

    # Last-modification time of the file, expressed in UTC.
    creation_ts = datetime.fromtimestamp(os.path.getmtime(file_path)).astimezone(timezone.utc).isoformat()
    # Current local time with UTC offset, truncated to whole seconds.
    collection_ts = datetime.now().astimezone().replace(microsecond=0).isoformat()

    corpora[corp]['counter'] += 1
    # `doc_id` rather than `id`, to keep the built-in `id` usable afterwards.
    doc_id = f'{corp}_{corpora[corp]["counter"]}'

    # Language fields are the same constants for every record.
    lang = 'heb'
    possible_langs = ['eng']
    extracted_corpus[doc_id] = {
        'lang': lang,
        'possible_langs': possible_langs,
        'creation_ts': creation_ts,
        'collection_ts': collection_ts,
        'corpus': corp,
        'text': text
    }

In [45]:
# Write both dictionaries to UTF-8 encoded .json files: first the extracted
# records, then the corpus registry.
for target_name, payload in (
    ("MILA_corpus.json", extracted_corpus),
    ("MILA_resources.json", corpora),
):
    with open(target_name, "w", encoding='utf-8') as fh:
        # ensure_ascii=False writes non-ASCII characters as-is rather than
        # as \uXXXX escapes.
        json.dump(payload, fh, ensure_ascii=False, indent=3)