## Data preparation pipeline
- Load file to prepare
- Clean the file
- Create an index
- Concatenate



## Functions

In [1]:
from bs4 import BeautifulSoup
import os
import unicodedata
import json
from pprint import pprint as pp
import re

In [34]:
def browse_and_show_unique_chars(filename):
    '''
    This function reads a simple structured file and returns a set of unique characters in the file.
    The aim is to quickly indentify characters that should be removed from the text.
    '''

    with open(filename, "r") as f:
        all_books = json.load(f)
    
    character_set = set()

    for book in all_books:
        for chapter in book["text"]:
            for verse in chapter:
                if isinstance(verse, str):
                    character_set.update(verse)
                elif isinstance(verse, list):
                    for sub_verse in verse:
                        character_set.update(sub_verse)

    return character_set 

In [35]:
filename = 'Jerusalem_Talmud_complete.json'

browse_and_show_unique_chars(filename)

{'\n',
 ' ',
 '"',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 'C',
 'G',
 'H',
 'J',
 'L',
 'N',
 'P',
 'T',
 'V',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'g',
 'h',
 'i',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'y',
 '{',
 '|',
 '}',
 '·',
 '̇',
 '̈',
 '͏',
 '֑',
 '֒',
 '֓',
 '֔',
 '֕',
 '֖',
 '֗',
 '֘',
 '֙',
 '֚',
 '֛',
 '֜',
 '֝',
 '֞',
 '֠',
 '֡',
 '֢',
 '֣',
 '֤',
 '֥',
 '֦',
 '֧',
 '֨',
 '֩',
 '֪',
 '֫',
 '֬',
 '֭',
 '֮',
 'ְ',
 'ֱ',
 'ֲ',
 'ֳ',
 'ִ',
 'ֵ',
 'ֶ',
 'ַ',
 'ָ',
 'ֹ',
 'ֻ',
 'ּ',
 'ֽ',
 '־',
 'ֿ',
 '׀',
 'ׁ',
 'ׂ',
 '׃',
 'ׄ',
 'ׇ',
 'א',
 'ב',
 'ג',
 'ד',
 'ה',
 'ו',
 'ז',
 'ח',
 'ט',
 'י',
 'ך',
 'כ',
 'ל',
 'ם',
 'מ',
 'ן',
 'נ',
 'ס',
 'ע',
 'ף',
 'פ',
 'ץ',
 'צ',
 'ק',
 'ר',
 'ש',
 'ת',
 'װ',
 '׳',
 '״',
 '\u2009',
 '\u200c',
 '\u200d',
 '\u200e',
 '\u200f',
 '‘',
 '…',
 '\u202c',
 'ﬞ',
 'תּ'}

In [22]:
def create_index(filename):
    '''
    Create a function to browse simple structures files and create index entries.
    With the index, we can then quickly identify the reference of a text chunk,
    knowing the start and end character position of the text chunk in the book.
    '''
    # Load the file
    with open(filename, "r") as f:
        all_books = json.load(f)

    # Create a list to store index entries
    index = {}
    # add the title of the book to the index
    index["title"] = all_books["title"]
    index['categories'] = all_books['categories']
    index['sectionNames'] = all_books['sectionNames']
    index['lines'] = []

    # Generate a unique identifier for each of the smallest text unit,
    # E.g. verses.
    def generate_uid(title, number):
        return f"{title.replace(' ', '_')}_{number}"

    # Counts the chars, and fills the index with start and end char position
    # Hebrew diacritics are removed from the count.
    def browse_text_and_count_verses_chars(text):
        def remove_hebrew_diacritics(text):
            normalized_text = unicodedata.normalize('NFKD', text)
            return ''.join(c for c in normalized_text if not unicodedata.combining(c))

        # Variable to track unique identification number
        uid = 1
        # Initialize character counter
        verse_start_char = 1
        # Intialize chapter counter
        chapter_number = 1

        for chapter in all_books['text']:
            # Initialize verse counter
            verse_number = 1

            for verse in chapter:
                if isinstance(verse, str):
                    verse = remove_hebrew_diacritics(verse)
                    verse_end_char = verse_start_char + len(verse) - 1

                    # Add verse to index with UID
                    index['lines'].append({
                        "uid": generate_uid(index['title'], uid),  # UID unique
                        "chapter_number": chapter_number,
                        "verse_number": verse_number,
                        "start_char": verse_start_char,
                        "end_char": verse_end_char,
                        "length": len(verse),
                        "text": verse
                    })

                    # Increment UID for next verse
                    uid += 1

                    # Increment verse number for next verse
                    verse_number += 1

                    # Update character counter for next verse
                    verse_start_char = verse_end_char + 1
                elif isinstance(verse, list):

                    # Initialize subverse counter
                    sub_verse_number = 1

                    for sub_verse in verse:
                        sub_verse = remove_hebrew_diacritics(sub_verse)
                        sub_verse_end_char = verse_start_char + len(sub_verse) - 1

                        # Add verse to index with UID
                        index['lines'].append({
                            "uid": generate_uid(index['title'], uid),  # UID unique
                            "chapter_number": chapter_number,
                            "verse_number": verse_number,
                            "sub_verse_number": sub_verse_number,
                            "start_char": verse_start_char,
                            "end_char": sub_verse_end_char,
                            "length": len(sub_verse),
                            "text": sub_verse
                        })

                        # Increment UID for next verse
                        uid += 1

                        # Increment verse number for next verse
                        sub_verse_number += 1

                        # Update character counter for next verse
                        verse_start_char = sub_verse_end_char + 1
                    verse_number += 1
            chapter_number += 1
    
    # Load the nodes
    text = all_books['text']
    browse_text_and_count_verses_chars(text)

    # Create a new directory named "indexes" if it doesn't exist
    index_dir = "indexes"
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)

    # Save the index to a file, in the "indexes" directory
    title = index["title"]
    index_filename = os.path.join(index_dir, "index_" + title.replace(' ', '_') + ".json")
    with open(index_filename, "w") as f:
        json.dump(index, f, indent=2, ensure_ascii=False)


In [23]:
create_index(filename)

TypeError: list indices must be integers or slices, not str

## Functions for the pipeline

In [41]:
def browse_and_clean(filename):

    '''
    Remove unwanted characters from the text (e.g. numbers, special unicodes, html tags, etc.)
    Handles both simple and complex structured files.
    '''

    def remove_angle_brackets(text):
        return re.sub(r'\<.*?\>', '', text)

    def remove_special_unicodes(text):
        chars_to_remove = {'\n', '\u2003', '\u200d', '\u200e', '\u202c','\u2009', '\xa0' ,'\xad'}
        for char in chars_to_remove:
            text = text.replace(char, '')
        return text

    def remove_numbers(text):
        return ''.join(char for char in text if not char.isdigit())
    
    def clean_html(text):
        if text.strip():  # Checks if text is not empty after removing spaces
            soup = BeautifulSoup(text, 'html.parser')
            
            # Specifically deletes <b> and </b> tags
            for bold_tag in soup.find_all(['b', 'strong']):
                bold_tag.unwrap()
            
            return soup.get_text()
        else:
            return text  # Returns text unchanged if empty

    # we try to catch and display the MarkupResemblesLocatorWarning warning
    # import warnings

    # def clean_html(text):
    #     with warnings.catch_warnings(record=True) as w:
    #         warnings.simplefilter("always")  # Capture tous les avertissements
    #         if text.strip():
    #             text_to_parse = text  # Stocker le texte avant de l'analyser
    #             soup = BeautifulSoup(text, 'html.parser')
    #             text_without_html = soup.get_text()
    #             # Parcourir tous les avertissements capturés
    #             for warning in w:
    #                 if "MarkupResemblesLocatorWarning" in str(warning.message):
    #                     print("MarkupResemblesLocatorWarning captured while parsing the following text:")
    #                     print(text_to_parse)
    #             return text
    #         else:
    #             return text
    
    def remove_braces(text):
        return re.sub(r'\{.*?\}', '', text)
 
    def clean_text(text):
        # text = remove_angle_brackets(text)
        text = clean_html(text)
        text = remove_special_unicodes(text)
        text = remove_numbers(text)
        text = remove_braces(text)

        return text

    with open(filename, "r") as f:
        all_books = json.load(f)

    for book in all_books:

        if isinstance(book['text'], list):

            # Browse and clean verses directly in the data structure
            for chapter in book["text"]:
                if isinstance(chapter, str):
                    chapter = clean_text(chapter)
                else:
                    for i, verse in enumerate(chapter):
                        if isinstance(verse, str):
                            chapter[i] = clean_text(verse)
                        elif isinstance(verse, list):
                            for j, sub_verse in enumerate(verse):
                                verse[j] = clean_text(sub_verse)    
 
        


    # Create a new directory named "cleaned" if it doesn't exist
    if not os.path.exists('cleaned'):
        os.makedirs('cleaned')

    # Define the cleaned filename with spaces replaced by underscores
    base_filename = os.path.basename(filename)
    cleaned_filename = os.path.join('cleaned', base_filename.replace(' ', '_').split('.')[0] + "_clean.json")
    
    
    # Save the cleaned file in the "cleaned" directory
    with open(cleaned_filename, "w") as f:
        json.dump(all_books, f, indent=2, ensure_ascii=False)

In [39]:
filename = 'Jerusalem_Talmud_complete.json'
browse_and_clean(filename)

In [40]:
filename = 'cleaned/Jerusalem_Talmud_complete_clean.json'
browse_and_show_unique_chars(filename)

{' ',
 '"',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 ':',
 '<',
 '>',
 '?',
 'G',
 '[',
 ']',
 '|',
 '·',
 '̇',
 '̈',
 '͏',
 '֑',
 '֒',
 '֓',
 '֔',
 '֕',
 '֖',
 '֗',
 '֘',
 '֙',
 '֚',
 '֛',
 '֜',
 '֝',
 '֞',
 '֠',
 '֡',
 '֢',
 '֣',
 '֤',
 '֥',
 '֦',
 '֧',
 '֨',
 '֩',
 '֪',
 '֫',
 '֬',
 '֭',
 '֮',
 'ְ',
 'ֱ',
 'ֲ',
 'ֳ',
 'ִ',
 'ֵ',
 'ֶ',
 'ַ',
 'ָ',
 'ֹ',
 'ֻ',
 'ּ',
 'ֽ',
 '־',
 'ֿ',
 '׀',
 'ׁ',
 'ׂ',
 '׃',
 'ׄ',
 'ׇ',
 'א',
 'ב',
 'ג',
 'ד',
 'ה',
 'ו',
 'ז',
 'ח',
 'ט',
 'י',
 'ך',
 'כ',
 'ל',
 'ם',
 'מ',
 'ן',
 'נ',
 'ס',
 'ע',
 'ף',
 'פ',
 'ץ',
 'צ',
 'ק',
 'ר',
 'ש',
 'ת',
 'װ',
 '׳',
 '״',
 '\u200c',
 '\u200f',
 '‘',
 '…',
 'ﬞ',
 'תּ'}

In [42]:
def concatenate_verses(filename):
    '''
    Concatenate the verses of a book into a single text file.
    Handles both simple and complex structured files.
    '''
    with open(filename, "r") as f:
        data = json.load(f)

    concatenated_text = ""

    def remove_hebrew_diacritics(text):
        normalized_text = unicodedata.normalize('NFKD', text)
        return ''.join(c for c in normalized_text if not unicodedata.combining(c))

    if isinstance(data['text'], list):
        # Browse and clean verses directly in the data structure
        for chapter in data["text"]:
            for verse in chapter:
                if isinstance(verse, str):
                    concatenated_text += verse
                elif isinstance(verse, list):
                    for sub_verse in verse:
                        if isinstance(sub_verse, str):
                            concatenated_text += sub_verse

    elif isinstance(data['text'], dict):

        def browse_concat_complex(nodes):
            text = ""
            for value in nodes.values():
                if isinstance(value, list):
                    for elem in value:
                        if isinstance(elem, str):
                            text += elem
                        elif isinstance(elem, list):
                            for sub_elem in elem:
                                if isinstance(sub_elem, str):
                                    text += sub_elem
                                elif isinstance(sub_elem, list):
                                    for sub_sub_elem in sub_elem:
                                        if isinstance(sub_sub_elem, str):
                                            text += sub_sub_elem
                                        elif isinstance(sub_sub_elem, list):
                                            print("Nested verses detected and not handled!")
                elif isinstance(value, dict):
                    text += browse_concat_complex(value)
            return text

        # Load the nodes
        concatenated_text = browse_concat_complex(data['text'])

    # Create a new directory named "concatenated" if it doesn't exist
    concatenated_dir = 'concatenated'
    if not os.path.exists(concatenated_dir):
        os.makedirs(concatenated_dir)

    # Save the concatenated text to a file, in the "concatenated" directory
    concatenated_filename = os.path.join(concatenated_dir, os.path.splitext(os.path.basename(filename))[0] + "_concatenated.txt")
    with open(concatenated_filename, "w") as f:
        f.write(remove_hebrew_diacritics(concatenated_text))


In [48]:
import os
import json
import unicodedata

def concatenate_verses(filename):
    '''
    Concatenate the verses of a book into a single text file.
    Handles both simple and complex structured files.
    '''
    with open(filename, "r", encoding='utf-8') as f:
        data = json.load(f)

    concatenated_text = ""

    def remove_hebrew_diacritics(text):
        normalized_text = unicodedata.normalize('NFKD', text)
        return ''.join(c for c in normalized_text if not unicodedata.combining(c))

    def extract_text_from_node(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key == 'text':
                    concatenated_texts.append(concatenate_from_text(value))
                else:
                    extract_text_from_node(value)
        elif isinstance(node, list):
            for item in node:
                extract_text_from_node(item)

    def concatenate_from_text(text_data):
        text = ""
        if isinstance(text_data, list):
            for element in text_data:
                if isinstance(element, str):
                    text += element
                elif isinstance(element, list) or isinstance(element, dict):
                    text += concatenate_from_text(element)
        elif isinstance(text_data, str):
            text += text_data
        return text

    concatenated_texts = []
    extract_text_from_node(data)

    # Concatenate all extracted texts into a single string
    concatenated_text = "".join(concatenated_texts)

    # Create a new directory named "concatenated" if it doesn't exist
    concatenated_dir = 'concatenated'
    if not os.path.exists(concatenated_dir):
        os.makedirs(concatenated_dir)

    # Save the concatenated text to a file, in the "concatenated" directory
    concatenated_filename = os.path.join(concatenated_dir, os.path.splitext(os.path.basename(filename))[0] + "_concatenated.txt")
    with open(concatenated_filename, "w", encoding='utf-8') as f:
        f.write(remove_hebrew_diacritics(concatenated_text))


In [49]:
concatenate_verses("Jerusalem_Talmud_complete.json")

## Pipeline

In [14]:
# For all the files in the directory, browse, clean and concatenate the verses
import os
path = "/"

# Variable to track whether an error has occurred
error_occurred = False

# Clean the texts in the directory
for filename in os.listdir(path):
    if filename.endswith(".json"):
        try:
            browse_and_clean(os.path.join(path, filename))
        except Exception as e:
            error_occurred = True
            print(f"An error occurred while processing file {filename}: {str(e)}")

# # create an index
# for filename in os.listdir("cleaned"):
#     if filename.endswith(".json"):
#         try:
#             create_index(os.path.join("cleaned", filename))
#         except Exception as e:
#             error_occurred = True
#             print(f"An error occurred while creating index for file {filename}: {str(e)}")

# # Concatenate the verses in the 'cleaned' directory
# for filename in os.listdir("cleaned"):
#     if filename.endswith(".json"):
#         try:
#             concatenate_verses(os.path.join("cleaned", filename))
#         except Exception as e:
#             error_occurred = True
#             print(f"An error occurred while concatenating verses for file {filename}: {str(e)}")

# Afficher un message indiquant que le processus s'est terminé sans erreur
if not error_occurred:
    print("The process was completed without error.")

The process was completed without error.


In [12]:
# import os

# path = "../../Corpuses_from_Sefaria/Liturgy"

# # Variable to track whether an error has occurred
# error_occurred = False

# # Clean the texts in the directory
# for filename in os.listdir(path):
#     if filename.endswith(".json"):
#         browse_and_clean(os.path.join(path, filename))

# # create an index
# for filename in os.listdir("cleaned"):
#     if filename.endswith(".json"):
#         create_index(os.path.join("cleaned", filename))

# # Concatenate the verses in the 'cleaned' directory
# for filename in os.listdir("cleaned"):
#     if filename.endswith(".json"):
#         concatenate_verses(os.path.join("cleaned", filename))

# # Afficher un message indiquant que le processus s'est terminé sans erreur
# if not error_occurred:
#     print("The process was completed without error.")
