# Introduction and Setup
Explains the notebook's purpose and imports its dependencies


In [1]:
# Sefaria Text Formatter
# This notebook helps you retrieve and format texts from Sefaria for easy copy-pasting into Google Docs

# requests

import requests
import json
import pandas as pd
import re
from IPython.display import display, HTML, Markdown
from word2number import w2n

# Render the introduction banner: a static HTML card that lists the
# notebook's features and tells the reader how to use the search form.
display(HTML("""
<div style="background-color: #f0f7fb; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="color: #0d47a1;">Sefaria Text Formatter</h1>
    <p>This tool helps you retrieve texts from Sefaria and format them for easy copy-pasting into Google Docs with the following features:</p>
    <ul>
        <li>Remove nikud (vowel marks) from Hebrew text</li>
        <li>Standardize terminology to preferred terms</li>
        <li>Format text in Calibri 11pt for clean copy-paste</li>
        <li>Organize text into sections with Hebrew and English</li>
    </ul>
    <p>To use: Run the cells below, then use the search form to find and format your text.</p>
</div>
"""))

# Helper Functions
Contains the core processing functions (terminology, nikud removal)


In [13]:
# ---------------------------
# HELPER FUNCTIONS
# ---------------------------

# Preferred terminology. Each key is a regular expression and each value is
# the term substituted for it; standardize_terminology applies the entries
# in this order, matching case-insensitively.
TERMINOLOGY_PREFERENCES = {
    # Sources, titles and attributions
    r"\bGemara\b": "Talmud",
    r"\bRabbi\b": "R'",
    r"\bThe Sages taught\b": "A baraita states",
    # Divine names and manifestations
    r"\bDivine Voice\b": "bat kol",
    r"\bDivine Presence\b": "Shekhina",
    r"\bdivine inspiration\b": "Holy Spirit",
    r"\bthe Lord\b": "YHWH",
    # Ritual and legal vocabulary
    r"\bleper\b": "metzora",
    r"\bleprosy\b": "tzara'at",
    r"\bphylacteries\b": "tefillin",
    # People
    r"\bgentile\b": "non-Jew",
    r"\bignorant\b": "am ha'aretz",
    r"\bignoram(us|i)\b": "am ha'aretz",
    r"\bmaidservant\b": "female slave",
    # Objects
    r"\bbarrel\b": "jug",
}



def remove_nikud(text):
    """Strip Hebrew vowel points and cantillation marks from ``text``.

    Falsy input (``None``, ``""``) is returned unchanged. Code points that
    are not listed in the character class below are left in place.
    """
    if text:
        nikud_marks = re.compile(r'[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]')
        return nikud_marks.sub('', text)
    return text



# ---------------------------------------------------------------------------
# Number-word tables used by standardize_terminology
# ---------------------------------------------------------------------------

# "one", "two" and "three" are left spelled out when they stand alone (the
# original table deliberately omitted them), but they still count inside a
# compound such as "twenty-two" or "two hundred".
_SMALL_CARDINALS = {'one': 1, 'two': 2, 'three': 3}

_CARDINAL_VALUES = {
    **_SMALL_CARDINALS,
    'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
    'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19,
    'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
    'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
}

_MULTIPLIER_VALUES = {
    'hundred': 100, 'thousand': 1000,
    'million': 10**6, 'billion': 10**9, 'trillion': 10**12,
}

# Ordinals carry the same numeric value as the matching cardinal.
_ORDINAL_VALUES = {
    'first': 1, 'second': 2, 'third': 3, 'fourth': 4, 'fifth': 5,
    'sixth': 6, 'seventh': 7, 'eighth': 8, 'ninth': 9, 'tenth': 10,
    'eleventh': 11, 'twelfth': 12, 'thirteenth': 13, 'fourteenth': 14,
    'fifteenth': 15, 'sixteenth': 16, 'seventeenth': 17, 'eighteenth': 18,
    'nineteenth': 19, 'twentieth': 20, 'thirtieth': 30, 'fortieth': 40,
    'fiftieth': 50, 'sixtieth': 60, 'seventieth': 70, 'eightieth': 80,
    'ninetieth': 90, 'hundredth': 100, 'thousandth': 1000,
    'millionth': 10**6, 'billionth': 10**9, 'trillionth': 10**12,
}

# The article "a" only reaches the parser in front of a multiplier
# ("a hundred"), where it means one.
_NUMBER_WORD_VALUES = {
    'a': 1, **_CARDINAL_VALUES, **_MULTIPLIER_VALUES, **_ORDINAL_VALUES,
}


def _word_alternation(words):
    """Return a regex alternation of ``words`` with the longest words first."""
    return '|'.join(sorted(words, key=len, reverse=True))


_CARDINAL_RE = _word_alternation({**_CARDINAL_VALUES, **_MULTIPLIER_VALUES})
_ORDINAL_RE = _word_alternation(_ORDINAL_VALUES)
_MULTIPLIER_RE = _word_alternation(_MULTIPLIER_VALUES)

# Python look-behinds must be fixed width, hence one look-behind per word.
_AFTER_MULTIPLIER = '(?:' + '|'.join(f'(?<={word})' for word in _MULTIPLIER_VALUES) + ')'

# Number words are joined by whitespace, by a hyphen ("twenty-five"), or by
# "and" directly after a multiplier ("one hundred and twenty").
_NUMBER_JOINER = rf'(?:\s+|-|{_AFTER_MULTIPLIER}\s+and\s+)'

# A number phrase is either a run of cardinals optionally closed by a single
# ordinal ("twenty-first"), or a lone ordinal. The guards on both ends keep
# the phrase from being cut out of a larger word or hyphenated compound
# ("first-born", "ten-fold"), which are therefore left untouched.
_NUMBER_PHRASE_RE = re.compile(
    rf'(?<![\w-])'
    rf'(?:'
    rf'(?:a\s+(?=(?:{_MULTIPLIER_RE})\b))?'
    rf'(?:{_CARDINAL_RE})(?:{_NUMBER_JOINER}(?:{_CARDINAL_RE}))*'
    rf'(?:{_NUMBER_JOINER}(?:{_ORDINAL_RE}))?'
    rf'|(?:{_ORDINAL_RE})'
    rf')'
    rf'(?![\w-])',
    flags=re.IGNORECASE,
)


def _ordinal_suffix(n):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for ``n``."""
    if 11 <= (n % 100) <= 13:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')


def _parse_number_phrase(phrase):
    """Convert a spelled-out number phrase to digits.

    The phrase is returned unchanged when it is a lone "one"/"two"/"three"
    or when its words do not form a single well-formed number (for example
    "five six" or "one third").
    """
    tokens = [tok for tok in re.split(r'[\s-]+', phrase.lower()) if tok and tok != 'and']
    if not tokens or (len(tokens) == 1 and tokens[0] in _SMALL_CARDINALS):
        return phrase

    total = 0    # completed thousand/million/... groups
    current = 0  # the group still being built (always below 1000)
    for tok in tokens:
        value = _NUMBER_WORD_VALUES.get(tok)
        if value is None:
            return phrase
        if value >= 1000:
            total += (current or 1) * value
            current = 0
        elif value == 100:
            if current >= 100:  # e.g. "hundred hundred"
                return phrase
            current = (current or 1) * 100
        else:
            tens_and_units = current % 100
            if value >= 10:
                # Tens and teens need an empty tens/units slot.
                fits = tens_and_units == 0
            else:
                # A unit may follow nothing or a round ten (20..90), never a teen.
                fits = tens_and_units % 10 == 0 and tens_and_units != 10
            if not fits:
                return phrase
            current += value

    number = total + current
    if tokens[-1] in _ORDINAL_VALUES:
        return f"{number}{_ordinal_suffix(number)}"
    return str(number)


def standardize_terminology(text, preferences=None):
    """Standardize terminology and convert spelled-out numbers to digits.

    Processing happens in three passes:

    1. Every ``pattern -> replacement`` pair in ``preferences`` is applied
       with ``re.sub`` (case-insensitively).
    2. Spelled-out cardinal and ordinal phrases become digits
       ("twenty-two" -> "22", "seventh" -> "7th"). A lone "one", "two" or
       "three" and anything hyphenated to a non-number word ("first-born")
       are left as written.
    3. A bare number in "the N day" or "N century" gets an ordinal suffix.

    Args:
        text: The English text to process. Falsy values are returned as-is.
        preferences: Optional mapping of regex pattern to replacement text.
            Defaults to the module-level ``TERMINOLOGY_PREFERENCES``.

    Returns:
        The processed text.
    """
    if not text:
        return text

    if preferences is None:
        preferences = TERMINOLOGY_PREFERENCES

    # 1) Preferred terminology.
    for pattern, replacement in preferences.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # 2) Spelled-out numbers.
    text = _NUMBER_PHRASE_RE.sub(lambda m: _parse_number_phrase(m.group(0)), text)

    # 3) Numeric ordinals in context. The trailing \b stops "the 7 days"
    #    from being rewritten as "the 7th days".
    def with_suffix(match):
        number = int(match.group(1))
        return f"{number}{_ordinal_suffix(number)}"

    text = re.sub(r'\bthe (\d+) day\b', lambda m: f"the {with_suffix(m)} day", text)
    text = re.sub(r'(\d+) century\b', lambda m: f"{with_suffix(m)} century", text)

    return text

def split_by_punctuation(text):
    """Break ``text`` into segments that each end at a '.', '?' or ':'.

    A mark only ends a segment when it is followed by whitespace or the end
    of the string. The mark stays attached to the segment before it, each
    segment is stripped, and a blank trailing remainder is dropped. Falsy
    input yields an empty list.
    """
    if not text:
        return []

    # The capturing group makes re.split return
    # [body, mark, body, mark, ..., tail]: bodies at even indexes,
    # marks at odd indexes, and always a final tail element.
    pieces = re.split(r'([.?:](?:\s|$))', text)
    bodies, marks = pieces[0::2], pieces[1::2]

    segments = [(body + mark).strip() for body, mark in zip(bodies, marks)]

    tail = pieces[-1].strip()
    if tail:
        segments.append(tail)

    return segments




# Sefaria API Functions
Handles the API calls and text formatting


In [22]:
# ---------------------------
# SEFARIA API FUNCTIONS
# ---------------------------

def query_sefaria(ref, language="en", context=1, timeout=30):
    """Query the Sefaria texts API for a specific text reference.

    Args:
        ref: The reference to look up, e.g. "Megillah.11b". Spaces are
            replaced with underscores before the request is made.
        language: Sent to the API as the ``language`` query parameter.
        context: Sent to the API as the ``context`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        server answers with a non-200 status, the body is not valid JSON, or
        the payload reports an error. The reason is printed in each case.
    """
    formatted_ref = ref.strip().replace(" ", "_")
    url = f"https://www.sefaria.org/api/texts/{formatted_ref}"

    params = {
        "context": context,
        "language": language,
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP
        # errors instead of raising out of the function.
        print(f"Error: could not reach Sefaria ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    try:
        data = response.json()
    except ValueError:
        print("Error: Sefaria returned a response that is not valid JSON")
        return None

    # NOTE(review): Sefaria appears to report unknown references as a 200
    # response carrying an "error" key - confirm against the API docs.
    if isinstance(data, dict) and data.get("error"):
        print(f"Error: {data['error']}")
        return None

    return data

# Modify the display_sefaria_text function:
# Shared markup for the copy-paste friendly output.
_SEFARIA_WRAPPER_OPEN = '<div style="font-family: Calibri, sans-serif; font-size: 11pt; color: black; background-color: white;">'
_SEFARIA_HEADING_STYLE = 'font-family: Calibri, sans-serif; margin-bottom: 5px;'


def _map_text(value, transform):
    """Apply ``transform`` to a string, or to every string in a (nested) list."""
    if isinstance(value, list):
        return [_map_text(item, transform) for item in value]
    return transform(value)


def _flatten_sections(value):
    """Return ``value`` as a flat list of sections.

    A single string becomes a one-element list, nested lists are flattened
    in order, and ``None`` becomes an empty section so positions are kept.
    """
    if isinstance(value, (list, tuple)):
        return [leaf for item in value for leaf in _flatten_sections(item)]
    return ['' if value is None else value]


def _format_section(text, split_sentences):
    """Return one section's markup, with a <br> between sentences if requested."""
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)
    if split_sentences:
        return '<br>'.join(split_by_punctuation(text))
    return text


def _render_sections_html(hebrew_sections, english_sections, split_sentences):
    """Build the HTML for the given sections.

    Pass ``None`` for a language to leave it out entirely. When both
    languages are given, sections are paired by position and the shorter
    list is padded with empty sections so that no text is dropped.
    """
    from itertools import zip_longest

    show_hebrew = hebrew_sections is not None
    show_english = english_sections is not None
    bilingual = show_hebrew and show_english

    margin = 15 if bilingual else 10
    hebrew_style = 'text-align: right; direction: rtl;'
    if bilingual:
        hebrew_style += ' margin-bottom: 5px;'

    paired = zip_longest(hebrew_sections or [], english_sections or [], fillvalue='')

    parts = [_SEFARIA_WRAPPER_OPEN]
    for number, (heb, eng) in enumerate(paired, start=1):
        parts.append(f'<div style="margin-bottom: {margin}px;">')
        parts.append(f'<h3 style="{_SEFARIA_HEADING_STYLE}">Section {number}</h3>')
        if show_hebrew:
            parts.append(f'<div style="{hebrew_style}"><b>{_format_section(heb, split_sentences)}</b></div>')
        if show_english:
            parts.append(f'<div>{_format_section(eng, split_sentences)}</div>')
        parts.append('</div>')
    parts.append('</div>')
    return '\n'.join(parts)


def display_sefaria_text(data, remove_nikud_marks=False, standardize_terms=False, split_sentences=False):
    """Display Sefaria text data as copy-paste friendly HTML.

    Args:
        data: A response dict from the texts API; the ``'he'`` and ``'text'``
            entries are read. Each may be a string or a (nested) list of
            strings.
        remove_nikud_marks: Strip nikud from the Hebrew before display.
        standardize_terms: Run the English through standardize_terminology.
        split_sentences: Put each sentence of a section on its own line.

    Side effects:
        As before, the processed versions are stored back on ``data`` under
        ``'he_no_nikud'`` and ``'text_standardized'``. The result is rendered
        with ``display``; nothing is returned.
    """
    if not data:
        display(Markdown("No data received."))
        return

    if not isinstance(data, dict) or not ('text' in data or 'he' in data):
        # Previously this case rendered nothing at all.
        reason = data.get('error') if isinstance(data, dict) else None
        message = "No text found in the response."
        if reason:
            message += f" {reason}"
        display(Markdown(message))
        return

    hebrew = data.get('he')
    english = data.get('text')

    if remove_nikud_marks and hebrew:
        hebrew = data['he_no_nikud'] = _map_text(hebrew, remove_nikud)

    if standardize_terms and english:
        english = data['text_standardized'] = _map_text(english, standardize_terminology)

    if 'text' in data and hebrew:
        # Hebrew and English side by side.
        html_content = _render_sections_html(
            _flatten_sections(hebrew), _flatten_sections(english), split_sentences)
    elif 'text' in data:
        # Only English text available.
        html_content = _render_sections_html(
            None, _flatten_sections(english), split_sentences)
    else:
        # Only Hebrew text available.
        html_content = _render_sections_html(
            _flatten_sections(hebrew), None, split_sentences)

    display(HTML(html_content))


# Modify the get_sefaria_text function to include adjacent pages:

def get_sefaria_text_with_adjacent(reference, language="all", context=1,
                                  remove_nikud_marks=True, standardize_terms=True,
                                  split_sentences=True, include_adjacent=True, silent=True):
    """Fetch and display a reference, optionally with its neighbouring pages.

    Neighbouring pages are only looked up when ``include_adjacent`` is true
    and the part after the first '.' is a daf such as "11b" (digits followed
    by 'a' or 'b'). Any other reference is displayed on its own.

    Args:
        reference: The reference to display, e.g. "Megillah.11b".
        language: Passed through to query_sefaria.
        context: Passed through to query_sefaria.
        remove_nikud_marks: Passed through to display_sefaria_text.
        standardize_terms: Passed through to display_sefaria_text.
        split_sentences: Passed through to display_sefaria_text.
        include_adjacent: Also fetch and show the previous and next pages.
        silent: When false, print a progress line for every request.

    Returns:
        The API data for ``reference`` itself (``None`` if the query failed).
    """
    if not silent:
        print(f"Querying Sefaria API for: {reference}")

    main_data = query_sefaria(reference, language, context)

    # The old test asked whether the letter 'a' or 'b' occurred anywhere in
    # the reference, which is true of e.g. "Numbers.1"; look at the page part.
    ref_parts = reference.split('.')
    is_daf_reference = len(ref_parts) > 1 and re.fullmatch(r'\d+[ab]', ref_parts[1]) is not None

    if not (include_adjacent and is_daf_reference):
        if main_data:
            display_sefaria_text(main_data, remove_nikud_marks, standardize_terms, split_sentences)
        return main_data

    prev_ref, next_ref = get_adjacent_pages(reference)

    def fetch_neighbour(label, neighbour_ref):
        """Query one neighbouring page, or return None when there is none."""
        if not neighbour_ref:
            return None
        if not silent:
            print(f"Retrieving {label} page: {neighbour_ref}")
        return query_sefaria(neighbour_ref, language, context)

    prev_data = fetch_neighbour("previous", prev_ref)
    next_data = fetch_neighbour("next", next_ref)

    pages = (
        ("Previous Page", prev_ref, prev_data),
        ("Current Page", reference, main_data),
        ("Next Page", next_ref, next_data),
    )
    for title, page_ref, page_data in pages:
        if not page_data:
            continue
        # An <h2> is used so that Google Docs picks it up as "Heading 2".
        display(HTML(f'<h2 style="font-family: Calibri, sans-serif; color: #2C5282; margin-top: 20px; margin-bottom: 10px;">{title} ({page_ref})</h2>'))
        display_sefaria_text(page_data, remove_nikud_marks, standardize_terms, split_sentences)

    return main_data

# Add to the SEFARIA API FUNCTIONS section:

def get_adjacent_pages(ref):
    """Determine the previous and next pages for a Talmud daf reference.

    Args:
        ref: A reference of the form "Tractate.<number><a|b>", for example
            "Megillah.11b". Anything after a second '.' is ignored.

    Returns:
        A ``(previous, next)`` tuple of references. ``previous`` is ``None``
        for side 'a' of page 2 or lower, since tractates typically start at
        2a. Both are ``None`` when ``ref`` is not a daf reference (no '.',
        or a page part that is not digits followed by 'a' or 'b').
    """
    if not ref or '.' not in ref:
        return None, None

    parts = ref.split('.')
    tractate = parts[0]

    # Require an explicit side. Previously anything not ending in 'a'
    # (including "1" or "11A") was treated as side 'b', and a page with no
    # digits raised ValueError.
    page = re.fullmatch(r'(\d+)([ab])', parts[1].strip(), flags=re.IGNORECASE)
    if page is None:
        return None, None

    page_number = int(page.group(1))
    page_side = page.group(2).lower()

    if page_side == 'b':
        # "Xb" sits between "Xa" and "(X+1)a".
        prev_page = f"{tractate}.{page_number}a"
        next_page = f"{tractate}.{page_number + 1}a"
    else:
        # "Xa" sits between "(X-1)b" and "Xb"; nothing precedes 2a.
        prev_page = f"{tractate}.{page_number - 1}b" if page_number > 2 else None
        next_page = f"{tractate}.{page_number}b"

    return prev_page, next_page


# User Interface
Creates an easy-to-use form for searching

In [23]:
# ---------------------------
# USER INTERFACE
# ---------------------------

from ipywidgets import widgets
from IPython.display import display, clear_output

# Form inputs. Each widget's current value is read by
# on_submit_button_clicked when the button is pressed.

# Free-text reference in "Tractate.page" form.
reference_input = widgets.Text(
    value='Megillah.11b',
    placeholder='Enter reference (e.g., Berakhot.2a)',
    description='Reference:',
    style={'description_width': 'initial'},
    layout={'width': '400px'}
)

# The selected value ('all', 'en' or 'he') is sent on as the `language` argument.
language_dropdown = widgets.Dropdown(
    options=[('Hebrew and English', 'all'), ('English only', 'en'), ('Hebrew only', 'he')],
    value='all',
    description='Language:',
    style={'description_width': 'initial'},
    layout={'width': '300px'}
)

# NOTE(review): this value is forwarded as the API's `context` query
# parameter, not as a count of pages to fetch, so the label may be
# misleading - confirm against the Sefaria API documentation.
context_slider = widgets.IntSlider(
    value=0,
    min=0,
    max=5,
    step=1,
    description='# of adjacent pages:',
    style={'description_width': 'initial'},
    layout={'width': '300px'}
)

# Toggles nikud removal on the Hebrew text.
remove_nikud_checkbox = widgets.Checkbox(
    value=True,
    description='Remove nikud (vowel marks)',
    style={'description_width': 'initial'}
)

# Toggles terminology standardization on the English text.
standardize_terms_checkbox = widgets.Checkbox(
    value=True,
    description='Standardize terminology',
    style={'description_width': 'initial'}
)

submit_button = widgets.Button(
    description='Get Text',
    button_style='primary',
    tooltip='Click to get the text',
    icon='search'
)

# Toggles sentence splitting. NOTE(review): split_by_punctuation also
# splits on colons and question marks, not only on periods.

split_sentences_checkbox = widgets.Checkbox(
    value=True,
    description='Split text by periods',
    style={'description_width': 'initial'}
)


# Toggles fetching the previous and next page alongside the requested one.
include_adjacent_checkbox = widgets.Checkbox(
    value=False,
    description='Include adjacent pages',
    style={'description_width': 'initial'}
)


# Results are rendered into this area so each search can replace the last.
output_area = widgets.Output()

# Lay the form out top to bottom: title, reference, option rows, button,
# usage hint, and finally the output area.
form = widgets.VBox([
    widgets.HTML('<h3>Search Sefaria</h3>'),
    reference_input,
    widgets.HBox([language_dropdown, context_slider]),
    widgets.HBox([remove_nikud_checkbox, standardize_terms_checkbox]),
    widgets.HBox([split_sentences_checkbox, include_adjacent_checkbox]),  # sentence splitting / adjacent pages
    submit_button,
    widgets.HTML('<p style="color: gray; font-style: italic; margin-top: 5px;">After the text appears, simply select it and copy-paste into Google Docs.</p>'),
    output_area
])

# Update the on_submit_button_clicked function:
def on_submit_button_clicked(b):
    """Handle a click on the "Get Text" button.

    Clears the output area, reads the current form values and renders the
    requested text inside the output area. Any exception raised while
    fetching or formatting is printed there rather than propagated, so the
    form stays usable.

    Args:
        b: The button instance supplied by ipywidgets (unused).
    """
    with output_area:
        clear_output()

        # Don't send an empty or whitespace-only reference to the API.
        reference = reference_input.value.strip()
        if not reference:
            print("Please enter a reference (e.g., Berakhot.2a).")
            return

        print("Searching...")
        try:
            get_sefaria_text_with_adjacent(
                reference=reference,
                language=language_dropdown.value,
                context=context_slider.value,
                remove_nikud_marks=remove_nikud_checkbox.value,
                standardize_terms=standardize_terms_checkbox.value,
                split_sentences=split_sentences_checkbox.value,
                include_adjacent=include_adjacent_checkbox.value,
                silent=False
            )
        except Exception as e:
            print(f"Error: {e}")

# Run the search handler every time the button is clicked.
submit_button.on_click(on_submit_button_clicked)

# Render the form (which contains the output area) below this cell.
display(form)

VBox(children=(HTML(value='<h3>Search Sefaria</h3>'), Text(value='Megillah.11b', description='Reference:', lay…