Introduction and Setup: Explains the purpose and installs dependencies


In [None]:
# Sefaria Text Formatter
# This notebook helps you retrieve and format texts from Sefaria for easy copy-pasting into Google Docs

# Install required packages
try:
    import requests
except ImportError:
    !pip install requests

import json
import pandas as pd
import re
from IPython.display import display, HTML, Markdown

# Display introduction
# The banner markup is kept in a named string so the display call stays short.
intro_banner = """
<div style="background-color: #f0f7fb; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="color: #0d47a1;">Sefaria Text Formatter</h1>
    <p>This tool helps you retrieve texts from Sefaria and format them for easy copy-pasting into Google Docs with the following features:</p>
    <ul>
        <li>Remove nikud (vowel marks) from Hebrew text</li>
        <li>Standardize terminology to preferred terms</li>
        <li>Format text in Calibri 11pt for clean copy-paste</li>
        <li>Organize text into sections with Hebrew and English</li>
    </ul>
    <p>To use: Run the cells below, then use the search form to find and format your text.</p>
</div>
"""
display(HTML(intro_banner))

Helper Functions: Contains the core processing functions (terminology, nikud removal)


In [None]:
# ---------------------------
# HELPER FUNCTIONS
# ---------------------------

# Preferred terminology: each key is a regular expression (word-bounded) and
# each value is the text that replaces a match. Entries are applied in the
# order listed here.
TERMINOLOGY_PREFERENCES = dict([
    (r'\bGemara\b', "Talmud"),
    (r'\bRabbi\b', "R'"),
    (r'\bThe Sages taught\b', "A baraita states"),
    (r'\bDivine Voice\b', "bat kol"),
    (r'\bDivine Presence\b', "Shekhina"),
    (r'\bdivine inspiration\b', "Holy Spirit"),
    (r'\bthe Lord\b', "YHWH"),
    (r'\bleper\b', "metzora"),
    (r'\bleprosy\b', "tzara'at"),
    (r'\bphylacteries\b', "tefillin"),
    (r'\bgentile\b', "non-Jew"),
    (r'\bignorant\b', "am ha'aretz"),
    (r'\bignoram(us|i)\b', "am ha'aretz"),
    (r'\bmaidservant\b', "female slave"),
    (r'\bbarrel\b', "jug"),
])

# Regular expression patterns for converting spelled-out numbers (cardinal or
# ordinal) to Arabic numerals. Values are either a replacement string or a
# callable taking the match object.
#
# Entries are applied in insertion order, so each compound pattern
# ("twenty-one") is listed BEFORE its plain tens pattern ("twenty"). A hyphen
# is a word boundary, so the plain pattern would otherwise rewrite
# "twenty-one" to "20-one" and the compound pattern could never match.
#
# NOTE: "hundred" and "thousand" are replaced on their own, so a phrase such
# as "three hundred" becomes "three 100"; multi-word numbers are not combined.

# Words accepted after the hyphen of a compound number. Limiting the tail to
# these keeps phrases like "twenty-cubit" from being treated as a number.
_UNIT_WORDS = (
    r'one|first|two|second|three|third|four|fourth|five|fifth|'
    r'six|sixth|seven|seventh|eight|eighth|nine|ninth'
)

NUMBER_PATTERNS = {
    r'\b(?:eleven|eleventh)\b': '11',
    r'\b(?:twelve|twelfth)\b': '12',
    r'\b(?:thirteen|thirteenth)\b': '13',
    r'\b(?:fourteen|fourteenth)\b': '14',
    r'\b(?:fifteen|fifteenth)\b': '15',
    r'\b(?:sixteen|sixteenth)\b': '16',
    r'\b(?:seventeen|seventeenth)\b': '17',
    r'\b(?:eighteen|eighteenth)\b': '18',
    r'\b(?:nineteen|nineteenth)\b': '19',
    rf'\btwenty-({_UNIT_WORDS})\b': lambda m: f'2{convert_single_digit(m.group(1))}',
    r'\b(?:twenty|twentieth)\b': '20',
    rf'\bthirty-({_UNIT_WORDS})\b': lambda m: f'3{convert_single_digit(m.group(1))}',
    r'\b(?:thirty|thirtieth)\b': '30',
    rf'\bforty-({_UNIT_WORDS})\b': lambda m: f'4{convert_single_digit(m.group(1))}',
    r'\b(?:forty|fortieth)\b': '40',
    rf'\bfifty-({_UNIT_WORDS})\b': lambda m: f'5{convert_single_digit(m.group(1))}',
    r'\b(?:fifty|fiftieth)\b': '50',
    rf'\bsixty-({_UNIT_WORDS})\b': lambda m: f'6{convert_single_digit(m.group(1))}',
    r'\b(?:sixty|sixtieth)\b': '60',
    rf'\bseventy-({_UNIT_WORDS})\b': lambda m: f'7{convert_single_digit(m.group(1))}',
    r'\b(?:seventy|seventieth)\b': '70',
    rf'\beighty-({_UNIT_WORDS})\b': lambda m: f'8{convert_single_digit(m.group(1))}',
    r'\b(?:eighty|eightieth)\b': '80',
    rf'\bninety-({_UNIT_WORDS})\b': lambda m: f'9{convert_single_digit(m.group(1))}',
    r'\b(?:ninety|ninetieth)\b': '90',
    r'\b(?:hundred|hundredth)\b': '100',
    r'\b(?:thousand|thousandth)\b': '1000',
}

def convert_single_digit(word):
    """Convert a single digit word to its numeric equivalent.

    Args:
        word: A cardinal or ordinal word for 1-9 (e.g. "seven", "Seventh").
            Matching is case-insensitive.

    Returns:
        The digit as a one-character string, or the lower-cased input when the
        word is not a recognised digit word.
    """
    digit_by_word = {
        'one': '1', 'first': '1',
        'two': '2', 'second': '2',
        'three': '3', 'third': '3',
        'four': '4', 'fourth': '4',
        'five': '5', 'fifth': '5',
        'six': '6', 'sixth': '6',
        'seven': '7', 'seventh': '7',
        'eight': '8', 'eighth': '8',
        'nine': '9', 'ninth': '9',
    }
    word = word.lower()
    return digit_by_word.get(word, word)  # Return the word itself if not a digit word

def remove_nikud(text):
    """Strip Hebrew pointing from ``text``.

    Deletes every character in U+0591-U+05BD plus U+05BF, U+05C1, U+05C2,
    U+05C4, U+05C5 and U+05C7. The code points in between (U+05BE, U+05C0,
    U+05C3, U+05C6) are left in place.

    Args:
        text: The string to clean. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The string with the listed marks removed.
    """
    if text:
        marks = r'[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]'
        return re.sub(marks, '', text)
    return text

def standardize_terminology(text):
    """Standardize terminology according to preferred terms.

    Three passes are applied in order:
      1. every pattern in TERMINOLOGY_PREFERENCES (case-insensitive);
      2. every pattern in NUMBER_PATTERNS (case-insensitive), turning
         spelled-out numbers into digits;
      3. ordinal suffixes are added to digits in the phrases "the N day" and
         "N century" (case-sensitive).

    Args:
        text: The English string to process. Falsy values (``None``, ``""``)
            are returned unchanged.

    Returns:
        The processed string.
    """
    if not text:
        return text

    # Apply terminology preferences
    for pattern, replacement in TERMINOLOGY_PREFERENCES.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Convert spelled-out numbers to Arabic numerals. re.sub accepts either a
    # replacement string or a callable, so both kinds of entry share one call.
    for pattern, replacement in NUMBER_PATTERNS.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    def add_ordinal_suffix(match):
        """Return the number captured in group 1 with 'st'/'nd'/'rd'/'th' appended."""
        num = int(match.group(1))
        if 10 <= num % 100 <= 20:
            # 11th, 12th, 13th (and the rest of the teens) always take 'th'.
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(num % 10, 'th')
        return f"{num}{suffix}"

    # Bare numbers that should read as ordinals, based on context. The
    # replacement re-emits the matched word ("day"), and the trailing word
    # boundary stops plurals such as "the 7 days" from being altered.
    text = re.sub(r'\bthe (\d+) day\b', lambda m: f"the {add_ordinal_suffix(m)} day", text)
    text = re.sub(r'\b(\d+) century\b', lambda m: f"{add_ordinal_suffix(m)} century", text)

    return text

Sefaria API Functions: Handles the API calls and text formatting


In [None]:
# ---------------------------
# SEFARIA API FUNCTIONS
# ---------------------------

def query_sefaria(ref, language="en", context=1, timeout=30):
    """Query the Sefaria API for a specific text reference.

    Args:
        ref: The text reference, e.g. "Sotah.35a.7". Surrounding whitespace is
            dropped and inner spaces are replaced with underscores.
        language: Sent to the API as the ``language`` query parameter.
        context: Sent to the API as the ``context`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response, or ``None`` when the server answers with a
        non-200 status or with an error payload (the error is printed).

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    # Format the reference for the API
    formatted_ref = ref.strip().replace(" ", "_")

    # Construct the API URL
    base_url = "https://www.sefaria.org/api/texts/"
    url = f"{base_url}{formatted_ref}"

    # Add parameters for language and context
    params = {
        "context": context,
        "language": language,
    }

    # Make the request
    response = requests.get(url, params=params, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    data = response.json()

    # NOTE(review): Sefaria appears to report unknown references as a 200
    # response whose body carries an "error" field - confirm against the API
    # documentation. Reporting it here keeps the display step from silently
    # rendering nothing.
    if isinstance(data, dict) and data.get("error"):
        print(f"Error: {data['error']}")
        return None

    return data

def display_sefaria_text(data, remove_nikud_marks=False, standardize_terms=False):
    """Display the Sefaria text data in a notebook-friendly format.

    Reads the Hebrew text from ``data['he']`` and the English text from
    ``data['text']``; each may be a string or a list of strings. Output is
    wrapped in a Calibri 11pt container. The layout depends on which
    languages have content:

      * both: numbered sections with Hebrew (right-to-left) above English, or
        a single Hebrew/English pair when either value is not a list;
      * only one: that language alone, numbered when it is a list;
      * neither: a short "no text" message.

    Args:
        data: The response dictionary. Falsy values display
            "No data received.".
        remove_nikud_marks: When true, Hebrew is passed through
            ``remove_nikud`` and the result is shown.
        standardize_terms: When true, English is passed through
            ``standardize_terminology`` and the result is shown.

    Side effects:
        Stores the processed text back on ``data`` under ``'he_no_nikud'`` and
        ``'text_standardized'``.
    """
    from itertools import zip_longest

    if not data:
        display(Markdown("No data received."))
        return

    def apply_to_lines(transform, value):
        """Apply ``transform`` to a string or to every string inside (nested) lists."""
        # NOTE(review): references spanning several sections presumably come
        # back as nested lists - confirm. Recursing avoids handing a list to
        # the string-only transforms.
        if isinstance(value, list):
            return [apply_to_lines(transform, item) for item in value]
        return transform(value)

    # Process Hebrew text if needed
    if remove_nikud_marks and 'he' in data and data['he']:
        data['he_no_nikud'] = apply_to_lines(remove_nikud, data['he'])

    # Process English text for terminology standardization if needed
    if standardize_terms and 'text' in data and data['text']:
        data['text_standardized'] = apply_to_lines(standardize_terminology, data['text'])

    # Determine which Hebrew text to use
    if remove_nikud_marks and 'he_no_nikud' in data:
        hebrew_text = data['he_no_nikud']
    else:
        hebrew_text = data['he'] if 'he' in data else None

    # Determine which English text to use
    if standardize_terms and 'text_standardized' in data:
        english_text = data['text_standardized']
    else:
        english_text = data['text'] if 'text' in data else None

    # Clean, copy-paste friendly container with minimal styling
    container_open = '<div style="font-family: Calibri, sans-serif; font-size: 11pt; color: black; background-color: white;">'
    rtl_attrs = ' style="text-align: right; direction: rtl;"'

    def render_single_language(content, text_attrs):
        """Build the markup for one language; ``text_attrs`` is added to the text element."""
        if not isinstance(content, list):
            return f'<div{text_attrs}>{content}</div>'
        parts = ''
        for number, line in enumerate(content, start=1):
            parts += f"""
                <div style="margin-bottom: 10px;">
                  <span>Section {number}: </span>
                  <span{text_attrs}>{line}</span>
                </div>
                """
        return parts

    # Branch on whether each language actually has content. An empty English
    # list next to real Hebrew must fall through to the Hebrew-only layout
    # instead of pairing against nothing.
    if hebrew_text and english_text:
        if isinstance(english_text, list) and isinstance(hebrew_text, list):
            body = ''
            # zip_longest keeps sections that exist in only one language.
            paired = zip_longest(hebrew_text, english_text, fillvalue='')
            for number, (heb, eng) in enumerate(paired, start=1):
                body += f"""
                <div style="margin-bottom: 15px;">
                  <div style="margin-bottom: 5px;">Section {number}</div>
                  <div style="text-align: right; direction: rtl; margin-bottom: 5px;">{heb}</div>
                  <div>{eng}</div>
                </div>
                """
        else:
            # For single item
            body = f"""
            <div>
              <div style="text-align: right; direction: rtl; margin-bottom: 5px;">{hebrew_text}</div>
              <div>{english_text}</div>
            </div>
            """
    elif hebrew_text:
        body = render_single_language(hebrew_text, rtl_attrs)
    elif english_text:
        body = render_single_language(english_text, '')
    else:
        display(Markdown("No text found in the response."))
        return

    display(HTML(container_open + body + '</div>'))

def get_sefaria_text(reference, language="all", context=1,
                     remove_nikud_marks=True, standardize_terms=True, silent=True):
    """Fetch a reference from Sefaria and render it for copy-pasting.

    Args:
        reference: The text reference passed to ``query_sefaria``.
        language: Language option passed to ``query_sefaria``.
        context: Context option passed to ``query_sefaria``.
        remove_nikud_marks: Passed to ``display_sefaria_text``.
        standardize_terms: Passed to ``display_sefaria_text``.
        silent: When false, print a progress line before querying.

    Returns:
        Whatever ``query_sefaria`` returned. Nothing is displayed when that
        value is falsy.
    """
    verbose = not silent
    if verbose:
        print(f"Querying Sefaria API for: {reference}")

    payload = query_sefaria(reference, language, context)
    if not payload:
        return payload

    display_sefaria_text(payload, remove_nikud_marks, standardize_terms)
    return payload

User Interface: Creates an easy-to-use form for searching

In [None]:
# ---------------------------
# USER INTERFACE
# ---------------------------

from ipywidgets import widgets
from IPython.display import display, clear_output

# Create form elements
# Every control uses the same description style, so it is defined once.
form_label_style = {'description_width': 'initial'}

reference_input = widgets.Text(
    description='Reference:',
    value='Sotah.35a.7',
    placeholder='Enter reference (e.g., Berakhot.2a.1)',
    layout={'width': '400px'},
    style=form_label_style,
)

# (label shown to the user, value handed to get_sefaria_text)
language_choices = [
    ('Hebrew and English', 'all'),
    ('English only', 'en'),
    ('Hebrew only', 'he'),
]
language_dropdown = widgets.Dropdown(
    description='Language:',
    options=language_choices,
    value='all',
    layout={'width': '300px'},
    style=form_label_style,
)

context_slider = widgets.IntSlider(
    description='Context verses:',
    value=1,
    min=0,
    max=5,
    step=1,
    layout={'width': '300px'},
    style=form_label_style,
)

remove_nikud_checkbox = widgets.Checkbox(
    description='Remove nikud (vowel marks)',
    value=True,
    style=form_label_style,
)

standardize_terms_checkbox = widgets.Checkbox(
    description='Standardize terminology',
    value=True,
    style=form_label_style,
)

submit_button = widgets.Button(
    description='Get Text',
    icon='search',
    button_style='primary',
    tooltip='Click to get the text',
)

# Results of each search are rendered into this area.
output_area = widgets.Output()

# Layout the form: one row per entry, top to bottom.
form_rows = [
    widgets.HTML('<h3>Search Sefaria</h3>'),
    reference_input,
    widgets.HBox([language_dropdown, context_slider]),
    widgets.HBox([remove_nikud_checkbox, standardize_terms_checkbox]),
    submit_button,
    widgets.HTML('<p style="color: gray; font-style: italic; margin-top: 5px;">After the text appears, simply select it and copy-paste into Google Docs.</p>'),
    output_area,
]
form = widgets.VBox(form_rows)

# Handle form submission
def on_submit_button_clicked(b):
    """Run a search with the current form values and show the result in ``output_area``.

    The reference is stripped of surrounding whitespace; when nothing is left
    the user is prompted for a reference and no request is made.

    Args:
        b: The button that fired the event (unused).
    """
    reference = reference_input.value.strip()

    with output_area:
        clear_output()

        # An empty reference cannot name a text, so skip the API call.
        if not reference:
            print("Please enter a reference (e.g., Berakhot.2a.1).")
            return

        print("Searching...")
        try:
            get_sefaria_text(
                reference=reference,
                language=language_dropdown.value,
                context=context_slider.value,
                remove_nikud_marks=remove_nikud_checkbox.value,
                standardize_terms=standardize_terms_checkbox.value,
                silent=False
            )
        except Exception as e:
            # Report any failure inside the output area, where the user is
            # looking, rather than letting the click handler raise.
            print(f"Error: {e}")

# Register the click handler so pressing "Get Text" runs the search.
submit_button.on_click(on_submit_button_clicked)

# Display the form
display(form)

VBox(children=(HTML(value='<h3>Search Sefaria</h3>'), Text(value='Sotah.35a.7', description='Reference:', layo…

Examples and Help: Provides guidance on reference formats


In [None]:
# ---------------------------
# EXAMPLES & HELP
# ---------------------------

# Button that reveals the example references.
help_button = widgets.Button(
    icon='info',
    button_style='info',
    description='Show Examples',
    tooltip='Click to see example references',
)

# The examples panel is rendered into this area when the button is clicked.
examples_output = widgets.Output()

def on_help_button_clicked(b):
    """Replace the contents of ``examples_output`` with the reference examples panel.

    Args:
        b: The button that fired the event (unused).
    """
    examples_markup = """
        <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px;">
            <h3>Example References</h3>
            <ul>
                <li><strong>Talmud:</strong> Berakhot.2a.1, Sotah.35a.7, Sanhedrin.90b.3</li>
                <li><strong>Torah:</strong> Genesis.1.1, Exodus.20.1, Leviticus.19.18</li>
                <li><strong>Prophets:</strong> Isaiah.40.1, Jeremiah.29.11, Ezekiel.37.1</li>
                <li><strong>Writings:</strong> Psalms.23.1, Proverbs.3.5, Job.1.1</li>
                <li><strong>Mishnah:</strong> Avot.1.1, Berakhot.1.1, Sukkah.1.1</li>
            </ul>
            <h3>Reference Format</h3>
            <p>The reference format is: <code>Book.Chapter.Verse</code> or <code>Tractate.Page.Section</code></p>
            <p>For Talmud references, use the format: <code>Tractate.PageNumber[a/b].Section</code></p>
            <p>For example, <code>Berakhot.2a.3</code> means: Tractate Berakhot, page 2 side a, section 3</p>
        </div>
        """

    with examples_output:
        # Clear first so repeated clicks do not stack copies of the panel.
        clear_output()
        panel = HTML(examples_markup)
        display(panel)

# Register the click handler so pressing "Show Examples" fills the panel.
help_button.on_click(on_help_button_clicked)

# Display the help section: the button with its output area underneath.
display(widgets.VBox([help_button, examples_output]))

Custom Terminology: Allows users to add their own terminology preferences



In [None]:
# ---------------------------
# CUSTOMIZE TERMINOLOGY
# ---------------------------

# Button that opens the custom terminology form.
custom_term_button = widgets.Button(
    icon='edit',
    button_style='warning',
    description='Customize Terminology',
    tooltip='Add your own terminology preferences',
)

# Both text boxes share the same width and description style.
term_box_layout = {'width': '300px'}
term_label_style = {'description_width': 'initial'}

original_term = widgets.Text(
    description='Original:',
    placeholder='Original term',
    layout=term_box_layout,
    style=term_label_style,
)

preferred_term = widgets.Text(
    description='Preferred:',
    placeholder='Preferred term',
    layout=term_box_layout,
    style=term_label_style,
)

add_term_button = widgets.Button(
    icon='plus',
    button_style='success',
    description='Add Term',
    tooltip='Add this term to preferences',
)

# terms_output shows the result of adding a term; custom_form_output holds
# the form itself once it has been opened.
terms_output = widgets.Output()
custom_form_output = widgets.Output()

def on_custom_term_button_clicked(b):
    """Show the custom terminology form inside ``custom_form_output``.

    Args:
        b: The button that fired the event (unused).
    """
    with custom_form_output:
        # Clear first so repeated clicks do not stack copies of the form.
        clear_output()

        heading = widgets.HTML('<h3>Add Custom Terminology</h3>')
        term_row = widgets.HBox([original_term, preferred_term])
        term_form = widgets.VBox([heading, term_row, add_term_button, terms_output])
        display(term_form)

def on_add_term_button_clicked(b):
    """Add the typed term pair to TERMINOLOGY_PREFERENCES and list the custom terms.

    The original term is escaped and wrapped in word boundaries, so it is
    matched literally as a whole word. Surrounding whitespace is stripped
    from both fields. After a successful add, both input fields are cleared.

    Args:
        b: The button that fired the event (unused).
    """
    # Patterns that ship with the notebook; any other key in the dictionary is
    # treated as a user-added term.
    builtin_patterns = (
        r'\bGemara\b', r'\bRabbi\b', r'\bThe Sages taught\b',
        r'\bDivine Voice\b', r'\bDivine Presence\b',
        r'\bdivine inspiration\b', r'\bthe Lord\b', r'\bleper\b',
        r'\bleprosy\b', r'\bphylacteries\b', r'\bgentile\b',
        r'\bignorant\b', r'\bignoram(us|i)\b', r'\bmaidservant\b',
        r'\bbarrel\b',
    )
    boundary = r'\b'

    with terms_output:
        clear_output()

        original = original_term.value.strip()
        preferred = preferred_term.value.strip()
        if not (original and preferred):
            print("Please enter both original and preferred terms.")
            return

        # Add to dictionary safely with word boundaries
        new_pattern = boundary + re.escape(original) + boundary
        TERMINOLOGY_PREFERENCES[new_pattern] = preferred
        print(f"Added: '{original}' → '{preferred}'")

        print("Current custom terms:")
        custom_terms = [
            (term_pattern, replacement)
            for term_pattern, replacement in TERMINOLOGY_PREFERENCES.items()
            if term_pattern not in builtin_patterns
        ]
        # Number the custom terms on their own, starting from 1.
        for position, (term_pattern, replacement) in enumerate(custom_terms, start=1):
            # Recover readable text from the stored regex: drop the boundary
            # markers, then undo the backslash escapes added by re.escape.
            # The label is computed outside the f-string because backslashes
            # inside f-string expressions are a syntax error before Python 3.12.
            label = term_pattern
            if len(label) >= 4 and label.startswith(boundary) and label.endswith(boundary):
                label = label[2:-2]
            label = re.sub(r'\\(.)', r'\1', label)
            print(f"  {position}. {label} → {replacement}")

        # Clear the input fields
        original_term.value = ''
        preferred_term.value = ''

# Register the click handlers: one opens the form, the other adds the term.
custom_term_button.on_click(on_custom_term_button_clicked)
add_term_button.on_click(on_add_term_button_clicked)

# Display the terminology customization section
display(widgets.VBox([custom_term_button, custom_form_output]))