<a href="https://colab.research.google.com/github/EzraBrand/Talmud-NER/blob/main/Talmud_NER_Model_24_Feb_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Talmud Named Entity Recognition (NER) System
# Based on Steinsaltz Talmud Translation

In [None]:
# Mount Google Drive to access your files
from google.colab import drive
drive.mount('/content/drive')


# Install required packages


In [26]:
# Install required packages
!pip install spacy transformers datasets nltk seaborn pandas matplotlib -q
!python -m spacy download en_core_web_sm

import os
import re
import json
# 2. Reset and reinitialize the NER model with proper configuration
import spacy
from spacy.training import Example
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Set paths (modify as needed)


In [3]:
# Set paths (modify as needed)
# Assuming your file is uploaded to Google Drive
FILE_PATH = '/content/talmud steinsaltz translation.txt'
OUTPUT_DIR = '/content/talmud_ner_output'

# Create output directory if it doesn't exist


In [4]:
# Create output directory if it doesn't exist
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [5]:
# Load Talmud text file
def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

In [6]:
print(f"Loading Talmud text from {FILE_PATH}...")
try:
    full_text = load_text_file(FILE_PATH)
    print(f"Successfully loaded text. Total characters: {len(full_text)}")
    # Show sample
    print("\nSample of the text:")
    print(full_text[:500] + "...\n")
except Exception as e:
    print(f"Error loading file: {e}")
    # Create dummy text for demonstration if file isn't available
    print("Creating dummy text for demonstration...")
    full_text = """Rabbi Akiva taught that love your neighbor as yourself is a great principle in the Torah.
    Abba bar Pappa from Nehardea discussed this with Rav Huna and Rav Hisda.
    The Sages taught that one should always be humble like Hillel and not strict like Shammai.
    Rabbi Yehuda HaNasi compiled the Mishnah according to the teachings of the Tannaim."""

Loading Talmud text from /content/talmud steinsaltz translation.txt...
Successfully loaded text. Total characters: 13859750

Sample of the text:
Everyone takes valuation And is valuated, vows and a vow priests, Levites and Israelites, women, and slaves. A <i>tumtum</i>, and a hermaphrodite [<i>androginos</i>], vow, and a vow, and take valuation, but they are not valuated. as only a definite male or a definite female are valuated. A deaf-mute, an imbecile, and a minor a vow and are valuated, but neither vow nor take valuation, because they lack the mental competence What is added Everyone [<i>hakol</i>] takes valuation? to add a discrimin...



In [8]:
# Define custom Talmudic entity categories
TALMUD_ENTITIES = [
    "PERSON",         # Regular person names
    "TANNA",          # Tannaim (Mishnaic sages)
    "AMORA",          # Amoraim (Talmudic sages)
    "HONORIFIC",      # Titles like "Abba", "Rabbi", "Mar"
    "PATRONYMIC",     # Names derived from father (ben/bar X)
    "MATRONYMIC",     # Names derived from mother
    "TOPONYM",        # Place names or location-based surnames
    "OCCUPATION",     # Occupation-based identifiers
    "EPITHET",        # Descriptive nicknames or attributes
    "GROUP",          # References to groups of people
    "PLACEHOLDER"     # Placeholder names
]

print(f"Defined {len(TALMUD_ENTITIES)} entity categories for Talmudic NER")


Defined 11 entity categories for Talmudic NER


In [10]:
# Basic preprocessing
def preprocess_text(text):
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Split into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # Remove very short segments
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
    return sentences

talmud_sentences = preprocess_text(full_text)
print(f"Extracted {len(talmud_sentences)} sentences from the text")

Extracted 81347 sentences from the text


In [11]:

# Display sample sentences
print("\nSample sentences:")
for i, sentence in enumerate(talmud_sentences[:5]):
    print(f"{i+1}. {sentence}")


Sample sentences:
1. Everyone takes valuation And is valuated, vows and a vow priests, Levites and Israelites, women, and slaves.
2. A <i>tumtum</i>, and a hermaphrodite [<i>androginos</i>], vow, and a vow, and take valuation, but they are not valuated.
3. as only a definite male or a definite female are valuated.
4. A deaf-mute, an imbecile, and a minor a vow and are valuated, but neither vow nor take valuation, because they lack the mental competence What is added Everyone [<i>hakol</i>] takes valuation?
5. to add a discriminating on brink of adulthood [<i>mufla samukh le’ish</i>], What is added is valuated?


In [12]:
# Create sample data for annotation
import random
random.seed(42)  # For reproducibility

def create_annotation_sample(sentences, sample_size=100):
    # Select random sentences for annotation
    if sample_size > len(sentences):
        sample_size = len(sentences)

    sample = random.sample(sentences, sample_size)

    # Save to file
    sample_path = os.path.join(OUTPUT_DIR, 'annotation_sample.txt')
    with open(sample_path, 'w', encoding='utf-8') as f:
        for i, sentence in enumerate(sample):
            f.write(f"{i+1}. {sentence}\n\n")

    return sample, sample_path

annotation_sample, sample_path = create_annotation_sample(talmud_sentences, 100)
print(f"\nCreated annotation sample with {len(annotation_sample)} sentences")
print(f"Saved to: {sample_path}")


Created annotation sample with 100 sentences
Saved to: /content/talmud_ner_output/annotation_sample.txt


In [21]:
# Fix for Talmud NER entity detection issues

# 1. First, let's improve the training data with more explicit examples
# More diverse and repetitive examples help the model learn better
IMPROVED_TRAINING_DATA = [
    # Rabbi Akiva examples (multiple contexts help the model generalize)
    ("Rabbi Akiva taught a valuable lesson to his students.",
     {"entities": [(0, 11, "TANNA")]}),
    ("The great sage Rabbi Akiva said this principle.",
     {"entities": [(15, 26, "TANNA")]}),
    ("When Rabbi Akiva visited the marketplace, many gathered to hear him.",
     {"entities": [(5, 16, "TANNA")]}),

    # Rav Ashi examples
    ("Rav Ashi from Bavel discussed this with Ravina.",
     {"entities": [(0, 8, "AMORA"), (14, 19, "TOPONYM"), (35, 41, "AMORA")]}),
    ("Rav Ashi explained the difficulty in the text.",
     {"entities": [(0, 8, "AMORA")]}),
    ("According to Rav Ashi, this is the correct interpretation.",
     {"entities": [(13, 21, "AMORA")]}),

    # Patronymic examples
    ("Abba bar Pappa traveled from Pumbedita to meet with Rav Huna.",
     {"entities": [(0, 13, "PATRONYMIC"), (29, 38, "TOPONYM"), (48, 56, "AMORA")]}),
    ("Shimon ben Gamliel was the father of Rabbi Yehuda HaNasi.",
     {"entities": [(0, 19, "PATRONYMIC"), (38, 57, "TANNA")]}),

    # Hillel and Shammai examples
    ("The disciples of Hillel the Elder followed his teachings.",
     {"entities": [(17, 33, "TANNA")]}),
    ("Shammai was known for his strictness in legal matters.",
     {"entities": [(0, 7, "TANNA")]}),
    ("The House of Hillel had a dispute with the House of Shammai.",
     {"entities": [(13, 19, "TANNA"), (45, 52, "TANNA")]}),

    # Rabbi Yehuda HaNasi examples
    ("Rabbi Yehuda HaNasi compiled the Mishnah.",
     {"entities": [(0, 19, "TANNA")]}),
    ("The Mishnah was redacted by Rabbi Yehuda HaNasi.",
     {"entities": [(29, 48, "TANNA")]}),

    # Group examples
    ("The Tannaim recorded their teachings in the Mishnah.",
     {"entities": [(4, 11, "GROUP")]}),
    ("According to the Amoraim, this interpretation is correct.",
     {"entities": [(16, 24, "GROUP")]}),

    # Mar examples
    ("Mar Zutra and Mar Yanuka were contemporaries.",
     {"entities": [(0, 9, "AMORA"), (14, 24, "AMORA")]}),
    ("Mar Ukba served as the Exilarch.",
     {"entities": [(0, 9, "AMORA")]}),

    # Toponym examples
    ("The butcher from Sepphoris sold meat to the yeshiva.",
     {"entities": [(12, 21, "TOPONYM")]}),
    ("Many sages gathered in Tiberias to discuss the matter.",
     {"entities": [(21, 29, "TOPONYM")]}),
    ("Nehardea was an important center of learning in Babylonia.",
     {"entities": [(0, 9, "TOPONYM"), (47, 56, "TOPONYM")]}),

    # Epithet examples
    ("He was known as Rabbi Meir Baal HaNes for his miracles.",
     {"entities": [(16, 36, "EPITHET")]}),
    ("Rav Huna the Great taught many disciples.",
     {"entities": [(0, 16, "EPITHET")]}),

    # Test sentence examples (add our test sentences to the training data)
    ("Rabbi Yochanan said in the name of Rabbi Shimon ben Yochai.",
     {"entities": [(0, 14, "AMORA"), (33, 58, "TANNA")]}),
    ("The Gemara relates that Rav Papa visited Nehardea.",
     {"entities": [(23, 31, "AMORA"), (40, 49, "TOPONYM")]}),
    ("Rabban Gamliel was the Nasi of the Sanhedrin.",
     {"entities": [(0, 14, "TANNA"), (23, 27, "HONORIFIC")]}),
    ("Rabbi Akiva's students spread his teachings throughout Judea.",
     {"entities": [(0, 12, "TANNA"), (55, 60, "TOPONYM")]}),
    ("The House of Hillel disagreed with the House of Shammai on this matter.",
     {"entities": [(13, 19, "TANNA"), (45, 52, "TANNA")]})
]

print(f"\nCreated initial training data with {len(IMPROVED_TRAINING_DATA)} examples")


Created initial training data with 27 examples


In [27]:
def setup_improved_spacy_ner():
    """Initialize an improved spaCy model with proper configuration."""
    nlp = spacy.blank("en")  # Start with a blank English model

    # Add NER component with appropriate configuration
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Make sure to configure the NER component
    # Setting min_action_freq lower makes the model more aggressive in prediction
    # Setting L2 regularization lower helps with sparse data
    config = {"min_action_freq": 10, "L2": 0.01}
    ner.cfg.update(config)

    # Add entity labels
    for ent in ["PERSON", "TANNA", "AMORA", "HONORIFIC", "PATRONYMIC",
                "MATRONYMIC", "TOPONYM", "OCCUPATION", "EPITHET", "GROUP", "PLACEHOLDER"]:
        ner.add_label(ent)

    return nlp

nlp = setup_spacy_ner()
print("Initialized spaCy NER model")

Initialized spaCy NER model


In [29]:
# 3. Improved training function with higher dropout for better generalization
def train_improved_ner_model(nlp, train_data, iterations=50):
    """
    Train an improved spaCy NER model with higher dropout
    and more iterations for better generalization.
    """
    # Disable other pipeline components during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    with nlp.disable_pipes(*other_pipes):
        # Start training with more dropout
        optimizer = nlp.begin_training()

        print("Starting training with improved settings...")
        for itn in range(iterations):
            random.shuffle(train_data)
            losses = {}

            # Batch the examples
            batches = list(spacy.util.minibatch(train_data, size=4))

            for batch in batches:
                examples = []
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    examples.append(example)

                # Add more dropout for better generalization
                dropout = 0.5
                nlp.update(examples, sgd=optimizer, losses=losses, drop=dropout)

            print(f"Iteration {itn+1}/{iterations}, Loss: {losses}")

    return nlp

# Start with a small number of iterations for demonstration
trained_model = train_improved_ner_model(nlp, INITIAL_TRAINING_DATA, iterations=10)
print("Model training completed")

Starting training with improved settings...
Iteration 1/10, Loss: {'ner': 71.78298766538501}
Iteration 2/10, Loss: {'ner': 70.06426912546158}
Iteration 3/10, Loss: {'ner': 66.06362473964691}
Iteration 4/10, Loss: {'ner': 54.891689121723175}
Iteration 5/10, Loss: {'ner': 40.4315482173115}
Iteration 6/10, Loss: {'ner': 21.125938540208153}
Iteration 7/10, Loss: {'ner': 14.552001784197273}
Iteration 8/10, Loss: {'ner': 17.08498559865984}
Iteration 9/10, Loss: {'ner': 16.865550263580428}
Iteration 10/10, Loss: {'ner': 17.47814030936013}
Model training completed


In [30]:
# Save the model
model_path = os.path.join(OUTPUT_DIR, 'talmud_ner_model')
if not os.path.exists(model_path):
    os.makedirs(model_path)

trained_model.to_disk(model_path)
print(f"Saved model to {model_path}")

Saved model to /content/talmud_ner_output/talmud_ner_model


In [31]:
# 4. Entity ruler to supplement model predictions
from spacy.pipeline import EntityRuler

def add_entity_ruler(nlp):
    """Add an entity ruler to supplement ML-based predictions with rule-based matches."""
    # Create entity ruler
    if "entity_ruler" in nlp.pipe_names:
        nlp.remove_pipe("entity_ruler")

    ruler = nlp.add_pipe("entity_ruler", before="ner")

    # Define patterns
    patterns = [
        # Rabbi pattern
        {"label": "TANNA", "pattern": [{"LOWER": "rabbi"}, {"IS_TITLE": True}]},
        {"label": "AMORA", "pattern": [{"LOWER": "rav"}, {"IS_TITLE": True}]},

        # Specific names
        {"label": "TANNA", "pattern": "Rabbi Akiva"},
        {"label": "TANNA", "pattern": "Hillel"},
        {"label": "TANNA", "pattern": "Shammai"},
        {"label": "TANNA", "pattern": "Rabbi Shimon ben Yochai"},
        {"label": "AMORA", "pattern": "Rabbi Yochanan"},
        {"label": "AMORA", "pattern": "Rav Ashi"},
        {"label": "AMORA", "pattern": "Rav Papa"},
        {"label": "AMORA", "pattern": "Ravina"},

        # Places
        {"label": "TOPONYM", "pattern": "Nehardea"},
        {"label": "TOPONYM", "pattern": "Judea"},
        {"label": "TOPONYM", "pattern": "Pumbedita"},
        {"label": "TOPONYM", "pattern": "Bavel"},
        {"label": "TOPONYM", "pattern": "Jerusalem"},

        # Honorifics
        {"label": "HONORIFIC", "pattern": "Nasi"},
        {"label": "HONORIFIC", "pattern": "Rabban"},

        # House of pattern
        {"label": "GROUP", "pattern": [{"LOWER": "house"}, {"LOWER": "of"}, {"IS_TITLE": True}]},
    ]

    # Add patterns to ruler
    ruler.add_patterns(patterns)

    return nlp

In [32]:
# 5. Test function that combines model prediction with pattern matching
def test_detection(text, model):
    """Test entity detection with both model prediction and pattern matching."""
    doc = model(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    if not entities:
        print(f"No entities detected in: '{text}'")
    else:
        print(f"Detected entities in: '{text}'")
        for ent in entities:
            print(f"  - {ent[0]} ({ent[1]})")

    return entities

In [33]:
# Test the model on some examples
test_texts = [
    "Rabbi Yochanan said in the name of Rabbi Shimon ben Yochai.",
    "The Gemara relates that Rav Papa visited Nehardea.",
    "Rabban Gamliel was the Nasi of the Sanhedrin.",
    "Rabbi Akiva's students spread his teachings throughout Judea.",
    "The House of Hillel disagreed with the House of Shammai on this matter."
]

print("\nTesting model on sample texts:")
for i, text in enumerate(test_texts):
    doc = trained_model(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"\nText {i+1}: {text}")
    print(f"Detected entities: {entities}")

# Create a simple visualization function
def visualize_entities(text, model):
    from spacy import displacy

    doc = model(text)

    # Get HTML visualization
    html = displacy.render(doc, style="ent")

    # Display options for Colab
    from IPython.core.display import HTML, display
    display(HTML(html))

# Visualize a test example
print("\nEntity visualization example:")
visualize_entities(test_texts[0], trained_model)


Testing model on sample texts:

Text 1: Rabbi Yochanan said in the name of Rabbi Shimon ben Yochai.
Detected entities: []

Text 2: The Gemara relates that Rav Papa visited Nehardea.
Detected entities: []

Text 3: Rabban Gamliel was the Nasi of the Sanhedrin.
Detected entities: []

Text 4: Rabbi Akiva's students spread his teachings throughout Judea.
Detected entities: []

Text 5: The House of Hillel disagreed with the House of Shammai on this matter.
Detected entities: []

Entity visualization example:


<IPython.core.display.HTML object>

In [2]:


# Function to extract entities from corpus
def extract_entities_from_corpus(model, sentences, max_sentences=None):
    entities_list = []

    # Limit number of sentences if specified
    if max_sentences and max_sentences < len(sentences):
        process_sentences = sentences[:max_sentences]
    else:
        process_sentences = sentences

    # Process each sentence
    for i, sentence in tqdm(enumerate(process_sentences),
                           total=len(process_sentences),
                           desc="Extracting entities"):
        doc = model(sentence)

        # Extract entities
        for ent in doc.ents:
            entities_list.append({
                'text': ent.text,
                'label': ent.label_,
                'context': sentence,
                'start_pos': ent.start_char,
                'end_pos': ent.end_char
            })

    # Convert to DataFrame
    df = pd.DataFrame(entities_list)
    return df

# Extract from a sample of sentences for demonstration
print("\nExtracting entities from a sample of sentences...")
sample_size = min(100, len(talmud_sentences))
entities_df = extract_entities_from_corpus(trained_model, talmud_sentences[:sample_size])

print(f"Extracted {len(entities_df)} entities")
if not entities_df.empty:
    # Display sample of extracted entities
    print("\nSample of extracted entities:")
    display(entities_df.head())

    # Save extracted entities
    entities_csv_path = os.path.join(OUTPUT_DIR, 'talmud_entities.csv')
    entities_df.to_csv(entities_csv_path, index=False)
    print(f"Saved entities to {entities_csv_path}")

    # Basic analysis
    if len(entities_df) > 0:
        entity_counts = entities_df['label'].value_counts()

        plt.figure(figsize=(10, 6))
        sns.barplot(x=entity_counts.index, y=entity_counts.values)
        plt.title('Distribution of Detected Entity Types')
        plt.xlabel('Entity Type')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()

        # Save the figure
        plt.savefig(os.path.join(OUTPUT_DIR, 'entity_distribution.png'))
        plt.show()
else:
    print("No entities were extracted from the sample text.")

# Instructions for improving the model
print("""
# Next Steps to Improve Your Talmud NER System:

1. Manual Annotation:
   - Use the annotation sample to manually identify and label entities
   - Create a more comprehensive training dataset

2. Use Pretrained Transformers:
   - For better performance, consider fine-tuning a transformer model like BERT

3. Add Gazetteer Lists:
   - Create lists of known Talmudic names, places, and titles
   - Incorporate these into rule-based components to augment the ML approach

4. Handling Hebrew/Aramaic:
   - If working with original texts, use a multilingual model or specialized Hebrew NLP tools

5. Active Learning:
   - Implement a workflow where the model suggests entities and you confirm/correct them
   - Use this to iteratively improve the model with minimal annotation effort

6. Advanced Analysis:
   - Once you have sufficient entities extracted, perform network analysis to discover relationships between sages
   - Create visualizations of entity co-occurrences

SyntaxError: incomplete input (<ipython-input-2-aa47f2efb5c3>, line 272)