<a href="https://colab.research.google.com/github/EzraBrand/Talmud-NER/blob/main/talmud_ner_model_24_feb_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Talmud Named Entity Recognition (NER) System
# Based on Steinsaltz Talmud Translation

# Install required packages


In [None]:
# Install required packages (`-q` keeps pip's output quiet).
# NOTE(review): transformers, datasets and nltk are installed here but are not
# imported by any cell visible in this notebook — confirm they are still needed.
!pip install spacy transformers datasets nltk seaborn pandas matplotlib -q
# Download the small English pipeline that a later cell loads with
# spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

import os
import re
import json
# spaCy pipeline and its training utilities
import spacy
from spacy.training import Example
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter

# Set paths (modify as needed)


In [None]:
# Locations used throughout the notebook (modify as needed).
# OUTPUT_DIR is where results are written; FILE_PATH is the corpus to read.
# NOTE(review): both paths sit directly under /content. That does not look
# like a mounted Google Drive location (presumably those live under
# /content/drive) — confirm where the translation file is actually uploaded.
OUTPUT_DIR = '/content/talmud_ner_output'
FILE_PATH = '/content/talmud steinsaltz translation.txt'

# Create output directory if it doesn't exist


In [None]:
# Create the output directory (including any missing parents).
# exist_ok=True makes the cell safe to re-run and removes the gap between
# "check that it exists" and "create it" that the two-step form had.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Load Talmud text file
def load_text_file(file_path):
    """Return the full contents of ``file_path`` as a single string.

    The file is opened in text mode and decoded as UTF-8. Errors raised by
    ``open``/``read`` (missing file, bad encoding, ...) propagate to the caller.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

In [None]:
# Load the corpus into `full_text`. If the file cannot be read, fall back to a
# tiny built-in sample so the rest of the notebook can still be demonstrated.
print(f"Loading Talmud text from {FILE_PATH}...")
try:
    full_text = load_text_file(FILE_PATH)
except (OSError, UnicodeDecodeError) as e:
    # Only "file is missing/unreadable/not valid UTF-8" triggers the fallback.
    # Any other exception is a genuine bug and is allowed to surface instead
    # of being silently replaced by dummy data.
    print(f"Error loading file: {e}")
    # Create dummy text for demonstration if file isn't available
    print("Creating dummy text for demonstration...")
    full_text = """Rabbi Akiva taught that love your neighbor as yourself is a great principle in the Torah.
    Abba bar Pappa from Nehardea discussed this with Rav Huna and Rav Hisda.
    The Sages taught that one should always be humble like Hillel and not strict like Shammai.
    Rabbi Yehuda HaNasi compiled the Mishnah according to the teachings of the Tannaim."""
else:
    print(f"Successfully loaded text. Total characters: {len(full_text)}")
    # Show sample
    print("\nSample of the text:")
    print(full_text[:500] + "...\n")

In [None]:
# Load pre-trained model instead of blank
import spacy
from spacy.training import Example
import random

# Load the English model we just installed
nlp = spacy.load('en_core_web_sm')

# Disable (not remove) every pretrained component other than 'ner'.
# select_pipes is the spaCy v3 replacement for the deprecated disable_pipes;
# called outside a `with` block, the components stay disabled.
components = [pipe_name for pipe_name in nlp.pipe_names if pipe_name != 'ner']
nlp.select_pipes(disable=components)

# Remove the existing NER component (we'll add our own)
if 'ner' in nlp.pipe_names:
    nlp.remove_pipe('ner')

# Add our custom, freshly created NER component
ner = nlp.add_pipe('ner')

# Add the Talmudic entity labels
TALMUD_ENTITIES = [
    "PERSON", "TANNA", "AMORA", "HONORIFIC", "PATRONYMIC",
    "MATRONYMIC", "TOPONYM", "OCCUPATION", "EPITHET", "GROUP", "PLACEHOLDER"
]

for ent in TALMUD_ENTITIES:
    ner.add_label(ent)

# NOTE(review): a later cell rebinds `nlp` to spacy.blank("en") with an entity
# ruler, so this statistical pipeline is not the one that is tested or saved
# further down — confirm whether this cell is still needed.

In [None]:
# Training examples in spaCy's (text, {"entities": [(start, end, label)]}) form.
# Offsets are character positions, end-exclusive, and must land exactly on the
# entity text: text[start:end] has to equal the name being labelled.
IMPROVED_TRAINING_DATA = [
    # Rabbi Akiva examples
    ("Rabbi Akiva taught a valuable lesson to his students.",
     {"entities": [(0, 11, "TANNA")]}),
    ("The great sage Rabbi Akiva said this principle.",
     {"entities": [(15, 26, "TANNA")]}),

    # Rav Ashi examples
    ("Rav Ashi from Bavel discussed this with Ravina.",
     {"entities": [(0, 8, "AMORA"), (14, 19, "TOPONYM"), (40, 46, "AMORA")]}),

    # TODO: add more examples

    # Important: Include examples for your test sentences
    ("Rabbi Yochanan said in the name of Rabbi Shimon ben Yochai.",
     {"entities": [(0, 14, "AMORA"), (35, 58, "TANNA")]}),
    ("The Gemara relates that Rav Papa visited Nehardea.",
     {"entities": [(24, 32, "AMORA"), (41, 49, "TOPONYM")]}),
    ("Rabban Gamliel was the Nasi of the Sanhedrin.",
     {"entities": [(0, 14, "TANNA"), (23, 27, "HONORIFIC")]}),
    # The span stops before the possessive "'s" so it covers only the name.
    ("Rabbi Akiva's students spread his teachings throughout Judea.",
     {"entities": [(0, 11, "TANNA"), (55, 60, "TOPONYM")]}),
    ("The House of Hillel disagreed with the House of Shammai on this matter.",
     {"entities": [(13, 19, "TANNA"), (48, 55, "TANNA")]})
]

In [None]:
# Install the spacy-lookups-data package (presumably lookup tables for spaCy).
# NOTE(review): no visible cell references this package directly — confirm it
# is still required.
!pip install spacy-lookups-data

In [None]:
# Create a simple model with just a tokenizer and entity ruler (no ML).
# This rebinds `nlp`, replacing the en_core_web_sm-based pipeline built in the
# earlier cell; the testing, saving and corpus-extraction cells below all run
# on this rule-based pipeline.
nlp = spacy.blank("en")

# Add entity ruler for pattern matching; `ruler` is the handle used to
# register the patterns.
ruler = nlp.add_pipe("entity_ruler")

# Entity-ruler patterns covering the test examples and training data.
def _token_pattern(label, lowercase_tokens, exact_tokens):
    """Token-level entry: lower-cased lead-in tokens, then exact-text tokens."""
    token_specs = [{"LOWER": word} for word in lowercase_tokens]
    token_specs.extend({"TEXT": word} for word in exact_tokens)
    return {"label": label, "pattern": token_specs}


def _phrase_pattern(label, phrase):
    """Phrase-level entry: the pattern is the literal string itself."""
    return {"label": label, "pattern": phrase}


patterns = [
    # Tannaim
    _token_pattern("TANNA", ["rabbi"], ["Akiva"]),
    _token_pattern("TANNA", ["rabbi"], ["Shimon", "ben", "Yochai"]),
    _phrase_pattern("TANNA", "Hillel"),
    _phrase_pattern("TANNA", "Shammai"),
    _token_pattern("TANNA", ["rabban"], ["Gamliel"]),

    # Amoraim
    _token_pattern("AMORA", ["rabbi"], ["Yochanan"]),
    _token_pattern("AMORA", ["rav"], ["Papa"]),
    _token_pattern("AMORA", ["rav"], ["Ashi"]),
    _phrase_pattern("AMORA", "Ravina"),

    # Places
    _phrase_pattern("TOPONYM", "Nehardea"),
    _phrase_pattern("TOPONYM", "Judea"),
    _phrase_pattern("TOPONYM", "Bavel"),

    # Honorifics
    _phrase_pattern("HONORIFIC", "Nasi"),

    # "House of X" groups
    _token_pattern("GROUP", ["house", "of"], ["Hillel"]),
    _token_pattern("GROUP", ["house", "of"], ["Shammai"]),
]

# Add patterns to the ruler (all entries of `patterns` in a single call)
ruler.add_patterns(patterns)

# No training needed - this is a rule-based approach
print("Created rule-based entity recognition model")

In [None]:
# Test function
def test_model(texts, model):
    print("\nTesting model on sample texts:")
    for i, text in enumerate(texts):
        doc = model(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        print(f"\nText {i+1}: {text}")
        if entities:
            print(f"Detected entities: {entities}")
        else:
            print("No entities detected.")

# Test on the problematic examples (the same sentences that appear in the
# training data and that the ruler patterns were written for)
test_texts = [
    "Rabbi Yochanan said in the name of Rabbi Shimon ben Yochai.",
    "The Gemara relates that Rav Papa visited Nehardea.",
    "Rabban Gamliel was the Nasi of the Sanhedrin.",
    "Rabbi Akiva's students spread his teachings throughout Judea.",
    "The House of Hillel disagreed with the House of Shammai on this matter."
]

test_model(test_texts, nlp)

# Save the model to the configured output directory instead of repeating the
# path as a second literal, so editing OUTPUT_DIR moves the saved model too.
model_path = OUTPUT_DIR
nlp.to_disk(model_path)
print(f"\nSaved rule-based model to {model_path}")

In [None]:
# Process a larger portion of your Talmud text
def extract_entities_from_corpus(model, sentences, max_sentences=None):
    """Run ``model`` over ``sentences`` and tabulate every entity it reports.

    Args:
        model: Callable taking one sentence string and returning an object
            with an ``ents`` iterable; each entity must expose ``text``,
            ``label_``, ``start_char`` and ``end_char``.
        sentences: Sized, sliceable sequence of sentence strings.
        max_sentences: If truthy and smaller than ``len(sentences)``, only the
            first ``max_sentences`` sentences are processed.

    Returns:
        pandas.DataFrame with one row per entity and the columns ``text``,
        ``label``, ``context`` (the source sentence), ``start_pos`` and
        ``end_pos``. The columns are present even when no entity was found,
        so column access on an empty result does not fail.
    """
    import pandas as pd

    # The progress bar is cosmetic: use tqdm when available, otherwise iterate
    # plainly rather than failing the whole extraction.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_unused):
            return iterable

    columns = ['text', 'label', 'context', 'start_pos', 'end_pos']

    # Limit number of sentences if specified
    if max_sentences and max_sentences < len(sentences):
        process_sentences = sentences[:max_sentences]
    else:
        process_sentences = sentences

    entities_list = []
    for sentence in tqdm(process_sentences,
                         total=len(process_sentences),
                         desc="Extracting entities"):
        doc = model(sentence)

        # One record per entity, keeping the sentence as context
        for ent in doc.ents:
            entities_list.append({
                'text': ent.text,
                'label': ent.label_,
                'context': sentence,
                'start_pos': ent.start_char,
                'end_pos': ent.end_char
            })

    # Explicit columns keep the schema stable for an empty result
    return pd.DataFrame(entities_list, columns=columns)

# Choose how many sentences to process (adjust as needed)
# Start with a smaller sample like 500 to test
sample_size = 500  # Increase this to process more text

# No earlier cell creates `talmud_sentences`, so on a fresh kernel build it
# here from `full_text`. The split is a simple heuristic (break after . ! or ?
# followed by whitespace) and may mis-split abbreviations. If the kernel
# already holds a `talmud_sentences` list, it is left untouched.
if 'talmud_sentences' not in globals():
    talmud_sentences = [
        piece.strip()
        for piece in re.split(r'(?<=[.!?])\s+', full_text)
        if piece.strip()
    ]

sample_sentences = talmud_sentences[:sample_size]

# Extract entities (report the number actually processed, which can be
# smaller than sample_size for a short corpus)
print(f"Extracting entities from {len(sample_sentences)} sentences...")
entities_df = extract_entities_from_corpus(nlp, sample_sentences)

# Save the results inside the configured output directory
entities_csv_path = os.path.join(OUTPUT_DIR, 'extracted_entities.csv')
entities_df.to_csv(entities_csv_path, index=False)
print(f"Extracted {len(entities_df)} entities and saved to {entities_csv_path}")

# Display sample of results
if not entities_df.empty:
    print("\nSample of extracted entities:")
    display(entities_df.head(10))

    # Count entity types
    print("\nEntity type distribution:")
    print(entities_df['label'].value_counts())