In [3]:
# Import necessary libraries
import re

import numpy as np
import spacy
import tensorflow as tf
import tensorflow_text as text
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

# Load the small English spaCy pipeline (its `.ents` are used for named-entity
# extraction) and the Universal Sentence Encoder v4 (used to embed entity strings
# for similarity matching). Both loads run once, when this module/cell executes.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): tf.saved_model.load is documented as taking a SavedModel export
# directory. A tfhub.dev URL handle is normally resolved with tensorflow_hub's
# hub.load (or by downloading the model first) -- confirm this call actually
# works in the target environment.
similarity_model = tf.saved_model.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Regular expression patterns for capturing defined terms and cross references within text.
# Defined terms: the shortest run of characters enclosed in curly double quotes.
defined_term_pattern = r"“(.*?)”"
# Cross references: "Section", one whitespace character, then a dotted number
# such as 2, 2.1 or 2.1.3. Dots are only consumed between digit groups so that a
# sentence-ending period ("... as set out in Section 2.1.") is not captured as
# part of the reference.
cross_ref_pattern = r"Section\s[0-9]+(?:\.[0-9]+)*"

# Function to replace entities within text. Can replace entities in either original text or replacement text.
def replace_entities(original_text, replacement_text, user_replace=False):
    """Align the entities of two texts by substituting matched entities.

    Entities are extracted from both texts with ``extract_entities`` and paired
    with ``match_entities``; each pair is ``(original_entity, replacement_entity)``.

    Args:
        original_text: The text whose entities are treated as the "original" side.
        replacement_text: The text whose entities are treated as the "replacement" side.
        user_replace: When False (default), ``replacement_text`` is rewritten so
            that its entities are replaced by the matched entities of
            ``original_text``. When True, ``original_text`` is rewritten so that
            its entities are replaced by the matched entities of
            ``replacement_text``.

    Returns:
        A ``(original_text, replacement_text)`` tuple; only the text selected by
        ``user_replace`` is modified, the other is returned unchanged.
    """
    # Extract entities from both texts and pair them up.
    original_entities = extract_entities(original_text)
    replacement_entities = extract_entities(replacement_text)
    matched_entities = match_entities(original_entities, replacement_entities)

    for original_entity, replacement_entity in matched_entities:
        # An empty search string would make str.replace insert the substitute
        # between every character; identical pairs need no work.
        if not original_entity or not replacement_entity:
            continue
        if original_entity == replacement_entity:
            continue

        if user_replace:
            # Search for the entity that actually occurs in the original text.
            original_text = original_text.replace(original_entity, replacement_entity)
        else:
            # Search for the entity that actually occurs in the replacement text.
            replacement_text = replacement_text.replace(replacement_entity, original_entity)

    return original_text, replacement_text

# Function to extract legal entities within text, including named entities, defined terms and cross references
def extract_entities(text):
    """Collect the legal entities found in ``text``, grouped by category.

    Args:
        text: The text to analyse.

    Returns:
        A dict with three keys:
        ``"named_entities"`` -- list of ``(entity_text, label)`` tuples taken
        from the ``ents`` of the spaCy pipeline ``nlp``;
        ``"defined_terms"`` -- list of matches of ``defined_term_pattern``;
        ``"cross_refs"`` -- list of matches of ``cross_ref_pattern``.
    """
    doc = nlp(text)
    return {
        "named_entities": [(str(span), span.label_) for span in doc.ents],
        "defined_terms": re.findall(defined_term_pattern, text),
        "cross_refs": re.findall(cross_ref_pattern, text),
    }

# Function to match entities between two sets of extracted entities
def match_entities(original_entities, replacement_entities):
    """Pair each original entity with its most similar replacement entity.

    For every entity category (``"named_entities"``, ``"defined_terms"``,
    ``"cross_refs"``) the entity strings of both sides are embedded with
    ``similarity_model`` and each original entity is paired with the replacement
    entity whose embedding has the highest cosine similarity. No similarity
    threshold is applied, so every original entity receives a partner whenever
    the replacement side has at least one entity of the same category.

    Args:
        original_entities: Dict of entity lists keyed by category for the
            original text. Entries may be plain strings or ``(text, label)``
            tuples; only the text is used for matching.
        replacement_entities: Dict of the same shape for the replacement text.

    Returns:
        A list of ``(original_entity_text, replacement_entity_text)`` string
        tuples, ordered by category and then by the order of the original
        entities. Categories with no entities on either side contribute nothing.
    """
    pairings = []
    for entity_type in ("named_entities", "defined_terms", "cross_refs"):
        original_texts = [_entity_text(ent) for ent in original_entities[entity_type]]
        replacement_texts = [_entity_text(ent) for ent in replacement_entities[entity_type]]

        # Nothing to pair if either side is empty; embedding an empty list and
        # taking argmax over an empty axis would both fail.
        if not original_texts or not replacement_texts:
            continue

        # Embed both sides as flat lists of strings and compare them pairwise.
        original_embeddings = similarity_model(tf.constant(original_texts))
        replacement_embeddings = similarity_model(tf.constant(replacement_texts))
        similarity_scores = cosine_similarity(original_embeddings, replacement_embeddings)

        # Row i holds the scores of original entity i against every replacement entity.
        best_matches = np.argmax(similarity_scores, axis=1)
        pairings.extend(
            (original_texts[i], replacement_texts[j]) for i, j in enumerate(best_matches)
        )
    return pairings


def _entity_text(entity):
    """Return the surface text of an entity given as a string or a (text, label) tuple."""
    return entity[0] if isinstance(entity, tuple) else entity

ModuleNotFoundError: No module named 'spacy'