In [6]:
from utils import load_json, load_txt
first_paragraph = load_json("../bigger_study_sample/001-57899.json")['facts']
second_paragraph = load_txt("./gpt-4/001-57899.txt")

In [27]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize into sentences
    sentences = sent_tokenize(text)
    processed_sentences = []
    for sentence in sentences:
        # Tokenize into words
        words = word_tokenize(sentence)
        # Lowercase and remove punctuation
        words = [word.lower() for word in words if word.isalnum()]
        # Remove stop words
        words = [word for word in words if word not in stopwords.words('english')]
        processed_sentences.append(' '.join(words))
    return processed_sentences


[nltk_data] Downloading package punkt to /Users/ahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
import spacy

nlp = spacy.load('en_core_web_sm')

def extract_facts(sentences):
    facts = []
    for sentence in sentences:
        doc = nlp(sentence)
        for token in doc:
            # Look for subject-verb-object constructs
            if token.dep_ == 'ROOT':
                subject = [w.text for w in token.lefts if w.dep_ in ('nsubj', 'nsubjpass')]
                object_ = [w.text for w in token.rights if w.dep_ in ('dobj', 'pobj')]
                if subject and object_:
                    facts.append({
                        'subject': subject[0],
                        'verb': token.text,
                        'object': object_[0]
                    })
    return facts


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def facts_to_sentences(facts):
    return ['{} {} {}'.format(fact['subject'], fact['verb'], fact['object']) for fact in facts]

def compare_facts(facts1, facts2):
    sentences1 = facts_to_sentences(facts1)
    sentences2 = facts_to_sentences(facts2)
    
    vectorizer = TfidfVectorizer().fit(sentences1 + sentences2)
    vectors1 = vectorizer.transform(sentences1)
    vectors2 = vectorizer.transform(sentences2)
    
    discrepancies = []
    for i, vec2 in enumerate(vectors2):
        similarities = cosine_similarity(vec2, vectors1)
        max_similarity = similarities.max()
        if max_similarity < 0.5:  # Threshold for similarity
            discrepancies.append(facts2[i])
    return discrepancies

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def compare_facts_advanced(facts1, facts2):
    sentences1 = facts_to_sentences(facts1)
    sentences2 = facts_to_sentences(facts2)
    
    embeddings1 = model.encode(sentences1)
    embeddings2 = model.encode(sentences2)
    
    discrepancies = []
    for i, emb2 in enumerate(embeddings2):
        similarities = cosine_similarity([emb2], embeddings1)
        max_similarity = similarities.max()
        if max_similarity < 0.5:
            discrepancies.append(facts2[i])
    return discrepancies




In [35]:
def report_discrepancies(discrepancies):
    print("Facts in the second text not mentioned in the first:")
    for fact in discrepancies:
        print('- {} {} {}'.format(fact['subject'], fact['verb'], fact['object']))

text1, text2 = first_paragraph, second_paragraph

# Example usage:
text1_processed = preprocess_text(text1)
text2_processed = preprocess_text(text2)

facts1 = extract_facts(text1_processed)
facts2 = extract_facts(text2_processed)

discrepancies = compare_facts(facts1, facts2)

report_discrepancies(discrepancies)


Facts in the second text not mentioned in the first:
- case involves boner
- case follows 1
- judge ruled presence
- evidence implicated robbery
- parties involved boner
- echr ruled article
- justice required hearing
- boner represented appeal
- justice required representation


In [41]:
def report_discrepancies(discrepancies):
    print("Facts in the second text not mentioned in the first:")
    for fact in discrepancies:
        print('- {} {} {}'.format(fact['subject'], fact['verb'], fact['object']))

text1, text2 = first_paragraph, second_paragraph

# Example usage:
text1_processed = preprocess_text(text1)
text2_processed = preprocess_text(text2)

facts1 = extract_facts(text1_processed)
facts2 = extract_facts(text2_processed)

discrepancies = compare_facts_advanced(facts1, facts2)

report_discrepancies(discrepancies)


Facts in the second text not mentioned in the first:
- case follows 1
- evidence implicated robbery
- echr ruled article


In [43]:
import networkx as nx

def build_knowledge_graph(facts):
    G = nx.Graph()
    for fact in facts:
        subject = fact['subject']
        verb = fact['verb']
        object_ = fact['object']
        G.add_edge(subject, object_, label=verb)
    return G

# Build graphs for both texts
graph1 = build_knowledge_graph(facts1)
graph2 = build_knowledge_graph(facts2)

# Compare the graphs to find discrepancies
def compare_graphs(graph1, graph2):
    discrepancies = []
    edges1 = set(graph1.edges(data='label'))
    edges2 = set(graph2.edges(data='label'))
    new_edges = edges2 - edges1
    for edge in new_edges:
        discrepancies.append(edge)
    return discrepancies

discrepancies = compare_graphs(graph1, graph2)


In [44]:
discrepancies

[('events', 'employees', 'assaulted'),
 ('boner', 'parties', 'involved'),
 ('judge', 'presence', 'ruled'),
 ('case', 'boner', 'involves'),
 ('boner', 'appeal', 'represented'),
 ('justice', 'hearing', 'required'),
 ('evidence', 'robbery', 'implicated'),
 ('case', '1', 'follows'),
 ('echr', 'article', 'ruled'),
 ('justice', 'representation', 'required')]

In [46]:
graph1.edges(data='label')

EdgeDataView([('course', 'employees', 'assaulted'), ('applicant', 'trial', 'received'), ('applicant', 'fact', 'included'), ('applicant', 'application', 'granted'), ('trial', 'appeal', 'obtained'), ('mrs', 'evidence', 'give'), ('mrs', 'office', 'stated'), ('inquiries', 'responsibility', 'revealed'), ('judge', 'injustice', 'took'), ('judge', 'imprisonment', 'sentenced'), ('witness', 'boner', 'identified'), ('boner', 'solicitors', 'instructed'), ('boner', 'commission', 'applied'), ('jury', 'charges', 'found'), ('solicitor', 'conviction', 'lodged'), ('solicitors', 'appeal', 'filed'), ('appeal', 'grounds', 'contained'), ('appeal', 'board', 'see'), ('appeal', 'bench', 'heard'), ('grounds', 'court', 'appeal'), ('work', 'aid', 'covered'), ('aid', 'recommendation', 'made'), ('application', 'proceedings', 'extend'), ('application', 'reconsideration', 'involves'), ('application', 'commission', 'declared'), ('opinion', 'board', 'forwarded'), ('opinion', 'judgment', 'reproduced'), ('board', 'decisi

In [49]:
import spacy
from sentence_transformers import SentenceTransformer, util

# Load spaCy model for NER, POS tagging, dependency parsing
nlp = spacy.load('en_core_web_sm')

# Load a sentence transformer model for semantic similarity
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def preprocess(text):
    # Tokenize and extract entities, facts from a paragraph
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return sentences, entities

def extract_facts(sentences):
    # Using dependency parsing to extract simple facts
    facts = []
    for sent in sentences:
        doc = nlp(sent)
        # Example fact extraction logic using subject-predicate-object relations
        for token in doc:
            if token.dep_ == 'ROOT':  # The main verb of the sentence
                subj = [w for w in token.lefts if w.dep_ == 'nsubj']
                obj = [w for w in token.rights if w.dep_ in ('dobj', 'pobj')]
                if subj and obj:
                    facts.append((subj[0], token, obj[0]))  # (subject, verb, object)
    return facts

def compare_facts(facts1, facts2):
    # Convert facts to text and then to embeddings
    facts1_text = [' '.join([str(f) for f in fact]) for fact in facts1]
    facts2_text = [' '.join([str(f) for f in fact]) for fact in facts2]
    
    # Calculate semantic similarity
    embeddings1 = model.encode(facts1_text, convert_to_tensor=True)
    embeddings2 = model.encode(facts2_text, convert_to_tensor=True)
    
    # Find new facts by comparing similarity
    new_facts = []
    for idx, fact2 in enumerate(facts2_text):
        similarity = util.pytorch_cos_sim(embeddings2[idx], embeddings1)
        if similarity.max() < 0.8:  # Threshold for considering a fact as new
            new_facts.append(fact2)
    return new_facts


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

New Facts in Paragraph 2: []


In [51]:
# Example usage
paragraph1 = first_paragraph
paragraph2 = second_paragraph

sentences1, entities1 = preprocess(paragraph1)
sentences2, entities2 = preprocess(paragraph2)

facts1 = extract_facts(sentences1)
facts2 = extract_facts(sentences2)

new_facts = compare_facts(facts1, facts2)
print("New Facts in Paragraph 2:", new_facts)


New Facts in Paragraph 2: ['case involves Boner', 'Events committed robbery', 'evidence implicated Boner', 'Boner represented himself', 'interests required representation']


In [56]:
from sentence_transformers import SentenceTransformer, util
import spacy

# Load spaCy model and legal-specific sentence transformer model
nlp = spacy.load('en_core_web_sm')
legal_model = SentenceTransformer('nlpaueb/legal-bert-base-uncased')

No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


In [76]:
import spacy
from spacy.language import Language
from spacy.tokens import Span

nlp = spacy.load('en_core_web_sm')

# Add custom rules to handle legal abbreviations (e.g., "Art." for Article, "Sec." for Section)
@Language.component("legal_abbreviation_rules")
def legal_abbreviation_rules(doc):
    # Define a set of common legal abbreviations
    legal_abbreviations = {"Art.", "Sec.", "Mr.", "Mrs.", "Dr.", "Ms.", "Prof.", "Gov."}
    
    for token in doc[:-1]:  # Iterate over the tokens but skip the last token
        if token.text in legal_abbreviations and token.i < len(doc) - 1:
            next_token = doc[token.i + 1]
            if next_token.is_title:  # Check if the next token starts a new title
                # Merge abbreviation with the next token
                with doc.retokenize() as retokenizer:
                    span = doc[token.i: token.i + 2]  # Merge abbreviation with next token
                    retokenizer.merge(span)
    return doc

# Register the component with spaCy
nlp.add_pipe("legal_abbreviation_rules", before="parser")

def extract_sentences(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Extract sentences, ensuring custom handling for legal abbreviations
    sentences = [sent.text.strip() for sent in doc.sents]
    
    # Filter out any overly short sentences that may be due to tokenization noise
    sentences = [sent for sent in sentences if len(sent) > 5]
    
    return sentences

def compare_sentences(paragraph1, paragraph2, threshold=0.8):
    # Extract sentences
    sentences1 = extract_sentences(paragraph1)
    sentences2 = extract_sentences(paragraph2)
    
    # Encode sentences using a legal-specific model
    embeddings1 = legal_model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = legal_model.encode(sentences2, convert_to_tensor=True)
    
    # Compare each sentence in paragraph 2 with sentences from paragraph 1
    new_sentences = []
    for idx, emb2 in enumerate(embeddings2):
        cosine_scores = util.pytorch_cos_sim(emb2, embeddings1)
        max_score = cosine_scores.max().item()
        
        # Only consider sentences that are sufficiently dissimilar
        if max_score < threshold:
            new_sentences.append(sentences2[idx])
    
    return new_sentences

In [78]:
# Example Usage
paragraph1 = first_paragraph
paragraph2 = second_paragraph

new_facts = compare_sentences(paragraph1, paragraph2)
print("New facts in paragraph 2:", new_facts)

IndexError: [E040] Attempt to access token at 555, max length 555.

In [77]:
sentences1 = extract_sentences(second_paragraph)
sentences1

IndexError: [E040] Attempt to access token at 555, max length 555.