In [None]:
import fitz  # PyMuPDF for PDF text extraction
import joblib  # for loading the trained model and vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF Vectorizer
import nltk  # Natural Language Toolkit for stopwords
from nltk.corpus import stopwords
import spacy  # spaCy for text processing
import re  # Regular expressions for text cleaning
from sklearn.metrics.pairwise import cosine_similarity  # Cosine similarity for semantic comparison

: 

In [4]:
# Load the trained model and the TF-IDF vectorizer from disk.
# `model` and `vectorizer` are module-level names used by annotate_paragraphs and
# calculate_similarity further down.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load .pkl artifacts that come from a trusted source.
# NOTE(review): the recorded output of this cell links to scikit-learn's
# model-persistence limitations page, which looks like a version-mismatch warning —
# confirm the artifacts were saved with the installed scikit-learn version.
model = joblib.load('ensemble_model.pkl')
vectorizer = joblib.load('ensemble_model_tfidf_vectorizer.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Load the spaCy English pipeline "en_core_web_sm".
# `nlp` is the module-level pipeline that clean_text calls to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Make sure the NLTK 'stopwords' corpus is available locally (nltk.download reports
# "already up-to-date" when it is), then build the English stop-word collection.
# `stop_words` is a set because clean_text tests every token for membership in it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arnab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages joined with no separator between pages
        (an empty string when the document has no pages).
    """
    # Open the document in a context manager so it is closed even when text
    # extraction raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # Joining once avoids repeated string concatenation across pages.
        return "".join(page.get_text("text") for page in doc)

In [8]:
# Function to split text into paragraphs
def split_text_into_paragraphs(text):
    """Break extracted text into a list of non-empty, stripped paragraphs.

    A paragraph boundary is a newline followed by a non-breaking space
    (``\\xa0``), optionally with a numbered prefix in between (digits, a dot
    and one whitespace character, e.g. ``"\\n3. \\xa0"``).

    Args:
        text: The full text to split.

    Returns:
        list[str]: The pieces between boundaries, each stripped of surrounding
        whitespace; pieces that are empty after stripping are dropped.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r'\n\d+\.\s\xa0|\n\xa0', text))
    return [chunk for chunk in stripped_chunks if chunk]

In [9]:
# Function to clean and preprocess text
def clean_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The steps run in this fixed order:
      1. lowercase the text;
      2. delete every character that is neither a word character nor whitespace;
      3. collapse each whitespace run into a single space;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. run the remainder through the module-level spaCy pipeline ``nlp`` and
         join the tokens' lemmas with single spaces.

    Args:
        text: The raw text to clean.

    Returns:
        str: The lemmatized, stop-word-free text.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    kept_words = [candidate for candidate in single_spaced.split() if candidate not in stop_words]
    parsed = nlp(" ".join(kept_words))
    return " ".join([tok.lemma_ for tok in parsed])

In [10]:
# Function to annotate paragraphs with predicted clause types
def annotate_paragraphs(paragraphs):
    """Pair each paragraph with the clause type predicted for it.

    Every paragraph is cleaned with ``clean_text``, turned into a feature
    vector by the module-level ``vectorizer``, and classified by the
    module-level ``model``; paragraphs are processed one at a time.

    Args:
        paragraphs: Iterable of raw paragraph strings.

    Returns:
        list[tuple]: ``(original_paragraph, predicted_clause_type)`` tuples in
        input order, where the prediction is the first element of the
        ``model.predict`` result. Empty when ``paragraphs`` is empty.
    """
    def _predict_clause_type(raw_paragraph):
        # Classify a single paragraph; the vectorizer receives a one-item list.
        features = vectorizer.transform([clean_text(raw_paragraph)])
        return model.predict(features)[0]

    return [(raw_paragraph, _predict_clause_type(raw_paragraph)) for raw_paragraph in paragraphs]

In [11]:
# Input PDFs for the two contracts to annotate and compare.
# Both paths are relative, so they are resolved against the notebook's working directory.
pdf_path_first_contract = 'Contract1.pdf'
pdf_path_second_contract = 'Contract2.pdf'

In [12]:
# First contract, step 1: read the raw text of every page of the PDF
first_contract_text = extract_text_from_pdf(pdf_path_first_contract)

# Step 2: break the raw text into stripped, non-empty paragraphs
first_contract_paragraphs = split_text_into_paragraphs(first_contract_text)

# Step 3: build (paragraph, predicted clause type) tuples, one per paragraph
annotated_first_contract_paragraphs = annotate_paragraphs(first_contract_paragraphs)

In [13]:
# Second contract, step 1: read the raw text of every page of the PDF
second_contract_text = extract_text_from_pdf(pdf_path_second_contract)

# Step 2: break the raw text into stripped, non-empty paragraphs
second_contract_paragraphs = split_text_into_paragraphs(second_contract_text)

# Step 3: build (paragraph, predicted clause type) tuples, one per paragraph
annotated_second_contract_paragraphs = annotate_paragraphs(second_contract_paragraphs)

In [14]:
# Show every paragraph of the first contract with its predicted clause type.
# Each paragraph is emitted by a single print call; the trailing "" reproduces
# the blank line that separates consecutive paragraphs.
print("Annotated Paragraphs for the First Contract:")
for idx, (text, clause_type) in enumerate(annotated_first_contract_paragraphs, start=1):
    print(
        f"Paragraph {idx}:",
        f"Text: {text}",
        f"Predicted Clause Type: {clause_type}",
        "",
        sep="\n",
    )

Annotated Paragraphs for the First Contract:
Paragraph 1:
Text: First Amendment to Restricted Stock Unit
Agreement (Strategic Growth PSUs) between
Intel and Patrick Gelsinger, dated November 18,
2022
Contract Categories: Business Finance - Stock Agreements
EX-10.2 3 d310344dex102.htm EX-10.2 EX-10.2
Exhibit 10.2
Strategic Growth PSUs
INTEL CORPORATION
2021 INDUCEMENT PLAN
FIRST AMENDMENT TO
RESTRICTED STOCK UNIT AGREEMENT
(for Performance-Based Restricted Stock Units (or “PSUs”))
This First Amendment (this “Amendment”) to the Restricted Stock Unit Agreement by and between
Patrick Gelsinger (“you”) and Intel Corporation (the “Corporation”), which provided for the grant of 457,789
PSUs, effective as of February  15, 2021 (the “Strategic Growth PSU Award Agreement”), is made by and
between you and the Corporation, effective as of November  18, 2022 (the “Amendment Effective Date”).
Capitalized terms contained herein but not defined herein shall have the meanings ascribed to them in the 20

In [15]:
# Show every paragraph of the second contract with its predicted clause type.
# Each paragraph is emitted by a single print call; the trailing "" reproduces
# the blank line that separates consecutive paragraphs.
print("Annotated Paragraphs for the Second Contract:")
for idx, (text, clause_type) in enumerate(annotated_second_contract_paragraphs, start=1):
    print(
        f"Paragraph {idx}:",
        f"Text: {text}",
        f"Predicted Clause Type: {clause_type}",
        "",
        sep="\n",
    )

Annotated Paragraphs for the Second Contract:
Paragraph 1:
Text: First Amendment to Restricted Stock Unit
Agreement (Outperformance PSUs) between
Intel and Patrick Gelsinger, dated November 18,
2022
Contract Categories: Business Finance - Stock Agreements
EX-10.3 4 d310344dex103.htm EX-10.3 EX-10.3
Exhibit 10.3
Outperformance PSUs
INTEL CORPORATION
2021 INDUCEMENT PLAN
FIRST AMENDMENT TO
RESTRICTED STOCK UNIT AGREEMENT
(for Performance-Based Restricted Stock Units (or “PSUs”))
This First Amendment (this “Amendment”) to the Restricted Stock Unit Agreement by and between
Patrick Gelsinger (“you”) and Intel Corporation (the “Corporation”), which provided for the grant of 3,275,199
PSUs, effective as of February  15, 2021 (the “Outperformance PSU Award Agreement”), is made by and
between you and the Corporation, effective as of November  18, 2022 (the “Amendment Effective Date”).
Capitalized terms contained herein but not defined herein shall have the meanings ascribed to them in the 2021


In [16]:
# Function to calculate cosine similarity between two texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between two texts.

    Each text is cleaned with ``clean_text`` and transformed on its own by the
    module-level ``vectorizer``; the two resulting single-row vectors are then
    compared with ``cosine_similarity``.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        The single entry of the 1x1 similarity matrix for the two vectors.
    """
    first_vector, second_vector = (
        vectorizer.transform([clean_text(raw_text)]) for raw_text in (text1, text2)
    )
    return cosine_similarity(first_vector, second_vector)[0][0]

In [17]:
# Perform semantic comparison between paragraphs from both contracts.
# A pair is kept when its cosine similarity is strictly greater than this cut-off.
SIMILARITY_THRESHOLD = 0.6

# Clean and vectorize each paragraph exactly once. Doing this inside the pair
# loop would repeat the spaCy lemmatization and the TF-IDF transform for every
# (first, second) combination: 2 * n1 * n2 preprocessing runs instead of n1 + n2.
# Each paragraph is still transformed on its own and compared row-against-row,
# so every score is computed exactly as calculate_similarity would compute it.
first_contract_vectors = [
    vectorizer.transform([clean_text(paragraph_text)])
    for paragraph_text, _ in annotated_first_contract_paragraphs
]
second_contract_vectors = [
    vectorizer.transform([clean_text(paragraph_text)])
    for paragraph_text, _ in annotated_second_contract_paragraphs
]

# Each entry is (1-based index in first contract, 1-based index in second contract, similarity).
similar_paragraphs = []
for idx1, vector1 in enumerate(first_contract_vectors, start=1):
    for idx2, vector2 in enumerate(second_contract_vectors, start=1):
        similarity = cosine_similarity(vector1, vector2)[0][0]
        if similarity > SIMILARITY_THRESHOLD:
            similar_paragraphs.append((idx1, idx2, similarity))

In [18]:
# Report each retained pair: paragraph number in the first contract,
# paragraph number in the second contract, and their similarity score.
print("\nPairs of Similar Paragraphs (similarity > 60%):")
for first_idx, second_idx, score in similar_paragraphs:
    print(f"Paragraph {first_idx} (First Contract) - Paragraph {second_idx} (Second Contract) | Similarity: {score}")


Pairs of Similar Paragraphs (similarity > 60%):
Paragraph 1 (First Contract) - Paragraph 1 (Second Contract) | Similarity: 0.8733716662631975
Paragraph 2 (First Contract) - Paragraph 2 (Second Contract) | Similarity: 1.0000000000000002
Paragraph 2 (First Contract) - Paragraph 3 (Second Contract) | Similarity: 0.6000000000000002
Paragraph 3 (First Contract) - Paragraph 3 (Second Contract) | Similarity: 0.9192723907499134
Paragraph 4 (First Contract) - Paragraph 3 (Second Contract) | Similarity: 0.6934760925288567
Paragraph 5 (First Contract) - Paragraph 3 (Second Contract) | Similarity: 0.632455532033676
Paragraph 7 (First Contract) - Paragraph 6 (Second Contract) | Similarity: 0.6274950199005568
Paragraph 8 (First Contract) - Paragraph 6 (Second Contract) | Similarity: 0.6741998624632424
Paragraph 9 (First Contract) - Paragraph 7 (Second Contract) | Similarity: 0.9672041516493521
Paragraph 10 (First Contract) - Paragraph 8 (Second Contract) | Similarity: 1.0
Paragraph 11 (First Contra