# NLP Experiment-8


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# models ('punkt' / 'punkt_tab') and the English stopword corpus.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
def preprocess_text(text):
    """Normalize raw text into a space-joined string of content tokens.

    Pipeline: lowercase -> delete punctuation -> NLTK ``word_tokenize`` ->
    drop English stopwords -> re-join with single spaces.

    Punctuation is deleted rather than replaced by a space, so hyphenated
    words are merged (e.g. "starch-rich" becomes "starchrich").

    Args:
        text: The input string to normalize.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    import unicodedata  # local import: only needed for the Unicode punctuation pass

    text = text.lower()
    # ASCII punctuation and symbols, removed exactly as before.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # string.punctuation only covers ASCII, so typographic marks such as
    # curly quotes and em dashes used to survive into the token stream.
    # Drop every character whose Unicode general category is punctuation (P*).
    text = ''.join(
        ch for ch in text if not unicodedata.category(ch).startswith('P')
    )
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

In [6]:
def jaccard_similarity(text1, text2):
    """Jaccard index of the whitespace-separated word sets of two texts.

    Args:
        text1: First text; split on whitespace into a set of words.
        text2: Second text; split on whitespace into a set of words.

    Returns:
        ``|A & B| / |A | B|`` as a float, or ``0`` when both texts contain
        no words (empty union).
    """
    words_a, words_b = set(text1.split()), set(text2.split())
    combined = words_a | words_b
    if not combined:
        # Neither text has any word: avoid dividing by zero.
        return 0
    shared = words_a & words_b
    return len(shared) / len(combined)

In [7]:
def longest_common_subsequence(text1, text2):
    """Length of the longest common subsequence of words in two texts.

    Both texts are split on whitespace and compared token by token. The
    result is a token count, not a normalized score.

    Only two rows of the dynamic-programming table are kept at a time, so
    memory use is proportional to the shorter text instead of to the product
    of both lengths.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        int: Number of tokens in the longest common subsequence (0 if either
        text has no tokens).
    """
    seq1 = text1.split()
    seq2 = text2.split()

    # LCS length is symmetric, so put the shorter sequence on the row axis
    # to keep the rolling rows as small as possible.
    if len(seq2) > len(seq1):
        seq1, seq2 = seq2, seq1

    # previous_row[j] = LCS length of the tokens of seq1 consumed so far
    # against the first j tokens of seq2.
    previous_row = [0] * (len(seq2) + 1)

    for token1 in seq1:
        current_row = [0]
        for j, token2 in enumerate(seq2, start=1):
            if token1 == token2:
                current_row.append(previous_row[j - 1] + 1)
            else:
                current_row.append(max(previous_row[j], current_row[j - 1]))
        previous_row = current_row

    return previous_row[-1]

In [8]:
# Reference passage: preprocessed into og_text_pp and used as the "original"
# side of the similarity comparisons in the cells that follow.
original_text = """
The dog (Canis familiaris or Canis lupus familiaris) is a domesticated descendant of the gray wolf. Also called the domestic dog, it was selectively bred from a population of wolves during the Late Pleistocene by hunter-gatherers. The dog was the first species to be domesticated by humans, over 14,000 years ago and before the development of agriculture. Due to their long association with humans, dogs have gained the ability to thrive on a starch-rich diet that would be inadequate for other canids.

Dogs have been bred for desired behaviors, sensory capabilities, and physical attributes. Dog breeds vary widely in shape, size, and color. They have the same number of bones (with the exception of the tail), powerful jaws that house around 42 teeth, and well-developed senses of smell, hearing, and sight. Compared to humans, dogs possess a superior sense of smell and hearing, but inferior visual acuity. Dogs perform many roles for humans, such as hunting, herding, pulling loads, protection, companionship, therapy, aiding disabled people, and assisting police and the military.
"""

In [9]:
# Candidate passage: a reworded version of the passage in original_text,
# preprocessed into ip_text_pp and scored against it in the following cells.
input_text = """
The dog (Canis familiaris or Canis lupus familiaris) is a tamed offspring of the gray wolf. Also known as the domestic dog, it was intentionally bred from a group of wolves during the Late Pleistocene by hunter-gatherers. The dog was the initial species to be tamed by humans, more than 14,000 years ago and prior to the rise of agriculture. Because of their extended relationship with humans, dogs have developed the capacity to flourish on a starch-heavy diet that would not be sufficient for other canids.

Dogs have been selectively bred for specific behaviors, sensory skills, and physical traits. Dog breeds differ greatly in size, shape, and color. They possess an identical number of bones (excluding the tail), strong jaws containing approximately 42 teeth, and highly developed senses of smell, hearing, and sight. In comparison to humans, dogs have a more advanced sense of smell and hearing, but a lesser ability to see clearly. Dogs serve various functions for humans, including hunting, herding, carrying loads, providing protection, companionship, therapy, helping disabled individuals, and supporting police and military efforts
"""

In [10]:
# Push both passages through the same normalization pipeline, original first.
og_text_pp, ip_text_pp = (
    preprocess_text(passage) for passage in (original_text, input_text)
)

In [11]:
# Word-set overlap (intersection over union) of the two preprocessed passages.
print(f"Jaccard Similarity: {jaccard_similarity(og_text_pp, ip_text_pp)}")

Jaccard Similarity: 0.4108527131782946


In [12]:
# Note: longest_common_subsequence returns a token count, so the value printed
# here is the LCS length in words rather than a ratio between 0 and 1.
print(f"LCS Similarity: {longest_common_subsequence(og_text_pp, ip_text_pp)}")

LCS Similarity: 63


In [13]:
# Contrast case: a passage on the same topic whose wording differs from
# original_text, scored against the preprocessed original.
text = """
Dogs are loyal, loving, and intelligent companions.
They come in many breeds, each with unique traits and personalities.
From playful puppies to calm seniors, dogs bring joy at every age.
They can be trained to assist people with disabilities or perform rescue tasks.
Dogs are highly social and thrive on human interaction and care.
Their presence reduces stress and boosts emotional well-being.
With wagging tails and eager eyes, they show unconditional love.
A dog isn’t just a pet—it’s family.
"""

text_pp = preprocess_text(text)

# Label spelling fixed ("Jacccard" -> "Jaccard") to match the earlier cell.
print(f"Jaccard Similarity: {jaccard_similarity(og_text_pp, text_pp)}")
print(f"LCS Similarity: {longest_common_subsequence(og_text_pp, text_pp)}")

Jacccard Similarity: 0.05384615384615385
LCS Similarity: 4


## Plagiarism Checking on Dataset

In [14]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install PyMuPDF



In [16]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str | None: The text of all pages concatenated in page order, or
        ``None`` (after printing an error message) when the file is missing.
    """
    # NOTE(review): recent PyMuPDF releases appear to raise their own
    # FileNotFoundError class (not the builtin) for a missing path -- confirm
    # against the installed version. Catch whichever one is available.
    missing_file_errors = (
        FileNotFoundError,
        getattr(fitz, "FileNotFoundError", FileNotFoundError),
    )
    try:
        # The context manager closes the document even if reading a page
        # fails; previously the handle was never closed.
        with fitz.open(pdf_path) as doc:
            return "".join(
                doc.load_page(page_num).get_text()
                for page_num in range(doc.page_count)
            )
    except missing_file_errors:
        print(f"Error: PDF file not found at {pdf_path}")
        return None

def _read_text_or_empty(path):
    """Return the contents of the text file at ``path``, or "" if it is missing.

    A missing file is reported on stdout instead of raised, so the rest of
    the cell still runs with whichever source files are present.
    """
    try:
        with open(path, "r") as handle:
            return handle.read()
    except FileNotFoundError:
        print(f"Error: {path} not found.")
        return ""


# Reference corpus: the two source papers joined into a single document.
paper1_text = _read_text_or_empty("paper1.txt")
paper2_text = _read_text_or_empty("paper2.txt")

original_text_combined = paper1_text + " " + paper2_text
original_text_pp = preprocess_text(original_text_combined)

# Candidate document to be checked against the reference corpus.
pdf_file_path = "input.pdf"
input_pdf_text = extract_text_from_pdf(pdf_file_path)

# extract_text_from_pdf returns None for a missing file and may return an
# empty string; both are treated as "nothing to compare".
if input_pdf_text:
    input_pdf_text_pp = preprocess_text(input_pdf_text)

    jaccard_sim = jaccard_similarity(original_text_pp, input_pdf_text_pp)
    lcs_sim = longest_common_subsequence(original_text_pp, input_pdf_text_pp)

    print(f"Jaccard Similarity (Original vs PDF): {jaccard_sim}")
    print(f"LCS Similarity (Original vs PDF): {lcs_sim}")
else:
    print("Could not extract text from the input PDF.")

Jaccard Similarity (Original vs PDF): 0.0707029662738724
LCS Similarity (Original vs PDF): 230


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_score(text1, text2):
    """Cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    vocabulary and IDF weights come from this pair alone.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The single similarity value between the two documents.
    """
    doc_vectors = TfidfVectorizer().fit_transform([text1, text2])
    first_doc = doc_vectors[0:1]
    second_doc = doc_vectors[1:2]
    # cosine_similarity yields a 1x1 matrix here; unwrap its only entry.
    pairwise = cosine_similarity(first_doc, second_doc)
    return pairwise[0][0]

def normalized_longest_common_subsequence(text1, text2):
    """LCS length of two texts divided by the token count of ``text1``.

    The denominator uses ``text1`` only, so the score is not symmetric:
    swapping the arguments can change the result.

    Args:
        text1: Text whose whitespace-token count is the normalizer.
        text2: Text compared against ``text1``.

    Returns:
        ``lcs_length / len(text1.split())``, or ``0`` when ``text1`` has no
        tokens.
    """
    common_length = longest_common_subsequence(text1, text2)
    reference_length = len(text1.split())
    if reference_length == 0:
        # Nothing to normalize by: avoid dividing by zero.
        return 0
    return common_length / reference_length

# Both preprocessed texts are created by the dataset-comparison cell;
# input_pdf_text_pp in particular only exists when the PDF yielded text.
if 'original_text_pp' not in locals() or 'input_pdf_text_pp' not in locals():
    print("Preprocessing results not available. Please run the previous cell.")
else:
    cosine_sim = cosine_similarity_score(original_text_pp, input_pdf_text_pp)
    normalized_lcs_sim = normalized_longest_common_subsequence(original_text_pp, input_pdf_text_pp)

    print(f"Cosine Similarity (Original vs PDF): {cosine_sim}")
    print(f"Normalized LCS Similarity (Original vs PDF): {normalized_lcs_sim}")

Cosine Similarity (Original vs PDF): 0.6222980526553753
Normalized LCS Similarity (Original vs PDF): 0.012360275150472914
