<a href="https://colab.research.google.com/github/Alexa5437/AIVEX_AI_LAB/blob/main/Lab_2_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PLAGIARISM DETECTION SUBMISSION PROBLEM

In [None]:
import re
import nltk
import numpy as np
from queue import PriorityQueue

# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases load it from the 'punkt_tab' resource while older
# ones use 'punkt'; requesting both keeps the notebook working on either.
# (nltk.download reports an unknown resource and returns False, it does not
# raise, so the extra request is harmless on older installations.)
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)


def preprocess_text(text):
    """Split *text* into lowercase, punctuation-free sentences.

    Sentence boundaries are detected on the raw text first: the tokenizer
    relies on terminators such as '.', '?' and '!', so stripping punctuation
    beforehand would collapse every document into a single "sentence".

    Args:
        text: The raw document text.

    Returns:
        A list of normalized sentences in document order. Sentences that are
        empty after normalization (for example a stray "...") are dropped.
    """
    sentences = []
    for raw_sentence in nltk.sent_tokenize(text):
        # Normalize each sentence only after the boundaries are known.
        cleaned = re.sub(r'[^\w\s]', '', raw_sentence.lower()).strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

# Compute normalized Levenshtein distance (edit distance) between two sentences
def levenshtein_distance(s1, s2):
    """Return the normalized Levenshtein (edit) distance between two strings.

    The raw edit distance (insertions, deletions and substitutions of single
    characters) is divided by the length of the longer input, so the result
    always lies in the range [0, 1].

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        0.0 when the strings are equal (including two empty strings), 1.0
        when exactly one of them is empty, otherwise
        ``edit_distance / max(len(s1), len(s2))``.
    """
    len_s1, len_s2 = len(s1), len(s2)
    longest = max(len_s1, len_s2)

    # Empty inputs are handled up front so the result stays normalized and
    # the final division can never be by zero.
    if longest == 0:
        return 0.0
    if len_s1 == 0 or len_s2 == 0:
        return 1.0

    # Keep the shorter string on the inner loop so each row is as small as
    # possible.
    if len_s1 < len_s2:
        s1, s2 = s2, s1

    # Classic two-row dynamic programme: previous_row[j] holds the distance
    # between the processed prefix of s1 and the first j characters of s2.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    # Normalize by the length of the longer string.
    return previous_row[-1] / longest

# A* search algorithm to find the optimal sentence alignment
def a_star_search(sentences_doc1, sentences_doc2):
    """Find the minimum-cost alignment of two sentence lists with A* search.

    A state ``(i, j)`` means the first ``i`` sentences of document 1 and the
    first ``j`` sentences of document 2 have been consumed. From a state the
    search may align the next sentence of each document (cost: their
    normalized Levenshtein distance) or skip one sentence of either
    document (cost: 1).

    Args:
        sentences_doc1: Sentences of the first document.
        sentences_doc2: Sentences of the second document.

    Returns:
        The total cost of the cheapest alignment, or ``float('inf')`` if the
        goal state is never reached.
    """
    n_doc1, n_doc2 = len(sentences_doc1), len(sentences_doc2)

    def heuristic(i, j):
        # Lower bound on the remaining cost: aligned pairs may cost 0, but
        # the surplus sentences of the longer remainder must each be skipped
        # at cost 1. Because it never overestimates and never drops by more
        # than a step's cost, closing a state on first expansion is safe and
        # the first goal popped is optimal.
        return abs((n_doc1 - i) - (n_doc2 - j))

    open_states = PriorityQueue()
    open_states.put((heuristic(0, 0), (0, 0, 0)))  # (f_cost, (i, j, g_cost))
    closed_states = set()

    while not open_states.empty():
        _, (i, j, g_cost) = open_states.get()

        # A state may be queued several times; only its cheapest (first
        # popped) entry is expanded.
        if (i, j) in closed_states:
            continue
        closed_states.add((i, j))

        # Reached the end of both documents: g_cost is the alignment cost.
        if i == n_doc1 and j == n_doc2:
            return g_cost

        # Align the next sentence of each document.
        if i < n_doc1 and j < n_doc2:
            alignment_cost = g_cost + levenshtein_distance(
                sentences_doc1[i], sentences_doc2[j]
            )
            open_states.put(
                (alignment_cost + heuristic(i + 1, j + 1),
                 (i + 1, j + 1, alignment_cost))
            )

        # Skip a sentence from document 1.
        if i < n_doc1:
            open_states.put(
                (g_cost + 1 + heuristic(i + 1, j), (i + 1, j, g_cost + 1))
            )

        # Skip a sentence from document 2.
        if j < n_doc2:
            open_states.put(
                (g_cost + 1 + heuristic(i, j + 1), (i, j + 1, g_cost + 1))
            )

    return float('inf')  # No solution found


def detect_plagiarism(doc1, doc2):
    """Score how similar two documents are on a 0-10 scale.

    Both documents are split into normalized sentences, the cheapest
    sentence alignment is computed, and its cost is mapped to a score where
    10 means identical and 0 means nothing in common.

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.

    Returns:
        An integer plagiarism score between 0 and 10. Returns 0 when neither
        document contains any sentence, since there is nothing to compare.
    """
    sentences_doc1 = preprocess_text(doc1)
    sentences_doc2 = preprocess_text(doc2)

    # An optimal alignment never costs more than the longer document's
    # sentence count: pair up min(n, m) sentences at cost <= 1 each and skip
    # the remaining |n - m| at cost 1 each. Normalizing by n + m instead
    # would keep even unrelated documents at a score of 5 or higher.
    max_possible_cost = max(len(sentences_doc1), len(sentences_doc2))
    if max_possible_cost == 0:
        # No sentences on either side; also avoids dividing by zero below.
        return 0

    alignment_cost = a_star_search(sentences_doc1, sentences_doc2)
    if alignment_cost == float('inf'):
        return 0

    normalized_cost = alignment_cost / max_possible_cost
    # Clamp at zero in case the reported cost exceeds the theoretical bound.
    plagiarism_score = max(0, 10 - (normalized_cost * 10))

    return round(plagiarism_score)


# Test cases
def run_test_cases():
    """Print the plagiarism score for four hand-written document pairs."""
    # Cases 1-3 all compare against the same reference sentence.
    reference = "Machine learning algorithms can classify images."

    # Case 1: the document compared with an exact copy of itself.
    plagiarism_score = detect_plagiarism(reference, reference)
    print(f"Test Case 1 - Identical Documents, Plagiarism Score: {plagiarism_score}")

    # Case 2: a reworded version of the reference sentence.
    plagiarism_score = detect_plagiarism(
        reference,
        "Algorithms for machine learning are able to classify images.",
    )
    print(f"Test Case 2 - Slightly Modified Document, Plagiarism Score: {plagiarism_score}")

    # Case 3: an unrelated sentence.
    plagiarism_score = detect_plagiarism(
        reference,
        "The quick brown fox jumps over the lazy dog.",
    )
    print(f"Test Case 3 - Completely Different Documents, Plagiarism Score: {plagiarism_score}")

    # Case 4: two sentences against one that shares part of the content.
    plagiarism_score = detect_plagiarism(
        "Machine learning algorithms can classify images. Deep learning improves accuracy.",
        "Deep learning is a technique in machine learning that improves accuracy.",
    )
    print(f"Test Case 4 - Partial Overlap, Plagiarism Score: {plagiarism_score}")

# Run all test cases when executed as a script or notebook cell, but not
# when this module is imported by other code.
if __name__ == "__main__":
    run_test_cases()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Test Case 1 - Identical Documents, Plagiarism Score: 10
Test Case 2 - Slightly Modified Document, Plagiarism Score: 8
Test Case 3 - Completely Different Documents, Plagiarism Score: 6
Test Case 4 - Partial Overlap, Plagiarism Score: 8
