### Text Preprocessing - Tokenize into sentences, normalize text


In [34]:
import string

def preprocess_text(text):
    sentences = [s.strip() for s in text.strip().split('.') if s.strip()]
    table = str.maketrans('', '', string.punctuation)
    normalized = [s.lower().translate(table) for s in sentences]
    return normalized


### Levenshtein Distance function for edit distance


In [35]:
def levenshtein_distance(s1, s2):
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]


### A* search algorithm for text alignment

-   Adaptive heuristic for A* text alignment (drop-in compatible version).

-   Automatically chooses between exact Levenshtein and length-based approximation based on remaining sentences.

In [36]:
import heapq

class State:
    def __init__(self, i, j, g_cost, path):
        self.i = i  
        self.j = j
        self.g_cost = g_cost
        self.path = path

    def __lt__(self, other):
        return self.g_cost < other.g_cost

def heuristic(i, j, doc1, doc2):
    max_exact_pairs = 12   
    scale_factor = 0.5     

    remaining1 = doc1[i:]
    remaining2 = doc2[j:]
    n1, n2 = len(remaining1), len(remaining2)

    # If one doc is exhausted, return total remaining char length as lower bound
    if n1 == 0 or n2 == 0:
        return sum(len(s) for s in remaining1) + sum(len(s) for s in remaining2)

    if n1 * n2 <= max_exact_pairs:
        # Exact minimal Levenshtein match
        min_dists = []
        for s1 in remaining1:
            min_cost = min(levenshtein_distance(s1, s2) for s2 in remaining2)
            min_dists.append(min_cost)
        estimate = sum(min_dists)
    else:
        # Fast approximation: use length differences
        estimate = 0
        for s1 in remaining1:
            len_diffs = [abs(len(s1) - len(s2)) for s2 in remaining2]
            estimate += min(len_diffs) if len_diffs else len(s1)

    return estimate * scale_factor


def neighbors(state, doc1, doc2):
    i, j = state.i, state.j
    neighbors = []
    len1, len2 = len(doc1), len(doc2)
    # Align current sentences if both available
    if i < len1 and j < len2:
        cost = levenshtein_distance(doc1[i], doc2[j])
        neighbors.append(('align', i+1, j+1, cost))
    # Skip sentence in doc1
    if i < len1:
        cost = len(doc1[i]) 
        neighbors.append(('skip_doc1', i+1, j, cost))
    # Skip sentence in doc2
    if j < len2:
        cost = len(doc2[j])
        neighbors.append(('skip_doc2', i, j+1, cost))
    return neighbors

def a_star_search(doc1, doc2):
    start = State(0, 0, 0, [])
    heap = []
    heapq.heappush(heap, (start.g_cost + heuristic(0, 0, doc1, doc2), start))
    visited = set()
    while heap:
        f_cost, current = heapq.heappop(heap)
        if (current.i, current.j) in visited:
            continue
        visited.add((current.i, current.j))
        # Goal check
        if current.i >= len(doc1) and current.j >= len(doc2):
            return current.path, current.g_cost
        for action, ni, nj, cost in neighbors(current, doc1, doc2):
            if (ni, nj) not in visited:
                new_path = current.path.copy()
                if action == 'align':
                    new_path.append((current.i, current.j, cost))
                elif action == 'skip_doc1':
                    new_path.append((current.i, None, cost))
                else:  # skip_doc2
                    new_path.append((None, current.j, cost))
                g_cost_new = current.g_cost + cost
                new_state = State(ni, nj, g_cost_new, new_path)
                heapq.heappush(heap, (g_cost_new + heuristic(ni, nj, doc1, doc2), new_state))
    return None, None


### Detect plagiarism based on alignment

In [37]:
def detect_plagiarism(alignment, doc1, doc2, threshold=5):
    # threshold: max edit distance to consider as plagiarism
    plagiarized_pairs = []
    for i, j, cost in alignment:
        if i is not None and j is not None and cost <= threshold:
            plagiarized_pairs.append((doc1[i], doc2[j], cost))
    return plagiarized_pairs


In [38]:
def run_test(doc1, doc2, threshold=5):
    sentences1 = preprocess_text(doc1)
    sentences2 = preprocess_text(doc2)
    alignment, total_cost = a_star_search(sentences1, sentences2)
    print(f"Total alignment cost: {total_cost}")
    plag_pairs = detect_plagiarism(alignment, sentences1, sentences2, threshold)
    for s1, s2, c in plag_pairs:
        print(f"Potential plagiarism:\nDoc1: {s1}\nDoc2: {s2}\nEdit Distance: {c}\n")


In [39]:
# Test Case 1: Identical Documents
doc1_test1 = (
    "We propose a new simple network architecture, the Transformer, "
    "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. "
    "Experiments on two machine translation tasks show these models to be superior in quality "
    "while being more parallelizable and requiring significantly less time to train."
)

doc2_test1 = (
    "We propose a new simple network architecture, the Transformer, "
    "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. "
    "Experiments on two machine translation tasks show these models to be superior in quality "
    "while being more parallelizable and requiring significantly less time to train."
)


# Test Case 2: Slightly Modified Documents
doc1_test2 = (
    "We propose a new simple network architecture, the Transformer, "
    "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. "
    "Experiments on two machine translation tasks show these models to be superior in quality "
    "while being more parallelizable and requiring significantly less time to train."
)

doc2_test2 = (
    "We propose a novel network design, the Transformer, "
    "built purely on attention mechanisms, eliminating recurrence and convolutions completely. "
    "Experiments on machine translation tasks demonstrate these models to be better in performance "
    "while being more parallelizable and taking much less time to train."
)


# Test Case 3: Completely Different Documents
doc1_test3 = (
    "The Transformer relies entirely on self-attention to draw global dependencies between input and output. "
    "This design removes recurrence completely, enabling parallel computation."
)

doc2_test3 = (
    "Convolutional neural networks are designed to capture spatial hierarchies in image data. "
    "Recurrent models, on the other hand, are well suited for sequential information such as text or speech."
)


# Test Case 4: Partial Overlap
doc1_test4 = (
    "We inspect attention distributions from our models and present and discuss examples in the appendix. "
    "Our model achieves 28.4 BLEU on the WMT 2014 English–German translation task, "
    "improving over the existing best results including ensembles by over 2 BLEU."
)

doc2_test4 = (
    "We inspect attention distributions from our models and show illustrative examples in the appendix. "
    "On the WMT 2014 English–German translation task, our Transformer achieves 28.4 BLEU, "
    "surpassing previous best scores and ensemble methods."
)


In [40]:
print("Test Case 1: Identical Documents")
run_test(doc1_test1, doc2_test1)

print("Test Case 2: Slightly Modified Documents")
run_test(doc1_test2, doc2_test2)

print("Test Case 3: Completely Different Documents")
run_test(doc1_test3, doc2_test3)

print("Test Case 4: Partial Overlap")
run_test(doc1_test4, doc2_test4)


Test Case 1: Identical Documents
Total alignment cost: 0
Potential plagiarism:
Doc1: we propose a new simple network architecture the transformer based solely on attention mechanisms dispensing with recurrence and convolutions entirely
Doc2: we propose a new simple network architecture the transformer based solely on attention mechanisms dispensing with recurrence and convolutions entirely
Edit Distance: 0

Potential plagiarism:
Doc1: experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train
Doc2: experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train
Edit Distance: 0

Test Case 2: Slightly Modified Documents
Total alignment cost: 94
Test Case 3: Completely Different Documents
Total alignment cost: 152
Test Case 4: Partial Overlap
Total alignment cost: 163
