# Paper Similarity Check

Python Imports

In [1]:
import os
import re
from collections import defaultdict
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annieqian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annieqian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Making sure download the nltk package

In [2]:
nltk.data.path.append("./nltk_data")

Turing pdf to text file

In [3]:
def extract_text_from_pdf(file_path):
    pdf_reader = PyPDF2.PdfReader(open(file_path, 'rb'))
    text = ''
    for page_num in range(len(pdf_reader.pages)):
        text += pdf_reader.pages[page_num].extract_text()
    return text

Finding matched files

In [4]:
def find_and_compare_files(directory):
    # Create a dictionary to store files based on their base name
    file_dict = defaultdict(list)
    
    # Define the regex pattern to extract the base name and the number
    pattern = re.compile(r'^(.*)_(\d+)\.pdf$')

    # Iterate through files in the directory
    for filename in os.listdir(directory):
        match = pattern.match(filename)
        if match:
            base_name = match.group(1)
            number = int(match.group(2))
            file_dict[base_name].append((number, filename))
    
    # Find matching pairs
    matching_pairs = []
    for base_name, files in file_dict.items():
        files_sorted = sorted(files, key=lambda x: x[0])
        for i in range(len(files_sorted) - 1):
            current_file = files_sorted[i]
            next_file = files_sorted[i + 1]
            if next_file[0] == current_file[0] + 1:
                matching_pairs.append((current_file[1], next_file[1]))

    return matching_pairs

Computing Similarity Scores

In [5]:

def compute_similarity(first_draft_path, final_draft_path):
    # Extract text from both PDFs
    first_draft_text = extract_text_from_pdf(first_draft_path)
    final_draft_text = extract_text_from_pdf(final_draft_path)

    documents = [first_draft_text, final_draft_text]

    stop_words = stopwords.words('english')

    word_tokenizer = WordPunctTokenizer()
    word_lemmatizer = nltk.WordNetLemmatizer()

    vocabulary_dict = defaultdict(int)
    cleansed_documents = []
    for doc in documents:
        tokens = word_tokenizer.tokenize(doc)
        alpha_words = [word.lower() for word in tokens if word.isalpha() and len(word) > 2 and word not in stop_words]
        final_words = [word_lemmatizer.lemmatize(word) for word in alpha_words]
        for word in final_words:
            vocabulary_dict[word] += 1
        cleansed_doc = ' '.join(final_words)
        cleansed_documents.append(cleansed_doc)

    sorted_vocabulary = sorted(vocabulary_dict.items(), key=lambda kv: kv[1], reverse=True)

    vocabulary = []
    for word, count in sorted_vocabulary:
        vocabulary.append(word)
        
    word_vectorizer = TfidfVectorizer(vocabulary=vocabulary)
    matrix = word_vectorizer.fit_transform(cleansed_documents).toarray()
    #compute the cosine similarity
    first_doc = matrix[0].reshape(1, -1)
    next_doc = matrix[1].reshape(1, -1)
    similarity_score = cosine_similarity(first_doc, next_doc)
    
    return similarity_score[0][0]

Put your directory path

In [6]:
# Directory path
directory_path = '/Users/annieqian/Desktop/check_similarity/'

Run the code and Print out the Similarity Scores

In [7]:
# Find the matching files
matching_files = find_and_compare_files(directory_path)

# Compute and print the similarity scores
for pair in matching_files:
    first_draft_path = os.path.join(directory_path, pair[0])
    final_draft_path = os.path.join(directory_path, pair[1])
    print(f"Matched files: {pair[0]} and {pair[1]}")
    similarity_score = compute_similarity(first_draft_path, final_draft_path)
    print(f"Similarity score between {pair[0]} and {pair[1]}: {similarity_score:.4f}")


Matched files: a803f_0.pdf and a803f_1.pdf




Similarity score between a803f_0.pdf and a803f_1.pdf: 0.9865
Matched files: a8768_0.pdf and a8768_1.pdf
Similarity score between a8768_0.pdf and a8768_1.pdf: 0.7863
Matched files: a8768_1.pdf and a8768_2.pdf
Similarity score between a8768_1.pdf and a8768_2.pdf: 0.9938
