# Paper Similarity Check

Python Imports

In [29]:
import os
import re
from collections import defaultdict
import PyPDF2
import nltk
import csv
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data used later in this notebook: the English stop-word list
# and the WordNet data needed by the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annieqian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annieqian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Make sure NLTK can find its data packages (also search a local `./nltk_data` folder)

In [2]:
# Add a project-local ./nltk_data folder (relative to the current working
# directory) to the list of locations NLTK searches for data packages.
nltk.data.path.append("./nltk_data")

Turning a PDF into text

In [30]:
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        A single string made of the text of each page, in page order, with no
        separator inserted between pages. Pages that yield no text contribute
        an empty string.
    """
    # The context manager guarantees the file handle is closed even if
    # parsing fails, which matters when many PDFs are processed in one run.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ''` guards against a page whose extraction returns nothing.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

Finding matching files

In [31]:
def find_and_compare_files(directory):
    """Pair each ``<basename>_0.pdf`` with the other numbered drafts of the same basename.

    Only files whose names match ``<basename>_<digits>.pdf`` are considered;
    everything else in the directory is ignored.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A ``defaultdict(list)`` mapping each base name to a list of
        ``(draft_0_filename, other_draft_filename, other_draft_number)``
        tuples, ordered by ascending draft number. Base names without a
        ``_0`` draft, or with no draft other than ``_0``, do not appear.
    """
    # Captures the base name and the trailing draft number of a PDF filename.
    pattern = re.compile(r'^(.*)_(\d+)\.pdf$')

    # Group (number, filename) tuples by base name. os.listdir returns entries
    # in arbitrary order, so sort to make the result reproducible across runs.
    file_dict = defaultdict(list)
    for filename in sorted(os.listdir(directory)):
        match = pattern.match(filename)
        if match:
            file_dict[match.group(1)].append((int(match.group(2)), filename))

    # Find matching pairs: basename_0 vs. every other draft of that basename.
    matching_pairs = defaultdict(list)
    for base_name, files in file_dict.items():
        draft_0 = None
        other_drafts = []
        for number, filename in files:
            if number == 0:
                # If several files parse to draft 0 (e.g. "_0" and "_00"),
                # the last one in sorted filename order is used.
                draft_0 = filename
            else:
                other_drafts.append((number, filename))

        if draft_0 is None:
            continue

        # Numeric sort so that draft 2 comes before draft 10.
        for number, other_draft in sorted(other_drafts):
            matching_pairs[base_name].append((draft_0, other_draft, number))

    return matching_pairs

Computing Similarity Scores

In [32]:
def compute_similarity(first_draft_path, final_draft_path):
    """Return the TF-IDF cosine similarity between the text of two PDF files.

    Each PDF is converted to text with ``extract_text_from_pdf``, tokenized,
    lower-cased, filtered (alphabetic tokens longer than two characters that
    are not English stop words) and lemmatized. The two cleansed documents
    are vectorized with ``TfidfVectorizer`` over their joint vocabulary and
    compared with cosine similarity.

    Args:
        first_draft_path: Path to the first PDF.
        final_draft_path: Path to the PDF it is compared against.

    Returns:
        The cosine similarity of the two TF-IDF vectors. Returns 0.0 when
        neither document yields any usable word (for example, scanned PDFs
        with no extractable text), matching the score obtained when only one
        of the documents is empty.
    """
    documents = [
        extract_text_from_pdf(first_draft_path),
        extract_text_from_pdf(final_draft_path),
    ]

    # A set gives constant-time membership tests for the per-token filter.
    stop_words = set(stopwords.words('english'))

    word_tokenizer = WordPunctTokenizer()
    word_lemmatizer = nltk.WordNetLemmatizer()

    vocabulary_dict = defaultdict(int)
    cleansed_documents = []
    for doc in documents:
        # Lower-case BEFORE the stop-word test: the stop-word list is
        # lower-case, so "The" or "And" would otherwise slip through.
        lowered_tokens = (token.lower() for token in word_tokenizer.tokenize(doc))
        alpha_words = [
            word for word in lowered_tokens
            if word.isalpha() and len(word) > 2 and word not in stop_words
        ]
        final_words = [word_lemmatizer.lemmatize(word) for word in alpha_words]
        for word in final_words:
            vocabulary_dict[word] += 1
        cleansed_documents.append(' '.join(final_words))

    # TfidfVectorizer rejects an empty vocabulary; nothing to compare anyway.
    if not vocabulary_dict:
        return 0.0

    # Joint vocabulary, most frequent word first (fixes the column order).
    sorted_vocabulary = [
        word for word, _ in sorted(vocabulary_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]

    word_vectorizer = TfidfVectorizer(vocabulary=sorted_vocabulary)
    matrix = word_vectorizer.fit_transform(cleansed_documents).toarray()

    # Compute the cosine similarity between the two document vectors.
    first_doc = matrix[0].reshape(1, -1)
    next_doc = matrix[1].reshape(1, -1)
    similarity_score = cosine_similarity(first_doc, next_doc)

    return similarity_score[0][0]

Set the path to the directory that contains your PDF drafts

In [35]:
# Directory containing the PDF drafts to compare. Files are expected to be
# named <basename>_<number>.pdf; <basename>_0.pdf is the draft that every
# other numbered draft of the same basename is compared against.
# NOTE(review): machine-specific absolute path — change it before running elsewhere.
directory_path = '/Users/annieqian/Desktop/check_similarity/test_1'

Run the comparison and save the similarity scores to a CSV file

In [36]:
matching_files = find_and_compare_files(directory_path)

# Build one CSV row per base name. A row only has the score columns for the
# drafts that exist for that base name; DictWriter leaves the others blank.
csv_data = []
draft_numbers = set()

for base_name, pairs in matching_files.items():
    row = {'basename': base_name}
    for draft_0, other_draft, number in pairs:
        similarity_score = compute_similarity(
            os.path.join(directory_path, draft_0),
            os.path.join(directory_path, other_draft),
        )
        row[f'score_0_{number}'] = similarity_score
        draft_numbers.add(number)
    csv_data.append(row)

# Order the score columns numerically (score_0_1, score_0_2, ..., score_0_10)
# instead of by the order in which drafts happened to be encountered.
header = ['basename'] + [f'score_0_{number}' for number in sorted(draft_numbers)]

# Write the CSV next to the input directory (i.e. in its parent folder)
# rather than to a hard-coded, machine-specific location.
csv_file_path = os.path.join(
    os.path.dirname(os.path.abspath(directory_path)),
    'similarity_scores.csv',
)
# Explicit UTF-8 so base names with non-ASCII characters are written
# consistently regardless of the platform's default encoding.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=header)
    writer.writeheader()
    writer.writerows(csv_data)

print(f"Similarity scores have been saved to {csv_file_path}")

Similarity scores have been saved to /Users/annieqian/Desktop/check_similarity/similarity_scores.csv
