In [1]:
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import string
from PyQt5.QtWidgets import QApplication, QFileDialog

In [2]:
# Download the NLTK resources used by preprocess(): tokenizer models and the
# English stopword list.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on newer NLTK
# releases; 'punkt' is kept for older ones. On an NLTK version that does not
# know a package, nltk.download just reports it and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Function to read text from a PDF file
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The extracted text of all pages, in page order, joined with
        newlines. Pages that yield no text contribute an empty string.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe if a page yields nothing (None / "").
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page before tokenization.
    return "\n".join(page_texts)

In [4]:
# Preprocess the text
def preprocess(text):
    """Turn raw text into a list of lower-case content words.

    The text is lower-cased and tokenized with ``word_tokenize``; a token is
    kept only if it is purely alphabetic and is not an English stopword.

    Args:
        text: The raw text to process.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop numbers, punctuation and mixed tokens such as "b-cell".
        if not token.isalpha():
            continue
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

In [5]:
# Function to generate a document vector using Word2Vec
def get_document_vector(tokens, model):
    """Average the Word2Vec vectors of a document's tokens.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when no
        token is found.
    """
    known_vectors = []
    for token in tokens:
        # Tokens missing from the model's vocabulary are skipped.
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [6]:
def select_files():
    """Show a file-open dialog filtered to PDFs and return the chosen paths.

    Returns:
        The list of selected file paths, i.e. the first element of the tuple
        returned by ``QFileDialog.getOpenFileNames``.
    """
    # Qt allows only one QApplication per process, so reuse the existing one
    # when this function is called again in the same kernel. The local
    # reference keeps the application object alive while the dialog is open.
    app = QApplication.instance() or QApplication([])
    files, _ = QFileDialog.getOpenFileNames(None, "Select PDF files", "", "PDF files (*.pdf)")
    return files

In [7]:
# Open the file dialog and collect the PDFs to compare.
# Later cells read file_paths[0] and file_paths[1], so select at least two files.
file_paths = select_files()
print(file_paths)

['D:/BU/aa research/journal articles/AML/Acute Myeloid Leukemia A Review.pdf', 'D:/BU/aa research/journal articles/AML/Diagnosis and management of acute myeloid leukemia in adults.pdf']


In [8]:
# Read and preprocess the text from the first two selected PDF files.
# Fail early with a clear message instead of an IndexError when fewer than two
# files were chosen in the dialog.
if len(file_paths) < 2:
    raise ValueError(f"Expected at least two PDF files to compare, got {len(file_paths)}.")
text1 = preprocess(read_pdf(file_paths[0]))
text2 = preprocess(read_pdf(file_paths[1]))

In [9]:
# Build the Word2Vec training corpus from both token lists.
# Each document is one long token list; gensim truncates any single "sentence"
# longer than 10,000 tokens during training, so longer documents are split into
# consecutive chunks. Documents at or below the limit are passed through as one
# sentence, exactly as before.
MAX_SENTENCE_TOKENS = 10_000
sentences = [
    doc_tokens[start:start + MAX_SENTENCE_TOKENS]
    for doc_tokens in (text1, text2)
    for start in range(0, len(doc_tokens), MAX_SENTENCE_TOKENS)
]

In [10]:
# Train a skip-gram (sg=1) Word2Vec model on the sentences.
# An explicit seed plus a single worker thread removes the thread-scheduling
# jitter that otherwise makes the embeddings (and the similarity score below)
# vary between runs. Reproducibility across interpreter launches additionally
# needs PYTHONHASHSEED to be set in the environment.
SEED = 42
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, seed=SEED, workers=1)

In [11]:
# Generate one vector per document: get_document_vector averages the Word2Vec
# vectors of the document's tokens (a zero vector if none are in the model).
doc_vector_1 = get_document_vector(text1, model)
doc_vector_2 = get_document_vector(text2, model)

In [12]:
# Calculate cosine similarity between the two document vectors.
# Each vector is wrapped in a list so it is passed as a single-row input; the
# score itself is read as similarity[0][0] when it is printed.
similarity = cosine_similarity([doc_vector_1], [doc_vector_2])

In [13]:
# Print the token lists of both documents (the document vectors are printed in
# the next cell).
# NOTE(review): this prints every token of each document; consider printing a
# slice such as text1[:50] to keep the notebook output readable.
print("Tokenized Document 1:", text1)
print("Tokenized Document 2:", text2)

Tokenized Document 1: ['updates', 'acute', 'myeloid', 'leukemia', 'review', 'ari', 'pelcovits', 'md', 'rabin', 'niroula', 'md', 'abstract', 'acute', 'myeloid', 'leukemia', 'aml', 'malignancy', 'stem', 'cell', 'precursors', 'myeloid', 'lineage', 'red', 'blood', 'cells', 'platelets', 'white', 'blood', 'cells', 'b', 'cells', 'like', 'malignancies', 'due', 'genetic', 'variations', 'lead', 'neoplastic', 'changes', 'clonal', 'pro', 'liferation', 'aml', 'remains', 'rare', 'malignancy', 'accounting', 'new', 'cancer', 'diagnoses', 'united', 'states', 'per', 'year', 'accounts', 'close', 'one', 'third', 'leukemias', 'diagnosed', 'much', 'early', 'century', 'treatment', 'paradigms', 'unchanged', 'survival', 'curves', 'remaining', 'stagnant', 'many', 'decades', 'recent', 'changes', 'understanding', 'genetic', 'varia', 'tions', 'disease', 'led', 'promising', 'new', 'ther', 'apies', 'hopes', 'improved', 'outcomes', 'future', 'review', 'definitions', 'diagnosis', 'classifica', 'tion', 'aml', 'affects'

In [14]:
# Print the averaged Word2Vec vector of each document.
print("\nDocument Vector 1:", doc_vector_1)
print("Document Vector 2:", doc_vector_2)


Document Vector 1: [-0.06028705  0.09066492 -0.04264551 -0.10434996 -0.01869038 -0.19852982
 -0.01206556  0.27604583 -0.10428098  0.04017933 -0.05148957 -0.19797249
 -0.07536412  0.00350058  0.06329219 -0.15101483 -0.01938701 -0.18568449
  0.00163248 -0.14164081  0.02895941  0.06032033  0.13826632  0.01735337
 -0.07620387 -0.00333533 -0.08118761 -0.10597454 -0.10576026 -0.04897039
  0.09297724 -0.00747162  0.10342778 -0.12262674 -0.04300593  0.11723351
  0.02213829 -0.1322745  -0.05993718 -0.17960238  0.00478922 -0.08480076
 -0.02860679  0.02897447  0.07425003 -0.00780965 -0.04774154 -0.02881022
  0.05455728  0.12989186  0.03507204 -0.1513438  -0.03984315 -0.00944226
 -0.10111815  0.05675543  0.09569385  0.00789309 -0.11208872 -0.04793609
  0.12254     0.11966821 -0.08379335 -0.00348069  0.01535915  0.04464399
 -0.02608     0.07802763 -0.17362982  0.05995125 -0.01576983  0.16827916
  0.12552117  0.02347091  0.20800693  0.04502115 -0.0423705  -0.05305415
 -0.08278268  0.0786568   0.002

In [15]:
# Print the cosine similarity (the single entry of the similarity matrix).
# NOTE(review): the recorded run shows a score of ~1.0. The model is trained
# only on these two documents and each vector is a plain mean over all tokens,
# so the score may not discriminate between documents — confirm against an
# unrelated pair before interpreting it.
print("\nCosine Similarity between the two documents:", similarity[0][0])


Cosine Similarity between the two documents: 0.9999763
