In [None]:
!pip install PyMuPDF

In [None]:
import os
import re
from collections import Counter

import fitz
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def read_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open`` (PyMuPDF).

    Returns:
        str: The text of all pages concatenated in page order. No separator
        is inserted between pages.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even when text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string with += for every page.
        return ''.join(
            doc[page_num].get_text() for page_num in range(doc.page_count)
        )

In [None]:
# Mount Google Drive at /content/drive so that files under MyDrive can be
# reached through ordinary filesystem paths (relies on google.colab.drive).
drive.mount('/content/drive')

# Root folder of the paper corpus. The loading cell joins this path with each
# category name, so it is expected to hold one sub-folder per category.
papers_path = '/content/drive/MyDrive/papers/'



def read_documents(folder_path):
    """Load the text of every PDF located directly inside ``folder_path``.

    Sub-folders are not traversed. Each PDF is read with ``read_pdf``.

    Args:
        folder_path: Directory to scan.

    Returns:
        tuple[list[str], list[str]]: ``(documents, file_names)`` as two
        parallel lists; ``documents[i]`` is the extracted text of
        ``file_names[i]``. Entries are ordered by file name.
    """
    documents = []
    file_names = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and anything derived from it) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so files such as
        # "paper.PDF" are not silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        documents.append(read_pdf(os.path.join(folder_path, filename)))
        file_names.append(filename)

    return documents, file_names

In [None]:
def create_term_doc_matrix(documents, file_names, verbose=True):
    """Build a term-document matrix of raw term counts.

    Each document is tokenised with ``clean_and_split_document`` (defined
    elsewhere in the notebook) and the number of occurrences of every term is
    recorded.

    Args:
        documents: Sequence of document texts.
        file_names: Column labels, parallel to ``documents``.
        verbose: When True (the default), print the vocabulary and the
            resulting matrix, as earlier versions of this function did.

    Returns:
        pandas.DataFrame: One row per distinct term (sorted), one column per
        entry of ``file_names``; each cell holds the integer count of the term
        in that document.
    """
    # Tokenise every document exactly once and count its terms.
    counts_per_doc = [Counter(clean_and_split_document(doc)) for doc in documents]

    # Sort the vocabulary so the row order does not depend on set iteration
    # order, which can differ from one interpreter run to the next.
    vocabulary = sorted(set().union(*counts_per_doc))

    if verbose:
        print("vocab", vocabulary)

    # Fill a plain integer array by position. This avoids one DataFrame.at
    # call per token and stays correct when two files share the same name.
    row_of = {term: row for row, term in enumerate(vocabulary)}
    counts = np.zeros((len(vocabulary), len(documents)), dtype=np.int64)
    for col, doc_counts in enumerate(counts_per_doc):
        for term, count in doc_counts.items():
            counts[row_of[term], col] = count

    matrix = pd.DataFrame(counts, index=vocabulary, columns=file_names)

    if verbose:
        print(matrix)
    return matrix


In [None]:
# Corpus-wide accumulators filled by the loop below:
# - all_documents: extracted text of every document, across all categories
# - all_file_names: file names, parallel to all_documents
# - doc_category_lookup: file name -> category; read later by
#   find_most_similar_category to turn the best-matching file into a label
all_documents = []
all_file_names = []
doc_category_lookup = {}

# Category names. Each one is joined onto papers_path below, so it must match
# the name of a sub-folder of the corpus root.
categories = ['NLP', 'CV', 'Cryptography', 'Virtual Reality', 'Internet of Things']

# Load every category folder and merge its contents into the accumulators.
for category in categories:
    # Folder holding this category's PDFs
    category_path = os.path.join(papers_path, category)

    # Texts and file names of the PDFs found in that folder
    documents, file_names = read_documents(category_path)

    # NOTE(review): the lookup is keyed by bare file name, so a file name that
    # occurs in two categories overwrites the earlier entry and also yields a
    # duplicated column label in the matrix - confirm names are unique.
    for filename in file_names:
        doc_category_lookup[filename] = category

    # Append this category's data to the corpus-wide lists
    all_documents.extend(documents)
    all_file_names.extend(file_names)

# Raw term counts: one row per term, one column per file
term_doc_matrix = create_term_doc_matrix(all_documents, all_file_names)

# Dampen large counts with log(1 + count); zero counts stay exactly 0
term_doc_matrix_log = np.log1p(term_doc_matrix)


In [None]:
# Show the file name -> category mapping built while loading the corpus
print(doc_category_lookup)

{'MULTILINGUALNLP.pdf': 'NLP', 'ReinforcementLearningbasedNLP.pdf': 'NLP', 'GOPI.NLP_TERM.pdf': 'NLP', 'Article4-NLP-edit.pdf': 'NLP', 'Aslam2592023JERR107753NLP.pdf': 'NLP', 'Reviewpaper.pdf': 'NLP', '45.LeveragingNLPandMachineLearningforDataIntegrityinClinicalTrials.pdf': 'NLP', 'Machine_Learning_Advancements_in_SQL_Injection_Det.pdf': 'NLP', 'Combining_computer_vision_and_standardised_protoco.pdf': 'CV', 'Leaping_into_the_future_Current_application_and_fu.pdf': 'CV', 'Computer-visionbasedanalysisoftheneurosurgicalsceneAsystematicreview.pdf': 'CV', 'Virtual_reality_experiences_in_medical_education_A.pdf': 'Virtual Reality', 'Legal_aspects_of_functional_security_standardisati.pdf': 'Internet of Things'}


In [None]:
import math

def find_most_similar_category(test_document, term_doc_matrix_log, verbose=True):
    """Predict a category from the most similar training document.

    The test document is turned into a log-scaled term-count vector over the
    rows of ``term_doc_matrix_log`` and compared with every column by cosine
    similarity. The category of the best-matching column is looked up in the
    module-level ``doc_category_lookup`` dictionary.

    Args:
        test_document: Raw text of the document to classify. Terms are counted
            as case-sensitive whole-word regex matches in this raw text.
            NOTE(review): the matrix rows come from a separate tokenizer, so
            casing/cleaning may differ between the two sides - confirm.
        term_doc_matrix_log: DataFrame with terms as index and training
            documents as columns, holding log1p-scaled counts.
        verbose: When True (the default), print the same debugging output as
            earlier versions of this function.

    Returns:
        str: Category of the most similar document, or ``"Unknown Category"``
        when the matrix has no columns, when the test document shares no term
        with any training document, or when the best match has no entry in
        ``doc_category_lookup``.
    """
    doc_names = term_doc_matrix_log.columns
    if len(doc_names) == 0:
        # np.argmax raises on an empty sequence, so bail out explicitly.
        return "Unknown Category"

    # Walk the index itself rather than a set built from it: set iteration
    # order is not guaranteed to match the row order of the matrix, and the
    # two vectors must line up term by term for the dot product to be valid.
    test_counts = [
        sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(term), test_document))
        for term in term_doc_matrix_log.index
    ]

    if verbose:
        print(len(test_counts))

    # Same log(1 + count) scaling as the training matrix
    test_vector = np.log1p(np.asarray(test_counts, dtype=float))
    doc_vectors = term_doc_matrix_log.to_numpy(dtype=float)  # (terms, docs)

    # Cosine similarity against every column at once. A zero norm on either
    # side is mapped to similarity 0 instead of NaN, because np.argmax would
    # otherwise select the NaN entry.
    dot_products = test_vector @ doc_vectors
    denominators = np.linalg.norm(test_vector) * np.linalg.norm(doc_vectors, axis=0)
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,
    )

    if verbose:
        print("Similarities:", similarities.tolist())

    most_similar_category_index = int(np.argmax(similarities))

    if verbose:
        print("Most Similar Category Index:", most_similar_category_index)

    # All entries are non-negative, so a maximum of 0 means nothing matched.
    if similarities[most_similar_category_index] <= 0:
        return "Unknown Category"

    most_similar_doc = doc_names[most_similar_category_index]
    if verbose:
        print(most_similar_doc)
    return doc_category_lookup.get(most_similar_doc, "Unknown Category")


In [None]:

# Classify a paper from the Test folder, which is not one of the category
# folders loaded into the term-document matrix.
test_document_path = '/content/drive/MyDrive/papers/Test/VeterinaryDermatology-2023-Smith-Computervisionmodelforthedetectionofcaninepododermatitisandneoplasiaof.pdf'
test_document_content = read_pdf(test_document_path)

# Compare the extracted text against the log-scaled matrix and report the label
predicted_category = find_most_similar_category(test_document_content, term_doc_matrix_log)
print(f'category: {predicted_category}')

In [None]:

# Classify a second paper from the Test folder; this rebinds the same three
# notebook-level names used by the previous test cell.
test_document_path = '/content/drive/MyDrive/papers/Test/Clickbait_Post_Detection_using_NLP_for_Sustainable.pdf'
test_document_content = read_pdf(test_document_path)

# Compare the extracted text against the log-scaled matrix and report the label
predicted_category = find_most_similar_category(test_document_content, term_doc_matrix_log)
print(f'category: {predicted_category}')