<a href="https://colab.research.google.com/github/Darshansundeep/AI/blob/main/Program_for_Word_Search_in_a_folder_for_PDF_documents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` is first on the shell PATH.
%pip install nltk
# Pinned to the version this notebook was last run against (see the recorded
# install output below).
%pip install PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [7]:
from PyPDF2 import PdfReader
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import os


In [None]:
# Tokenizer models and stop-word list needed by word_tokenize / stopwords below.
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource in word_tokenize instead of
# 'punkt'; fetch both so the notebook works across NLTK versions. On an older
# NLTK that does not know 'punkt_tab' this call only reports an error message.
nltk.download('punkt_tab')
nltk.download('stopwords')

def count_words(text):
    """Return how many tokens of ``text`` consist solely of letters.

    ``text`` is split with NLTK's ``word_tokenize``; every token for which
    ``str.isalpha()`` is true counts as one word, all other tokens are ignored.
    """
    return sum(1 for token in word_tokenize(text) if token.isalpha())

def get_topics(text, num_topics=5):
    """Return the ``num_topics`` most frequent non-stop-words in ``text``.

    The text is tokenized with ``word_tokenize``. Only alphabetic tokens are
    kept; they are lowercased and dropped if they appear in NLTK's English
    stop-word list. The result is a list of ``(word, count)`` tuples as
    produced by ``Counter.most_common``.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    frequencies = Counter()
    for token in tokens:
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Skip stop words (and anything that is a substring of the ASCII
        # punctuation set) before counting.
        if lowered in english_stop_words or lowered in string.punctuation:
            continue
        frequencies[lowered] += 1

    return frequencies.most_common(num_topics)

from PyPDF2 import PdfReader

def analyze_pdf(file_path):
    """Print a short summary of the PDF at ``file_path``.

    The summary consists of the file path, the number of pages, the total
    word count (sum of ``count_words`` over every page) and the most common
    words as returned by ``get_topics`` for the whole document.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        None. The summary is written to standard output.
    """
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        # Defensive `or ""`: a page without extractable text (e.g. a scanned
        # image) must not break the string handling below.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    total_words = sum(count_words(page_text) for page_text in page_texts)

    # Join pages with a newline so the last word of one page and the first
    # word of the next are not fused into a single bogus token.
    topics = get_topics("\n".join(page_texts))

    print(f"File Name: {file_path}")
    print(f"Number of Pages: {num_pages}")
    print(f"Total Word Count: {total_words}")
    print(f"Topics (approx.): {topics}")

def search_in_pdf(file_path, search_text):
    """Count case-insensitive occurrences of ``search_text`` in a PDF.

    Each page's extracted text is searched separately, so a match that spans
    a page boundary is not counted.

    Args:
        file_path: Path of the PDF file to search.
        search_text: Word or phrase to look for.

    Returns:
        Total number of non-overlapping occurrences over all pages. An empty
        ``search_text`` yields 0 without opening the file.
    """
    # str.count("") returns len(text) + 1, which would report nonsense hits
    # for an empty query.
    if not search_text:
        return 0

    needle = search_text.lower()
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        # Defensive `or ""`: tolerate pages that yield no extractable text.
        return sum(
            (page.extract_text() or "").lower().count(needle)
            for page in reader.pages
        )


def analyze_folder(folder_path):
    """Summarize every PDF in ``folder_path``, then run an interactive search.

    First prints the number of PDF files found and calls ``analyze_pdf`` on
    each one. Then repeatedly prompts for a search query and, for every PDF
    with at least one hit, prints the hit count from ``search_in_pdf``.
    Typing ``exit`` (any letter case) ends the loop.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        None. All results are written to standard output.
    """
    # Match the extension case-insensitively (".PDF" is common), skip
    # directories that merely end in ".pdf", and sort because os.listdir
    # returns entries in arbitrary order.
    pdf_files = sorted(
        entry for entry in os.listdir(folder_path)
        if entry.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, entry))
    )
    num_pdf_files = len(pdf_files)

    print(f"Number of PDF documents in the folder: {num_pdf_files}")

    for file_name in pdf_files:
        file_path = os.path.join(folder_path, file_name)
        print(f"\nAnalyzing file: {file_name}")
        analyze_pdf(file_path)

    while True:
        search_text = input("\nEnter your search query (word or sentence), or type 'exit' to quit: ").strip()
        if search_text.lower() == 'exit':
            break
        # An empty query matches at every character position; ask again.
        if not search_text:
            continue

        found_any = False
        for file_name in pdf_files:
            file_path = os.path.join(folder_path, file_name)
            count = search_in_pdf(file_path, search_text)
            if count > 0:
                found_any = True
                print(f"'{search_text}' found {count} times in file: {file_name}")

        # Without this a query with no hits produces no feedback at all.
        if not found_any:
            print(f"'{search_text}' was not found in any PDF document.")


# Example usage
# The path points into a Google Drive mount on Colab; nothing in this notebook
# mounts Drive, so check for the folder and fail with a clear message instead
# of a FileNotFoundError traceback from os.listdir.
folder_path = '/content/drive/MyDrive/Files'
if os.path.isdir(folder_path):
    analyze_folder(folder_path)
else:
    print(f"Folder not found: {folder_path!r}. Mount Google Drive or set folder_path to an existing directory.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of PDF documents in the folder: 2

Analyzing file: Bose Soundbar.pdf
File Name: /content/drive/MyDrive/Files/Bose Soundbar.pdf
Number of Pages: 64
Total Word Count: 9294
Topics (approx.): [('soundbar', 203), ('bose', 156), ('music', 99), ('app', 87), ('device', 85)]

Analyzing file: Malibu Owner's Manual.pdf
File Name: /content/drive/MyDrive/Files/Malibu Owner's Manual.pdf
Number of Pages: 419
Total Word Count: 110953
Topics (approx.): [('vehicle', 1910), ('system', 1048), ('see', 734), ('e', 609), ('engine', 601)]

Enter your search query (word or sentence), or type 'exit' to quit: BLUETOOTH
'BLUETOOTH' found 56 times in file: Bose Soundbar.pdf
'BLUETOOTH' found 190 times in file: Malibu Owner's Manual.pdf

Enter your search query (word or sentence), or type 'exit' to quit: CRUISE
'CRUISE' found 119 times in file: Malibu Owner's Manual.pdf
