In [35]:
!pip install pymongo
!pip install pdfplumber



In [38]:
# Step 1: Import Necessary Libraries
import requests
import os
import json
from pymongo import MongoClient
import pdfplumber
from collections import Counter
import re
import time
from concurrent.futures import ThreadPoolExecutor

In [40]:
# Step 2: Load the Dataset
file_path = 'C:/Users/Dell/Downloads/Dataset.json'  # Update with your local path
with open(file_path, 'r') as f:
    dataset = json.load(f)

print(dataset)

{'pdf1': 'https://digiscr.sci.gov.in/pdf_viewer?dir=YWRtaW4vanVkZ2VtZW50X2ZpbGUvanVkZ2VtZW50X3BkZi8xOTUwL3ZvbHVtZSAxL1BhcnQgSS9Db21taXNzaW9uZXIgb2YgSW5jb21lIFRheCwgV2VzdCBCZW5nYWxfQ2FsY3V0dGEgQWdlbmN5IEx0ZC5fMTY5NzYwNjMxMC5wZGY=', 'pdf2': 'https://digiscr.sci.gov.in/pdf_viewer?dir=YWRtaW4vanVkZ2VtZW50X2ZpbGUvanVkZ2VtZW50X3BkZi8xOTUyL3ZvbHVtZSAxL1BhcnQgSS90aGUgc3RhdGUgb2YgYmloYXJfbWFoYXJhamFkaGlyYWphIHNpciBrYW1lc2h3YXIgc2luZ2ggb2YgZGFyYmhhbmdhIGFuZCBvdGhlcnNfMTY5ODMxODQ0OC5wZGY=', 'pdf3': 'https://cdnbbsr.s3waas.gov.in/s380537a945c7aaa788ccfcdf1b99b5d8f/uploads/2024/07/20240716890312078.pdf', 'pdf4': 'https://www.mha.gov.in/sites/default/files/250883_english_01042024.pdf', 'pdf5': 'https://rbidocs.rbi.org.in/rdocs/PressRelease/PDFs/PR60974A2ED1DFDB84EC0B3AABFB8419E1431.PDF', 'pdf6': 'https://digiscr.sci.gov.in/pdf_viewer?dir=YWRtaW4vanVkZ2VtZW50X2ZpbGUvanVkZ2VtZW50X3BkZi8xOTYwL3ZvbHVtZSAxL1BhcnQgSS90aGUgdGF0YSBvaWwgbWlsbHMgY28uIGx0ZC5faXRzIHdvcmttZW4gYW5kIG90aGVyc18xNjk5MzMzODYyLnBkZg==

In [42]:
# Step 3: Setup MongoDB Connection
client = MongoClient('mongodb://localhost:27017/')
db = client['pdf_summarization']  # Use your actual database name

In [44]:
# Step 5: Function to Download and Save PDFs
def download_pdf(url, file_name):
    response = requests.get(url)
    file_path = os.path.join(download_dir, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    return file_path

In [48]:
# Step 6: Function to Parse PDF and Extract Text and Metadata
def parse_pdf(file_path):
    try:
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ''  # Extract text from each page

        # Metadata to be stored in MongoDB
        metadata = {
            'file_name': os.path.basename(file_path),
            'path': file_path,
            'size': os.path.getsize(file_path),
            'num_pages': len(pdf.pages)
        }

        # Insert metadata into MongoDB
        doc_id = db.pdfs.insert_one(metadata).inserted_id
        return {'_id': doc_id, 'text': text, 'metadata': metadata}
    except Exception as e:
        print(f"Error processing {file_path}: {e}")


In [50]:
# Step 7: Function to Summarize the Text
def summarize_text(text):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    word_count = Counter(text.split())
    top_sentences = sorted(sentences, key=lambda x: sum([word_count[word] for word in x.split()]), reverse=True)
    summary = " ".join(top_sentences[:5])  # Top 5 sentences for summary
    return summary

In [52]:
# Step 8: Function to Extract Keywords
def extract_keywords(text):
    words = re.findall(r'\b\w+\b', text.lower())
    stop_words = set(Counter(words).most_common(100))  # Add your stop words list here
    keywords = [word for word in words if word not in stop_words and len(word) > 3]
    common_keywords = Counter(keywords).most_common(10)
    return [keyword[0] for keyword in common_keywords]


In [54]:
# Step 9: Function to Process a Single PDF
def process_single_pdf(i, url):
    file_name = f"pdf_{i + 1}.pdf"
    pdf_path = download_pdf(url, file_name)  # Download the PDF
    result = parse_pdf(pdf_path)  # Parse and store metadata in MongoDB
    
    if result:
        # Print the extracted text before summarizing
        print(f"\nExtracted Text from {file_name}:\n")
        print(result['text'])
        
        summary = summarize_text(result['text'])
        keywords = extract_keywords(result['text'])
        # Update MongoDB with summary and keywords
        db.pdfs.update_one(
            {'_id': result['_id']},
            {'$set': {'summary': summary, 'keywords': keywords}}
        )
        print(f"Processed {file_name} and updated MongoDB.")

In [56]:
# Step 10: Loop Through Dataset and Process in Parallel
def process_dataset(dataset):
    start_time = time.time()  # Start timing the overall process

    # Using ThreadPoolExecutor for concurrency
    with ThreadPoolExecutor() as executor:
        executor.map(lambda i_url: process_single_pdf(i_url[0], i_url[1]), enumerate(dataset.items()))

    end_time = time.time()  # End timing the overall process
    print(f"\nTotal processing time: {end_time - start_time:.2f} seconds")


In [58]:
# Step 11: Run the Dataset Processing
process_dataset(dataset)


Total processing time: 0.02 seconds
