In [10]:
! pip install nltk,pymongo

Collecting nltk


[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.9.11-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------  41.0/41.5 kB ? eta -:--:--
     ---------------------------------------  41.0/41.5 kB ? eta -:--:--
     ---------------------------------------  41.0/41.5 kB ? eta -:--:--
     ---------------------------------------  41.0/41.5 kB ? eta -:--:--
     -------------------------------------- 41.5/41.5 kB 100.3 kB/s eta 0:00:00
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.9.11-cp311-cp311-win_amd64.whl (274 kB)
   ---------------------------------------- 0.0/274.0 kB ? eta -:--:--
   -- ------------------------------------- 20.5/274.0 kB ? eta -:--:--
   ------- ------------------------------- 51.2/274.0 kB 525.1 kB/s eta 0:00:01
   ------- ------------------------------- 51.

In [1]:
import pandas as pd

def load_data(file_link):
    """
    Loads data from a specified file link or local path and returns it as a DataFrame.
    
    Args:
        file_link: The link (URL or local path) to the file. The file can be in JSON, CSV, or Excel format.
    
    Returns:
        data: A pandas DataFrame containing the loaded data. If the file is invalid or cannot be loaded, it returns None.
    """
    valid_local_drives = ('D:', 'C:', 'E:')  # Local drive letters to check for file paths
    valid_extensions = ('.json', '.csv', '.xlsx')  # Valid file extensions

    # Check if the file link is a URL
    if file_link.startswith('https') and file_link.endswith(valid_extensions):
        try:
            # Load the data based on its extension
            if file_link.endswith('.json'):
                data = pd.read_json(file_link, typ='series')
            elif file_link.endswith('.csv'):
                data = pd.read_csv(file_link)
            elif file_link.endswith('.xlsx'):
                data = pd.read_excel(file_link)
            else:
                print("Unsupported file format.")
                return None
        except Exception as e:
            print(f"Error loading file from URL: {e}")
            return None

    # Check if the file link is a local file path
    elif file_link.startswith(valid_local_drives) and file_link.endswith(valid_extensions):
        try:
            # Load the data based on its extension
            if file_link.endswith('.json'):
                data = pd.read_json(file_link, typ='series')
            elif file_link.endswith('.csv'):
                data = pd.read_csv(file_link)
            elif file_link.endswith('.xlsx'):
                data = pd.read_excel(file_link)
            else:
                print("Unsupported file format.")
                return None
        except Exception as e:
            print(f"Error loading file from local path: {e}")
            return None

    else:
        print('Invalid link or file path. Please provide a valid link or path.')

    # If the data is a Series, convert it to a DataFrame
    if isinstance(data, pd.Series):
        data = data.to_frame('Values')

    return data


# Example usage
link = input('''Enter the file link (e.g., URL or local path): https://raw.githubusercontent.com/Devian158/AI-Internship-Task/main/Dataset.json
''')

data = load_data(link)

# Display the first few rows of the DataFrame if data is successfully loaded
if data is not None:
    data.head()


In [2]:
data.head()

Unnamed: 0,Values
pdf1,https://digiscr.sci.gov.in/pdf_viewer?dir=YWRt...
pdf2,https://digiscr.sci.gov.in/pdf_viewer?dir=YWRt...
pdf3,https://cdnbbsr.s3waas.gov.in/s380537a945c7aaa...
pdf4,https://www.mha.gov.in/sites/default/files/250...
pdf5,https://rbidocs.rbi.org.in/rdocs/PressRelease/...


In [3]:
import os
import requests

def download_and_store(data, folder_name):
    """
    Downloads all data from the links in the DataFrame and stores them in a specified folder.
    
    Args:
        data: A pandas DataFrame with a column containing the links to download.
        folder_name: The name of the folder where the downloaded data will be stored.
    """
    
    # Create the folder if it does not exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Iterate over each row in the DataFrame to get the links
    for index, row in data.iterrows():
        link = row['Values']  #the download links are stored in the 'Values' column
        
        try:
            # Send an HTTP GET request to download the file from the link
            response = requests.get(link, stream=True)
            
            # Check if the request was successful (HTTP status code 200)
            if response.status_code == 200:
                # Construct the file name and path to store the downloaded file
                file_name = os.path.join(folder_name, f"{index}.pdf")  # Assuming the files are PDFs. Adjust as needed.
                
                # Write the content of the file in chunks to avoid memory issues with large files
                with open(file_name, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                
                print(f"Downloaded and saved {link} as {file_name}")
            else:
                # If the request fails, log the status code
                print(f"Failed to download {link} (Status code: {response.status_code})")
        
        except Exception as e:
            # If there is any error during the download, print the error message
            print(f"Error downloading {link}: {e}")

# Specify the name of the folder where the downloaded files will be stored
folder_name = "downloaded_data"

# Download and store all data from the DataFrame
download_and_store(data, folder_name)


Downloaded and saved https://digiscr.sci.gov.in/pdf_viewer?dir=YWRtaW4vanVkZ2VtZW50X2ZpbGUvanVkZ2VtZW50X3BkZi8xOTUwL3ZvbHVtZSAxL1BhcnQgSS9Db21taXNzaW9uZXIgb2YgSW5jb21lIFRheCwgV2VzdCBCZW5nYWxfQ2FsY3V0dGEgQWdlbmN5IEx0ZC5fMTY5NzYwNjMxMC5wZGY= as downloaded_data\pdf1.pdf
Downloaded and saved https://digiscr.sci.gov.in/pdf_viewer?dir=YWRtaW4vanVkZ2VtZW50X2ZpbGUvanVkZ2VtZW50X3BkZi8xOTUyL3ZvbHVtZSAxL1BhcnQgSS90aGUgc3RhdGUgb2YgYmloYXJfbWFoYXJhamFkaGlyYWphIHNpciBrYW1lc2h3YXIgc2luZ2ggb2YgZGFyYmhhbmdhIGFuZCBvdGhlcnNfMTY5ODMxODQ0OC5wZGY= as downloaded_data\pdf2.pdf
Downloaded and saved https://cdnbbsr.s3waas.gov.in/s380537a945c7aaa788ccfcdf1b99b5d8f/uploads/2024/07/20240716890312078.pdf as downloaded_data\pdf3.pdf
Downloaded and saved https://www.mha.gov.in/sites/default/files/250883_english_01042024.pdf as downloaded_data\pdf4.pdf
Downloaded and saved https://rbidocs.rbi.org.in/rdocs/PressRelease/PDFs/PR60974A2ED1DFDB84EC0B3AABFB8419E1431.PDF as downloaded_data\pdf5.pdf
Downloaded and saved https

In [4]:
import os
import pdfplumber
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
import nltk
from pymongo import MongoClient
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.corpus import stopwords

stopwords.words('english')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
import os

def get_pdf_files(folder_path):
    """
    Retrieve all PDF files from a given folder.

    Args:
        folder_path (str): The path to the folder containing PDF files.

    Returns:
        list: A list of file paths for all PDF files in the folder.
    """
    pdf_files = [
        os.path.join(folder_path, file)
        for file in os.listdir(folder_path)
        if file.endswith('.pdf')
    ]
    
    return pdf_files


In [6]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        tuple: A tuple containing the extracted text (str) and the number of pages (int).
    """
    text = ""
    
    # Open the PDF file using pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        num_pages = len(pdf.pages)  # Get the total number of pages
        
        # Extract text from each page
        for page in pdf.pages:
            text += page.extract_text() or ""  # Handle pages with no text

    return text, num_pages


In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter

def custom_summarization(text, num_sentences=3):
    """
    Summarize the text by selecting the top N ranked sentences.

    Args:
        text (str): The input text to summarize.
        num_sentences (int): The number of sentences to include in the summary (default is 3).

    Returns:
        str: A summary composed of the top-ranked sentences.
    """
    # Tokenize the text into sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())

    # Create a frequency distribution of words
    word_freq = Counter(words)

    # Rank sentences based on the sum of word frequencies
    ranked_sentences = sorted(
        sentences,
        key=lambda sentence: sum(word_freq[word] for word in word_tokenize(sentence.lower())),
        reverse=True
    )

    # Select the top N ranked sentences for the summary
    summary = " ".join(ranked_sentences[:num_sentences])
    
    return summary


In [8]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    """
    Remove stop words from the input text.

    Args:
        text (str): The input text from which to remove stop words.

    Returns:
        str: The text with stop words removed.
    """
    # Create a set of English stop words for faster lookup
    stop_words = set(stopwords.words('english'))
    
    # Split the text into words and filter out stop words
    filtered_words = [
        word for word in text.split() if word.lower() not in stop_words
    ]
    
    # Join the filtered words back into a single string
    return " ".join(filtered_words)


In [9]:

def extract_keywords(text, top_n=5):
    """
    Extract top N keywords based on frequency.
    
    Parameters:
    text (str): The input text from which to extract keywords.
    top_n (int): The number of top keywords to return. Default is 5.
    
    Returns:
    list: A list of top N keywords.
    """
    if not text:
        return []
    
    stop_words = set(stopwords.words('english'))
    
    # Remove stopwords and convert text to lowercase
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    text = text.lower()
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Calculate word frequencies
    word_freq = Counter(words)
    
    # Return top N keywords that are alphabetic
    return [word for word, freq in word_freq.most_common(top_n) if word.isalpha()]


In [10]:
import os
from pymongo import MongoClient

def save_to_mongo(pdf_path, summary, keywords):
    """
    Save the processed data to MongoDB.

    Args:
        pdf_path (str): The path to the PDF file.
        summary (str): The summarized text of the PDF.
        keywords (list): A list of keywords associated with the PDF.
    """
    # Extract the PDF name from the file path without the extension
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    
    # Connect to MongoDB
    client = MongoClient('localhost', 27017)
    db = client['pdf_summarization53']
    collection = db['summaries']
    
    # Insert the data into the collection
    collection.insert_one({
        'pdf_name': pdf_name,
        'summary': summary,
        'keywords': keywords
    })


In [11]:
def process_single_pdf(pdf_path):
    """
    Process a single PDF file to extract text, summarize it, and save the data to MongoDB.

    Args:
        pdf_path (str): The path to the PDF file to process.
    """
    try:
        # Extract text and number of pages from the PDF file
        text, num_pages = extract_text_from_pdf(pdf_path)
        
        # Determine the number of sentences for the summary based on the number of pages
        if num_pages <= 10:
            summary = custom_summarization(text, num_sentences=3)  # For PDFs with 10 or fewer pages, summarize with 3 sentences
        elif 10 < num_pages <= 30:
            summary = custom_summarization(text, num_sentences=5)  # For PDFs with 11 to 30 pages, summarize with 5 sentences
        else:
            summary = custom_summarization(text, num_sentences=10)  # For PDFs with more than 30 pages, summarize with 10 sentences
        
        # Extract keywords from the text
        keywords = extract_keywords(text)
        
        # Save the PDF name, summary, and keywords to MongoDB
        save_to_mongo(pdf_path, summary, keywords)
    
    except Exception as e:
        # Handle exceptions that may occur during processing and print an error message
        print(f"Error processing {pdf_path}: {str(e)}")


In [12]:
from concurrent.futures import ThreadPoolExecutor

def process_pdf_concurrently(pdf_paths):
    """
    Process multiple PDF files concurrently using multithreading.

    Args:
        pdf_paths (list): A list of paths to PDF files to process.
    """
    # Create a ThreadPoolExecutor to manage a pool of threads for concurrent processing
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map the process_single_pdf function to the list of PDF paths
        # This will execute process_single_pdf for each PDF path concurrently
        executor.map(process_single_pdf, pdf_paths)


In [13]:
def run_pipeline(folder_path):
    """Main pipeline to process all PDFs in a folder."""
    pdf_paths = get_pdf_files(folder_path)
    process_pdf_concurrently(pdf_paths)



In [None]:
# Run the pipeline (Provide the folder path)
n=r'D:\Deep learning model\nlp Projects\text\downloaded_data'
run_pipeline(n)

In [None]:
https://github.com/12ashwani/text_pipeline.git