<a href="https://colab.research.google.com/github/AK-Singh8/sem6_minor/blob/main/Summarize_Research_Papers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Imported in this cell as well because it runs before the main imports cell;
# without it a fresh "Restart & Run All" stops here with NameError on `nltk`.
import nltk

# WordNet data backs `wordnet.synsets` and `WordNetLemmatizer`, both used by
# the keyword extraction step further down.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
import io
import logging
import re

import fitz  # PyMuPDF for extracting text from PDFs
import ipywidgets as widgets
import nltk
import numpy as np
from IPython.display import display
from keybert import KeyBERT
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

In [29]:
# Load models
# One MiniLM sentence encoder is shared by the summarizer and by KeyBERT, so
# the weights are loaded once instead of twice.
model = SentenceTransformer('all-MiniLM-L6-v2')
key_model = KeyBERT(model=model)
# Hugging Face model id (not a loaded model); it is handed to `pipeline(...)`
# by sentiment_analysis() when that task is run.
sentiment_model = "distilbert-base-uncased-finetuned-sst-2-english"
# Used to reduce extracted keywords to their base form.
lemmatizer = WordNetLemmatizer()

# Input source selector; the empty value is the "nothing chosen yet" placeholder
input_selection = widgets.Dropdown(
    description="Input:",
    options=[
        ("Select Input Type", ""),
        ("Upload PDF", "pdf"),
        ("Enter Text", "text"),
    ],
    disabled=False,
)

# Uploader restricted to a single .pdf file
upload_widget = widgets.FileUpload(multiple=False, accept='.pdf')

# Free-text entry box for pasting text directly
text_area = widgets.Textarea(
    layout=widgets.Layout(height="150px", width="100%"),
    placeholder="Enter your text here...",
)

# Task selector; starts disabled and is enabled by the handlers defined below
task_dropdown = widgets.Dropdown(
    description="Task:",
    options=[
        ("Select an Option", ""),
        ("Summarizer", "summarizer"),
        ("Keywords", "keywords"),
        ("Sentiment Analysis", "sentiment"),
    ],
    disabled=True,
)

# Output area that every handler clears and prints into
output = widgets.Output()

# Function to extract text from uploaded PDF
def extract_text_from_pdf(uploaded_file):
    """Return the plain text of an uploaded PDF, or "" if it cannot be read.

    Args:
        uploaded_file: Mapping describing one uploaded file; only its
            'content' entry (the raw PDF data) is read.

    Returns:
        str: Text of all pages joined with newlines and stripped. An empty
        string means extraction failed or the PDF holds no extractable text.
        Returning "" (rather than an error message) lets the callers'
        `if not abstract` checks fire, so an error string is never
        summarised or analysed as if it were the document.
    """
    try:
        # bytes() leaves bytes untouched and also accepts a memoryview, which
        # is what ipywidgets 8 exposes as the upload content.
        pdf_bytes = bytes(uploaded_file['content'])
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        return text.strip()
    except Exception as e:
        # Best-effort by design: record the reason, report failure as "".
        logging.getLogger(__name__).warning("Error extracting text: %s", e)
        return ""

# Summarization function
def summarizer(abstract, top_n=3):
    """Print an extractive summary of `abstract` into the shared output widget.

    Sentences are embedded with the sentence encoder `model`; the ones whose
    embedding is most similar (cosine) to the mean sentence embedding are
    printed, most similar first.

    Args:
        abstract: Text to summarise.
        top_n: Maximum number of sentences to print (default 3). It is capped
            at the number of sentences found; values below 1 print none.

    Returns:
        None. The output widget is cleared and the result is printed into it.
    """
    with output:
        output.clear_output()
        if not abstract.strip():
            print("No text provided for summarization.")
            return

        # Collapse line breaks first (PDF text wraps mid-sentence), then split
        # only where sentence-ending punctuation is followed by whitespace, so
        # decimals such as "92.5" stay in one piece. Abbreviations followed by
        # a space ("et al. ") are still treated as sentence ends.
        normalized_text = " ".join(abstract.split())
        papers = [
            sentence
            for sentence in re.split(r"(?<=[.!?])\s+", normalized_text)
            if any(ch.isalnum() for ch in sentence)  # drop punctuation-only fragments
        ]

        if not papers:
            print("No valid sentences found for summarization.")
            return

        sentence_embeddings = np.array(model.encode(papers))
        mean_embedding = np.mean(sentence_embeddings, axis=0)
        similarities = cosine_similarity([mean_embedding], sentence_embeddings)[0]

        # Reverse before slicing: a `[-top_n:]` slice would return every
        # sentence when top_n is 0.
        top_n = max(0, min(top_n, len(papers)))
        ranked_indices = similarities.argsort()[::-1]
        summary = [papers[i] for i in ranked_indices[:top_n]]

        print("\nSummary:")
        for sentence in summary:
            print(sentence)

# Keyword extraction function
def keywords(abstract):
    """Print up to five unique, lemmatized keywords for `abstract`.

    Ten single-word candidates are requested from KeyBERT (`key_model`), each
    is lemmatized, duplicates created by lemmatization are removed, and the
    first five that remain are printed into the shared output widget.

    Args:
        abstract: Text to extract keywords from.

    Returns:
        None. The output widget is cleared and the result is printed into it.
    """
    candidate_count = 10  # ask for extra candidates: lemmatization can merge some
    shown_count = 5

    with output:
        output.clear_output()
        if not abstract.strip():
            print("No text provided for keyword extraction.")
            return

        def get_wordnet_pos(word):
            # Part of speech of the word's first WordNet synset; noun when WordNet has none.
            tag = wordnet.synsets(word)
            return tag[0].pos() if tag else wordnet.NOUN

        extracted_keywords = key_model.extract_keywords(
            abstract,
            keyphrase_ngram_range=(1, 1),
            stop_words='english',
            top_n=candidate_count,
        )
        # dict.fromkeys removes duplicates while keeping the order KeyBERT
        # returned. A set would scramble that order, making the slice below
        # pick an arbitrary (and run-dependent) subset.
        lemmatized_keywords = list(dict.fromkeys(
            lemmatizer.lemmatize(keyword, get_wordnet_pos(keyword))
            for keyword, _ in extracted_keywords
        ))

        print("\nUnique Keywords:", lemmatized_keywords[:shown_count])

# Sentiment analysis function
# Loaded pipelines keyed by model id, so the weights are loaded once per kernel
# instead of on every task selection.
_sentiment_pipelines = {}


def sentiment_analysis(abstract):
    """Print the sentiment classification of `abstract` into the output widget.

    The Hugging Face "sentiment-analysis" pipeline for `sentiment_model` is
    built on first use and reused afterwards.

    Args:
        abstract: Text to classify.

    Returns:
        None. The output widget is cleared and the raw pipeline result is
        printed into it.
    """
    with output:
        output.clear_output()
        if not abstract.strip():
            print("No text provided for sentiment analysis.")
            return

        if sentiment_model not in _sentiment_pipelines:
            _sentiment_pipelines[sentiment_model] = pipeline("sentiment-analysis", model=sentiment_model)
        sentiment_analysis_model = _sentiment_pipelines[sentiment_model]

        # truncation=True clips the input to the model's maximum sequence
        # length; without it, paper-length text makes the model call fail.
        # Only the leading part of a long text is therefore classified.
        sentiment_results = sentiment_analysis_model(abstract, truncation=True)

        print("\nSentiment Analysis Result:", sentiment_results)

# Handle input type selection
def handle_input_selection(change):
    """Show the widget for the chosen input type and sync the task dropdown.

    The upload widget is shown for "pdf", the text area for "text", and both
    are hidden otherwise. The task dropdown is enabled only when the chosen
    source already has something to process: a previously uploaded file is
    remembered when switching back to "pdf", and an empty text area keeps the
    dropdown disabled, matching handle_text_change.

    Args:
        change: Change notification passed by `observe`; not used, the
            current widget values are read directly.
    """
    selected = input_selection.value

    upload_widget.layout.display = "block" if selected == "pdf" else "none"
    text_area.layout.display = "block" if selected == "text" else "none"

    if selected == "pdf":
        has_input = bool(upload_widget.value)
    elif selected == "text":
        has_input = bool(text_area.value.strip())
    else:
        has_input = False

    task_dropdown.disabled = not has_input

# Function to handle uploaded file
def handle_upload(change):
    """Extract and show the text of the newly uploaded PDF.

    On success the extracted text is printed into the output widget and the
    task dropdown is enabled. If there is no file, or no text could be
    extracted, a message is printed and the dropdown is disabled so a task
    cannot be run against nothing.

    Args:
        change: Change notification passed by `observe`; not used, the upload
            widget's current value is read directly.
    """
    uploads = upload_widget.value
    # ipywidgets 7 exposes {filename: file_dict}; ipywidgets 8 exposes a tuple
    # of file dicts. Accept both.
    files = list(uploads.values()) if isinstance(uploads, dict) else list(uploads or ())
    uploaded_file = files[0] if files else None

    if not uploaded_file:
        with output:
            output.clear_output()
            print("No file uploaded.")
        task_dropdown.disabled = True
        return

    abstract = extract_text_from_pdf(uploaded_file)
    if not abstract:
        with output:
            output.clear_output()
            print("Failed to extract text from the PDF.")
        task_dropdown.disabled = True
        return

    with output:
        output.clear_output()
        print("\nExtracted Text:\n", abstract)

    task_dropdown.disabled = False  # Enable task selection after text extraction

# Keep task selection in step with the manual text box
def handle_text_change(change):
    """Enable the task dropdown only while the text area holds non-blank text.

    Args:
        change: Change notification passed by `observe`; not used, the text
            area's current value is read directly.
    """
    has_text = bool(text_area.value.strip())
    task_dropdown.disabled = not has_text

# Function to handle task selection
def handle_task_selection(change):
    """Run the selected task on the text from the currently selected input.

    The text comes from the uploaded PDF (input type "pdf") or from the text
    area (input type "text"). It is then passed to summarizer, keywords or
    sentiment_analysis according to the task dropdown. When the input type,
    the file, the text or the task is missing, a message is printed into the
    output widget instead.

    Args:
        change: Change notification passed by `observe`; not used, the
            current widget values are read directly.
    """
    def show_message(message):
        # Replace whatever the output widget currently shows with one message.
        with output:
            output.clear_output()
            print(message)

    source = input_selection.value
    if source == "pdf":
        uploads = upload_widget.value
        # ipywidgets 7 exposes {filename: file_dict}; ipywidgets 8 exposes a
        # tuple of file dicts. Accept both.
        files = list(uploads.values()) if isinstance(uploads, dict) else list(uploads or ())
        uploaded_file = files[0] if files else None
        if not uploaded_file:
            show_message("No file uploaded.")
            return
        abstract = extract_text_from_pdf(uploaded_file)
    elif source == "text":
        abstract = text_area.value
    else:
        show_message("Please select an input type first.")
        return

    if not abstract:
        show_message("No text available for processing.")
        return

    # Dropdown value -> function that performs the task
    task_handlers = {
        "summarizer": summarizer,
        "keywords": keywords,
        "sentiment": sentiment_analysis,
    }
    run_task = task_handlers.get(task_dropdown.value)
    if run_task is None:
        show_message("Please select a valid option.")
    else:
        run_task(abstract)

# Wire each widget's `value` changes to its handler
for observed_widget, value_handler in (
    (input_selection, handle_input_selection),
    (upload_widget, handle_upload),
    (text_area, handle_text_change),
    (task_dropdown, handle_task_selection),
):
    observed_widget.observe(value_handler, names='value')

# Both input widgets start hidden; handle_input_selection reveals the chosen one
for hidden_widget in (upload_widget, text_area):
    hidden_widget.layout.display = "none"

# Render the UI in the order the user works through it
display(input_selection, upload_widget, text_area, task_dropdown, output)


Dropdown(description='Input:', options=(('Select Input Type', ''), ('Upload PDF', 'pdf'), ('Enter Text', 'text…

FileUpload(value={}, accept='.pdf', description='Upload', layout=Layout(display='none'))

Textarea(value='', layout=Layout(display='none', height='150px', width='100%'), placeholder='Enter your text h…

Dropdown(description='Task:', disabled=True, options=(('Select an Option', ''), ('Summarizer', 'summarizer'), …

Output()