In [None]:
# !pip install spacy networkx
# !python -m spacy download en_core_web_sm

In [1]:
import tkinter as tk
from tkinter import scrolledtext, messagebox, ttk
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import networkx as nx
import numpy as np
import streamlit as st

--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy, cupy-cuda12x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



***Dont use this in collab***

In [None]:
class TextSummarizerApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Text Summarizer")
        self.root.geometry("800x600")
        self.root.configure(bg="#f5f5f5")
        
        # Initialize spaCy model
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            messagebox.showerror("Model Error", "Please install the spaCy model by running:\npython -m spacy download en_core_web_sm")
            root.destroy()
            return
        
        self.create_widgets()
    
    def create_widgets(self):
        # Header
        header_frame = tk.Frame(self.root, bg="#f5f5f5")
        header_frame.pack(fill=tk.X, padx=10, pady=10)
        
        tk.Label(
            header_frame, 
            text="TextRank Summarizer", 
            font=("Arial", 18, "bold"),
            bg="#f5f5f5"
        ).pack(side=tk.LEFT)
        
        # Main content
        content_frame = tk.Frame(self.root, bg="#f5f5f5")
        content_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        
        # Left panel for input
        left_frame = tk.LabelFrame(content_frame, text="Original Text", bg="#f5f5f5", font=("Arial", 10))
        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5)
        
        self.input_text = scrolledtext.ScrolledText(left_frame, wrap=tk.WORD, font=("Arial", 11))
        self.input_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        
        # Right panel for output
        right_frame = tk.LabelFrame(content_frame, text="Summary", bg="#f5f5f5", font=("Arial", 10))
        right_frame.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True, padx=5, pady=5)
        
        self.output_text = scrolledtext.ScrolledText(right_frame, wrap=tk.WORD, font=("Arial", 11))
        self.output_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        
        # Controls panel
        controls_frame = tk.Frame(self.root, bg="#f5f5f5")
        controls_frame.pack(fill=tk.X, padx=10, pady=10)
        
        # Summary length selection
        tk.Label(
            controls_frame, 
            text="Summary length:", 
            bg="#f5f5f5", 
            font=("Arial", 10)
        ).pack(side=tk.LEFT, padx=(0, 5))
        
        self.summary_percent = tk.StringVar(value="30%")
        summary_options = ["10%", "20%", "30%", "40%", "50%"]
        summary_dropdown = ttk.Combobox(
            controls_frame, 
            textvariable=self.summary_percent, 
            values=summary_options, 
            width=5,
            state="readonly"
        )
        summary_dropdown.pack(side=tk.LEFT, padx=(0, 10))
        
        # Buttons
        self.summarize_btn = tk.Button(
            controls_frame, 
            text="Summarize", 
            command=self.summarize_text,
            bg="#4CAF50", 
            fg="white",
            font=("Arial", 10, "bold"),
            padx=15,
            relief=tk.RAISED,
            borderwidth=2
        )
        self.summarize_btn.pack(side=tk.LEFT, padx=5)
        
        self.clear_btn = tk.Button(
            controls_frame, 
            text="Clear", 
            command=self.clear_fields,
            bg="#f44336", 
            fg="white",
            font=("Arial", 10, "bold"),
            padx=15,
            relief=tk.RAISED,
            borderwidth=2
        )
        self.clear_btn.pack(side=tk.LEFT, padx=5)
        
        # Status bar
        self.status_var = tk.StringVar()
        self.status_var.set("Ready")
        self.status_bar = tk.Label(
            self.root, 
            textvariable=self.status_var, 
            anchor=tk.W, 
            bg="#e0e0e0", 
            relief=tk.SUNKEN,
            padx=5
        )
        self.status_bar.pack(side=tk.BOTTOM, fill=tk.X)
    
    def summarize_text(self):
        """Generate a summary of the input text using TextRank"""
        text = self.input_text.get("1.0", tk.END).strip()
        
        if not text:
            messagebox.showwarning("Warning", "Please enter text to summarize.")
            return
        
        # Update status
        self.status_var.set("Summarizing...")
        self.root.update_idletasks()
        
        try:
            # Parse percentage
            percent = int(self.summary_percent.get().strip('%'))
            summary = self.textrank_summary(text, percent/100)
            
            # Display summary
            self.output_text.delete("1.0", tk.END)
            self.output_text.insert(tk.END, summary)
            
            # Update status
            self.status_var.set(f"Summary complete. ({len(summary.split())} words)")
        
        except Exception as e:
            messagebox.showerror("Error", f"An error occurred: {str(e)}")
            self.status_var.set("Error occurred during summarization.")
    
    def textrank_summary(self, text, per):
        """Generate text summary using TextRank algorithm"""
        doc = self.nlp(text)
        sentences = list(doc.sents)
        
        if len(sentences) <= 1:
            return text
        
        # Create sentence vectors using spaCy's word vectors
        sentence_vectors = []
        for sent in sentences:
            # Skip sentences with no words with vectors
            if not any(token.has_vector for token in sent):
                sent_vec = np.zeros((len(sent), 96))  # Default embedding dimension
            else:
                words_with_vectors = [token.vector for token in sent if token.has_vector]
                if not words_with_vectors:
                    sent_vec = np.zeros(96)  # Default dimension
                else:
                    sent_vec = np.mean(words_with_vectors, axis=0)
            sentence_vectors.append(sent_vec)
        
        # Create similarity matrix
        sim_mat = np.zeros([len(sentences), len(sentences)])
        
        # Fill the similarity matrix
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    # Make sure we don't divide by zero
                    if np.linalg.norm(sentence_vectors[i]) * np.linalg.norm(sentence_vectors[j]) == 0:
                        sim_mat[i][j] = 0
                    else:
                        sim_mat[i][j] = self._cosine_similarity(sentence_vectors[i], sentence_vectors[j])
        
        # Create networkx graph and add edges with weights
        nx_graph = nx.from_numpy_array(sim_mat)
        
        # Apply PageRank algorithm
        scores = nx.pagerank(nx_graph)
        
        # Sort sentences by score and select top sentences
        ranked_sentences = sorted(((scores[i], i, s) for i, s in enumerate(sentences)), reverse=True)
        
        # Calculate the number of sentences for the summary
        summary_size = max(1, int(len(sentences) * per))
        
        # Get top N sentences and sort them by original position
        top_sentences = sorted(ranked_sentences[:summary_size], key=lambda x: x[1])
        
        # Combine sentences into summary
        summary = " ".join([s.text for _, _, s in top_sentences])
        
        return summary
    
    def _cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity between two vectors"""
        # Handle zero vectors
        if np.all(vec1 == 0) or np.all(vec2 == 0):
            return 0
        
        # Calculate cosine similarity
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    
    def clear_fields(self):
        """Clear input and output text fields"""
        self.input_text.delete("1.0", tk.END)
        self.output_text.delete("1.0", tk.END)
        self.status_var.set("Ready")


In [None]:
if __name__ == "__main__":
    try:
        root = tk.Tk()
        app = TextSummarizerApp(root)
        root.mainloop()
    except Exception as e:
        messagebox.showerror("Error", f"Application error: {str(e)}") 

***USE this code***

In [2]:
class TextSummarizer:
    def __init__(self):
        # Initialize spaCy model
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Error: Please install the spaCy model by running:")
            print("python -m spacy download en_core_web_sm")
            raise
    
    def summarize(self, text, percent=0.3):
        """
        Generate a summary of the given text using TextRank algorithm
        
        Args:
            text (str): The text to summarize
            percent (float): Percentage of original text to include in summary (0.1 to 0.5)
            
        Returns:
            str: The generated summary
        """
        if not isinstance(text, str) or text.strip() == "":
            return "No text provided for summarization."
            
        # Make sure percent is between 0.1 and 0.5
        percent = max(0.1, min(0.5, percent))
        
        return self.textrank_summary(text, percent)
    
    def textrank_summary(self, text, per):
        """Generate text summary using TextRank algorithm"""
        doc = self.nlp(text)
        sentences = list(doc.sents)
        
        if len(sentences) <= 1:
            return text
        
        # Create sentence vectors using spaCy's word vectors
        sentence_vectors = []
        for sent in sentences:
            # Skip sentences with no words with vectors
            if not any(token.has_vector for token in sent):
                sent_vec = np.zeros((len(sent), 96))  # Default embedding dimension
            else:
                words_with_vectors = [token.vector for token in sent if token.has_vector]
                if not words_with_vectors:
                    sent_vec = np.zeros(96)  # Default dimension
                else:
                    sent_vec = np.mean(words_with_vectors, axis=0)
            sentence_vectors.append(sent_vec)
        
        # Create similarity matrix
        sim_mat = np.zeros([len(sentences), len(sentences)])
        
        # Fill the similarity matrix
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    # Make sure we don't divide by zero
                    if np.linalg.norm(sentence_vectors[i]) * np.linalg.norm(sentence_vectors[j]) == 0:
                        sim_mat[i][j] = 0
                    else:
                        sim_mat[i][j] = self._cosine_similarity(sentence_vectors[i], sentence_vectors[j])
        
        # Create networkx graph and add edges with weights
        nx_graph = nx.from_numpy_array(sim_mat)
        
        # Apply PageRank algorithm
        scores = nx.pagerank(nx_graph)
        
        # Sort sentences by score and select top sentences
        ranked_sentences = sorted(((scores[i], i, s) for i, s in enumerate(sentences)), reverse=True)
        
        # Calculate the number of sentences for the summary
        summary_size = max(1, int(len(sentences) * per))
        
        # Get top N sentences and sort them by original position
        top_sentences = sorted(ranked_sentences[:summary_size], key=lambda x: x[1])
        
        # Combine sentences into summary
        summary = " ".join([s.text for _, _, s in top_sentences])
        
        return summary
    
    def _cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity between two vectors"""
        # Handle zero vectors
        if np.all(vec1 == 0) or np.all(vec2 == 0):
            return 0
        
        # Calculate cosine similarity
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


In [3]:
# Example usage
if __name__ == "__main__":
    # Sample text for demonstration
    sample_text = """
    Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. 
    AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals.
    The term "artificial intelligence" had previously been used to describe machines that mimic and display "human" cognitive skills that are associated with the human mind, such as "learning" and "problem-solving". 
    This definition has since been rejected by major AI researchers who now describe AI in terms of rationality and acting rationally, which does not limit how intelligence can be articulated.
    AI applications include advanced web search engines, recommendation systems, understanding human speech, self-driving cars, automated decision-making and competing at the highest level in strategic game systems.
    As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. 
    For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology.
    """
    
    # Create a summarizer
    summarizer = TextSummarizer()
    
    # Generate summary at 30% length
    summary = summarizer.summarize(sample_text, 0.5)
    
    # Print results
    print("Original Text Length:", len(sample_text.split()), "words")
    print("Summary Length:", len(summary.split()), "words")
    print("\n--- SUMMARY ---\n")
    print(summary)

Original Text Length: 181 words
Summary Length: 74 words

--- SUMMARY ---

This definition has since been rejected by major AI researchers who now describe AI in terms of rationality and acting rationally, which does not limit how intelligence can be articulated.
     As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. 
     For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology.
    


In [None]:
from pypdf import PdfReader

In [6]:
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF file and return chunks with metadata."""
    reader = PdfReader(pdf_file)
    chunks = []
    pdf_name = pdf_file.name
    for page_num, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        if text.strip():
            chunks.append({
                "text": text,
                "metadata": {"pdf_file": pdf_name, "page_number": page_num}
            })
    return chunks

In [7]:
if __name__ == '__main__':
    st.set_page_config(page_title="Text Summarizer", page_icon=":material/Summarize:")    

    st.title("Text Summarizer")

    with st.sidebar():
        uploaded_file = st.file_uploader("Choose a file", accept_multiple_files=False, type=['pdf'])

        if uploaded_file:
             text = extract_text_from_pdf(uploaded_file)
             st.write(text)


StreamlitAPIException: The value `":material​/Summarize:"` is not a valid Material icon. Please use a Material icon shortcode like **`:material​/thumb_up:`**. 