In [1]:
import os
from dotenv import load_dotenv
from typing import Union
import requests # For URL validation

# Langchain Imports
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader, WebBaseLoader, PyPDFLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Load settings from a .env file into the environment; GROQ_API_KEY is read from there later.
load_dotenv()

True

In [3]:
# --- Configuration ---
# The API key comes from the environment so it never has to be written into the notebook.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_MODEL_NAME = "meta-llama/llama-4-scout-17b-16e-instruct"

# Fail fast with an actionable message when the key is missing.
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not found in environment variables. Please set it in your .env file.")

# Initialize the Groq LLM
try:
    groq_llm = ChatGroq(
        temperature=0,  # Lower temperature for more consistent summaries
        groq_api_key=GROQ_API_KEY,
        model_name=GROQ_MODEL_NAME,
    )
except Exception as e:
    print(f"Error initializing Groq LLM: {e}")
    print("Please ensure your GROQ_API_KEY is correct and the model name is valid.")
    # Re-raise rather than calling exit(): exit() is an interactive-shell helper that would
    # end the session (and report success in a script) instead of surfacing the real error.
    raise
else:
    print(f"Successfully initialized Groq LLM with model: {GROQ_MODEL_NAME}")

Successfully initialized Groq LLM with model: meta-llama/llama-4-scout-17b-16e-instruct


In [4]:
# --- Document Utilities ---
def split_text_into_docs(text: str, chunk_size: int = 4000, chunk_overlap: int = 200) -> list[Document]:
    """
    Break a long string into a list of Langchain Document objects.

    Args:
        text (str): The raw text to split.
        chunk_size (int): Chunk size handed to the splitter, measured with len().
        chunk_overlap (int): Overlap between consecutive chunks handed to the splitter.

    Returns:
        list[Document]: The documents created by RecursiveCharacterTextSplitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])

In [5]:
def load_docs_from_url(url: str) -> Union[list[Document], None]:
    """
    Loads content from a URL into a list of Langchain Document objects.
    Tries UnstructuredURLLoader first, falls back to WebBaseLoader.

    The fallback is attempted both when the first loader returns no usable text
    and when it raises (for example because an optional dependency is missing).

    Args:
        url (str): The address to fetch.

    Returns:
        list[Document] | None: The loaded documents, or None when the URL is
        unreachable or neither loader produced non-blank content.
    """
    print(f"Attempting to load content from URL: {url}")

    # Basic check for URL validity before handing the URL to the heavier loaders.
    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        if response.status_code in (405, 501):
            # The server does not implement HEAD; probe with a streamed GET so the
            # body is not downloaded just to read the status code.
            response.close()
            response = requests.get(url, allow_redirects=True, timeout=10, stream=True)
        try:
            response.raise_for_status()  # Raise an exception for bad status codes
        finally:
            response.close()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

    def _load_with(loader_name, build_loader):
        """Run one loader; return its documents only if the first one holds non-blank text."""
        try:
            docs = build_loader().load()
            if docs and docs[0].page_content.strip():
                return docs
        except Exception as e:
            # A failing loader must not prevent the next one from being tried.
            print(f"An unexpected error occurred while loading URL {url} with {loader_name}: {e}")
        return None

    # Try UnstructuredURLLoader first for richer parsing.
    docs = _load_with("UnstructuredURLLoader", lambda: UnstructuredURLLoader(urls=[url]))
    if docs:
        print(f"Content loaded successfully from {url} using UnstructuredURLLoader.")
        return docs

    print(f"UnstructuredURLLoader failed to get content or returned empty. Trying WebBaseLoader for {url}.")
    docs = _load_with("WebBaseLoader", lambda: WebBaseLoader(url))
    if docs:
        print(f"Content loaded successfully from {url} using WebBaseLoader.")
        return docs

    print(f"WebBaseLoader also failed to get content from {url}.")
    return None

In [6]:
def load_docs_from_pdf(file_path: str) -> Union[list[Document], None]:
    """
    Read a PDF file into Langchain Document objects via PyPDFLoader.

    Args:
        file_path (str): Path to the PDF on disk.

    Returns:
        list[Document] | None: The loader's documents (one per page, per the
        loader's contract), or None when the path does not exist, does not end
        in ".pdf", yields no documents, or loading raises.
    """
    print(f"Attempting to load content from PDF: {file_path}")

    # Validate the path up front so the loader is only invoked on plausible input.
    path_is_missing = not os.path.exists(file_path)
    if path_is_missing:
        print(f"Error: PDF file not found at '{file_path}'")
        return None

    has_pdf_suffix = file_path.lower().endswith(".pdf")
    if not has_pdf_suffix:
        print(f"Error: Provided file '{file_path}' is not a PDF.")
        return None

    try:
        pages = PyPDFLoader(file_path).load()
    except Exception as e:
        print(f"Error loading PDF file '{file_path}': {e}")
        return None
    else:
        if not pages:
            print(f"No content extracted from PDF: {file_path}. It might be empty or unreadable.")
            return None
        print(f"Successfully loaded {len(pages)} pages from PDF.")
        return pages

In [7]:
# --- Summarization Function (Stuff Chain, verbose by default) ---
# Default prompt, assembled piece by piece so that no source-code indentation leaks
# into the text that is sent to the model.
_DEFAULT_SUMMARY_PROMPT = (
    "You are an expert summarizer. Provide a concise and comprehensive summary of the following text. "
    "Focus on the main points and key information.\n"
    "\n"
    "Text:\n"
    '"{text}"\n'
    "\n"
    "Summary:"
)


def _join_page_contents(docs) -> Union[str, None]:
    """Merge the page_content of all documents into one string; None when there are no documents."""
    if not docs:
        return None
    return "\n\n".join(doc.page_content for doc in docs)


def summarize_text_stuff(
    input_content: str,
    llm: ChatGroq,
    prompt_template_str: Union[str, None] = None,
    input_type: str = "text",
    verbose: bool = True,
) -> str:
    """
    Summarizes text, URL, or PDF content using the Langchain 'stuff' chain with Groq.

    All content is merged into a single Document and sent in one call; nothing is
    chunked, so the input has to fit the model's context window.

    Args:
        input_content (str): The raw text, URL, or PDF file path to summarize.
        llm (ChatGroq): The initialized Groq LLM instance.
        prompt_template_str (str, optional): Custom prompt string; it is used with
            "text" as its input variable, so it should contain a {text} placeholder.
            When omitted, a built-in summarization prompt is used.
        input_type (str): 'text', 'url', or 'pdf' to indicate the type of input_content.
        verbose (bool): Forwarded to the chain; True (the default) prints the chain's
            progress, including the fully formatted prompt.

    Returns:
        str: The generated summary, or a human-readable message when the input type
        is invalid, the content could not be loaded or is blank, or the chain fails.
    """
    if input_type == "text":
        combined_content = input_content
    elif input_type == "url":
        combined_content = _join_page_contents(load_docs_from_url(input_content))
        if combined_content is None:
            return "Failed to load content from the provided URL."
    elif input_type == "pdf":
        # For the 'stuff' chain every page is merged into one large document.
        combined_content = _join_page_contents(load_docs_from_pdf(input_content))
        if combined_content is None:
            return "Failed to load content from the provided PDF."
    else:
        return "Invalid input_type specified. Must be 'text', 'url', or 'pdf'."

    # Reject missing or whitespace-only content before spending an LLM call on it.
    if not isinstance(combined_content, str) or not combined_content.strip():
        return "No valid content provided for summarization."

    docs_to_summarize = [Document(page_content=combined_content)]

    template = prompt_template_str if prompt_template_str else _DEFAULT_SUMMARY_PROMPT
    prompt = PromptTemplate(template=template, input_variables=["text"])

    print(f"Using summarization prompt:\n---\n{prompt.template}\n---")

    try:
        chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt, verbose=verbose)
        print("Invoking summarization chain...")
        summary = chain.invoke({"input_documents": docs_to_summarize})
        return summary["output_text"]
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return f"Error during summarization: {e}"

In [8]:
# --- Example Usage with User Input for Text, URL, or PDF ---
def _read_multiline_text() -> str:
    """Collect lines from input() until the first empty line and join them with newlines."""
    lines = []
    while True:
        line = input()
        if not line:
            break
        lines.append(line)
    return "\n".join(lines)


def _strip_wrapping_quotes(path: str) -> str:
    """Remove one pair of matching quotes around a path (as added by "Copy as path" on Windows)."""
    if len(path) >= 2 and path[0] == path[-1] and path[0] in "\"'":
        return path[1:-1].strip()
    return path


def _summarize_and_print(content: str, input_type: str) -> str:
    """Summarize content with the shared Groq LLM, print the framed result, and return it."""
    result = summarize_text_stuff(content, groq_llm, input_type=input_type)
    print("\n--- Generated Summary ---")
    print(result)
    print("-------------------------")
    return result


if __name__ == "__main__":
    print("\n--- Groq Text/URL/PDF Summarizer ---")
    print("This tool can summarize text, web articles, or PDF documents using Groq's LLMs.")

    try:
        while True:
            user_input_type = input("\nDo you want to summarize (T)ext, a (U)RL, or a (P)DF? (Type 'T', 'U', 'P', or 'Q' to quit): ").strip().upper()

            if user_input_type == 'Q':
                print("Exiting summarizer. Goodbye!")
                break
            elif user_input_type == 'T':
                print("\nPlease paste the text you want to summarize. Press Enter twice when done (empty line).")
                content_to_summarize = _read_multiline_text()

                if not content_to_summarize.strip():
                    print("No text provided. Please try again.")
                    continue

                print("\nSummarizing text...")
                summary = _summarize_and_print(content_to_summarize, "text")

            elif user_input_type == 'U':
                content_to_summarize = input("\nPlease enter the URL of the article you want to summarize: ").strip()

                if not content_to_summarize.startswith(("http://", "https://")):
                    print("Invalid URL format. Please ensure it starts with 'http://' or 'https://'.")
                    continue

                print(f"\nSummarizing URL: {content_to_summarize}...")
                summary = _summarize_and_print(content_to_summarize, "url")

            elif user_input_type == 'P':
                pdf_file_path = _strip_wrapping_quotes(
                    input("\nPlease enter the path to the PDF file you want to summarize: ").strip()
                )

                # Validate here as well so the user is re-prompted with a specific message.
                if not pdf_file_path:
                    print("No file path provided. Please try again.")
                    continue
                if not os.path.exists(pdf_file_path):
                    print(f"Error: File not found at '{pdf_file_path}'. Please check the path.")
                    continue
                if not pdf_file_path.lower().endswith(".pdf"):
                    print(f"Error: The file '{pdf_file_path}' does not appear to be a PDF.")
                    continue

                print(f"\nSummarizing PDF: {pdf_file_path}...")
                summary = _summarize_and_print(pdf_file_path, "pdf")

            else:
                print("Invalid choice. Please enter 'T' for Text, 'U' for URL, 'P' for PDF, or 'Q' to quit.")
    except (EOFError, KeyboardInterrupt):
        # input() raises EOFError when stdin is closed; an interrupt should also end
        # the session cleanly instead of leaving a traceback.
        print("\nExiting summarizer. Goodbye!")


--- Groq Text/URL/PDF Summarizer ---
This tool can summarize text, web articles, or PDF documents using Groq's LLMs.



Do you want to summarize (T)ext, a (U)RL, or a (P)DF? (Type 'T', 'U', 'P', or 'Q' to quit):  P

Please enter the path to the PDF file you want to summarize:  C:\Users\hp\Downloads\Seminar_Report.pdf



Summarizing PDF: C:\Users\hp\Downloads\Seminar_Report.pdf...
Attempting to load content from PDF: C:\Users\hp\Downloads\Seminar_Report.pdf
Successfully loaded 8 pages from PDF.
Using summarization prompt:
---
You are an expert summarizer. Provide a concise and comprehensive summary of the following text. Focus on the main points and key information.

            Text:
            "{text}"

            Summary:
---
Invoking summarization chain...


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are an expert summarizer. Provide a concise and comprehensive summary of the following text. Focus on the main points and key information.

            Text:
            "RESEARCH DOMAIN 
 
Introduction 
 
Data from different sources in geography, combined using deep learning, is driving major advances in 
remote sensing, urban planning, watching over the environment and disaster management. Using a 
variet


Do you want to summarize (T)ext, a (U)RL, or a (P)DF? (Type 'T', 'U', 'P', or 'Q' to quit):  q


Exiting summarizer. Goodbye!
