In [1]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Import LangChain document loaders
from langchain_community.document_loaders import UnstructuredURLLoader, PyPDFLoader, UnstructuredFileLoader, WebBaseLoader

# Import Vector Store and Embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma # Using Chroma for persistent vector store

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Load environment variables from .env file
load_dotenv()

True

In [3]:
# --- LLM and Prompt Setup ---

# 1. Initialize your LLM with Groq
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("GROQ_API_KEY not found in environment variables. Please set it in your .env file or as an environment variable.")

llm = ChatGroq(model="meta-llama/llama-4-scout-17b-16e-instruct", temperature=0, api_key=groq_api_key)

In [4]:
# 2. Define your prompt template
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "From the following text, extract all important keywords and key phrases. "
            "Focus on domain-specific terminology, named entities, and core concepts. "
            "Format your response as a comma-separated list of phrases."
        ),
        ("human", "{text}"),
    ]
)

In [5]:
# 3. Create a keyword extraction chain
keyword_extraction_chain = prompt | llm | StrOutputParser()

# --- Embeddings Model Setup ---
# Choose an embeddings model (e.g., 'all-MiniLM-L6-v2' is good for general purpose)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# --- Input Handling Functions using LangChain Loaders ---

def load_documents_from_input(user_input: str) -> list[Document]:
    """
    Determines the input type (URL, file path, or plain text) and loads content
    into LangChain Document objects using appropriate loaders.
    """
    documents = []

    if user_input.startswith("http://") or user_input.startswith("https://"):
        print(f"Detected URL: {user_input}")
        try:
            loader = UnstructuredURLLoader(urls=[user_input])
            documents = loader.load()
            if not documents:
                print("UnstructuredURLLoader returned no documents. Trying WebBaseLoader...")
                loader = WebBaseLoader(user_input)
                documents = loader.load()
        except Exception as e:
            print(f"Error loading URL {user_input} with LangChain loaders: {e}")
            documents = []
    elif os.path.exists(user_input):
        if user_input.lower().endswith(".pdf"):
            print(f"Detected PDF file: {user_input}")
            try:
                loader = PyPDFLoader(user_input)
                documents = loader.load()
            except Exception as e:
                print(f"Error loading PDF {user_input}: {e}")
        elif user_input.lower().endswith((".docx", ".doc")):
            print(f"Detected Word document: {user_input}")
            try:
                loader = UnstructuredFileLoader(user_input)
                documents = loader.load()
            except Exception as e:
                print(f"Error loading Word document {user_input}: {e}")
        else: # Assume it's a plain text file or other file UnstructuredFileLoader can handle
            print(f"Detected file: {user_input}")
            try:
                loader = UnstructuredFileLoader(user_input)
                documents = loader.load()
            except Exception as e:
                print(f"Error loading file {user_input}: {e}")
    else:
        print("Detected plain text input.")
        documents = [Document(page_content=user_input)]

    return documents

In [7]:
# --- Main Execution Flow ---

if __name__ == "__main__":
    print("Welcome to the Universal Keyword Extractor!")
    print("You can provide text directly, a file path (PDF, DOCX, TXT), or a URL.")
    user_input = input("Enter your text, file path, or URL: ").strip()

    # Define a directory for Chroma persistence
    CHROMA_DB_DIR = "./chroma_db"
    if not os.path.exists(CHROMA_DB_DIR):
        os.makedirs(CHROMA_DB_DIR)

    # Load documents using LangChain loaders
    loaded_docs = load_documents_from_input(user_input)

    if not loaded_docs:
        print("No content extracted from the provided input. Exiting.")
    else:
        # Concatenate content of all loaded documents for splitting
        full_article_content = "\n\n".join([doc.page_content for doc in loaded_docs])

        # 4. Split the document into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len,
            is_separator_regex=False,
        )
        # Split the full content into Document chunks
        chunks = text_splitter.create_documents([full_article_content])

        print(f"Split article into {len(chunks)} chunks.")

        # --- Initialize and Populate Chroma Vector Store ---
        print(f"\nAttempting to load/create Chroma DB at '{CHROMA_DB_DIR}'...")
        try:
            # Check if a Chroma collection with chunks already exists for this content
            # (This is a simple check, for a real app you might hash content or use doc IDs)
            collection_name = "keyword_extraction_chunks" # A consistent name for your collection

            # Use Chroma.from_documents to create and persist, or load if it exists
            # Chroma will automatically use `get_or_create_collection` under the hood when `persist_directory` is set.
            vectorstore = Chroma.from_documents(
                chunks, # The documents to add/embed
                embeddings, # The embedding function to use
                collection_name=collection_name, # Name your collection
                persist_directory=CHROMA_DB_DIR # Directory to save the database
            )
            vectorstore.persist() # Explicitly persist after adding documents

            print(f"Chroma vector store '{collection_name}' loaded/created successfully.")
            print(f"Number of items in Chroma: {vectorstore._collection.count()}")


            # Example: You can now perform similarity search if needed (optional for keyword extraction)
            # query = "What are the main challenges of AI?"
            # similar_docs = vectorstore.similarity_search(query, k=2)
            # print(f"\nTop 2 similar chunks for query '{query}':")
            # for doc in similar_docs:
            #     print(f"- {doc.page_content[:200]}...")

        except Exception as e:
            print(f"Error creating/loading Chroma vector store: {e}")
            vectorstore = None # Ensure vectorstore is None if creation fails

        # --- Keyword Extraction from Chunks (unchanged) ---
        all_extracted_keywords = set()

        for i, chunk in enumerate(chunks):
            print(f"\nProcessing Chunk {i+1}/{len(chunks)}")
            extracted_from_chunk = keyword_extraction_chain.invoke({"text": chunk.page_content})
            print(f"Keywords from Chunk {i+1}: {extracted_from_chunk}")

            for kw in extracted_from_chunk.split(','):
                all_extracted_keywords.add(kw.strip())

        print("\n--- All Extracted Key Phrases (Unique) ---")
        print(", ".join(sorted(list(all_extracted_keywords))))

Welcome to the Universal Keyword Extractor!
You can provide text directly, a file path (PDF, DOCX, TXT), or a URL.


Enter your text, file path, or URL:  https://www.techtarget.com/searchenterpriseai/definition/generative-AI


Detected URL: https://www.techtarget.com/searchenterpriseai/definition/generative-AI
Split article into 59 chunks.

Attempting to load/create Chroma DB at './chroma_db'...


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
  vectorstore.persist() # Explicitly persist after adding documents


Chroma vector store 'keyword_extraction_chunks' loaded/created successfully.
Number of items in Chroma: 59

Processing Chunk 1/59
Keywords from Chunk 1: Here are the important keywords and key phrases extracted from the text as a comma-separated list:

Generative AI, GenAI, Artificial Intelligence, AI technologies, Enterprise AI, Tech Accelerator, Large data sets, Complex data, Meaningful clusters, New content creation, Text generation, Image generation, Audio generation, Query response, Prompt response, Enterprise technology, Business operation, Technology implications.

Let me know if you'd like me to help with anything else!

Processing Chunk 2/59
Keywords from Chunk 2: Here are the important keywords and key phrases extracted from the text as a comma-separated list:

Generative artificial intelligence, GenAI, sophisticated algorithms, large complex data sets, meaningful clusters, new content, text, images, audio, query, prompt, vector space, data points, correlations, dependencies,