In [1]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Import LangChain document loaders
from langchain_community.document_loaders import UnstructuredURLLoader, PyPDFLoader
# UnstructuredWordDocumentLoader is not directly in community.document_loaders, it's typically part of unstructured,
# but can be used via UnstructuredFileLoader or directly if specifically installed.
# For simplicity, we'll use a more general approach or rely on unstructured's auto-detection.
# Or, if you need explicit UnstructuredWordDocumentLoader:
from langchain_community.document_loaders import UnstructuredFileLoader # This can handle docx, pdf, txt, etc. based on file extension

In [2]:
# Load environment variables from .env file
load_dotenv()

True

In [4]:
# --- LLM and Prompt Setup ---

# 1. Initialize your LLM with Groq
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("GROQ_API_KEY not found in environment variables. Please set it in your .env file or as an environment variable.")

llm = ChatGroq(model="meta-llama/llama-4-scout-17b-16e-instruct", temperature=0, api_key=groq_api_key)

In [5]:
# 2. Define your prompt template
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "From the following text, extract all important keywords and key phrases. "
            "Focus on domain-specific terminology, named entities, and core concepts. "
            "Format your response as a comma-separated list of phrases."
        ),
        ("human", "{text}"),
    ]
)

In [6]:
# 3. Create a keyword extraction chain
keyword_extraction_chain = prompt | llm | StrOutputParser()

# --- Input Handling Functions using LangChain Loaders ---

In [8]:
# --- Main Execution Flow ---

if __name__ == "__main__":
    print("Welcome to the Universal Keyword Extractor!")
    print("You can provide text directly, a file path (PDF, DOCX, TXT), or a URL.")
    user_input = input("Enter your text, file path, or URL: ").strip()

    # Load documents using LangChain loaders
    loaded_docs = load_documents_from_input(user_input)

    if not loaded_docs:
        print("No content extracted from the provided input. Exiting.")
    else:
        # Concatenate content of all loaded documents for splitting (or process them individually)
        # For simplicity, we'll join all page_content into one string if multiple documents are returned
        # This is common for initial processing before splitting into chunks.
        full_article_content = "\n\n".join([doc.page_content for doc in loaded_docs])

        # 4. Split the document into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len,
            is_separator_regex=False,
        )
        # Use the split_text method if you joined everything into one string
        # Or, if you want to split each document individually:
        # chunks = []
        # for doc in loaded_docs:
        #     chunks.extend(text_splitter.split_documents([doc]))
        
        # For now, let's stick to splitting the concatenated content as before
        chunks = text_splitter.create_documents([full_article_content])

        print(f"Split article into {len(chunks)} chunks.")

        all_extracted_keywords = set()

        for i, chunk in enumerate(chunks):
            print(f"\nProcessing Chunk {i+1}/{len(chunks)}")
            extracted_from_chunk = keyword_extraction_chain.invoke({"text": chunk.page_content})
            print(f"Keywords from Chunk {i+1}: {extracted_from_chunk}")

            for kw in extracted_from_chunk.split(','):
                all_extracted_keywords.add(kw.strip())

        print("\n--- All Extracted Key Phrases (Unique) ---")
        print(", ".join(sorted(list(all_extracted_keywords))))

Welcome to the Universal Keyword Extractor!
You can provide text directly, a file path (PDF, DOCX, TXT), or a URL.


Enter your text, file path, or URL:  C:\Users\hp\Downloads\Gireesh Resume.pdf


Detected PDF file: C:\Users\hp\Downloads\Gireesh Resume.pdf
Split article into 5 chunks.

Processing Chunk 1/5
Keywords from Chunk 1: Here are the important keywords and key phrases extracted from the text as a comma-separated list:

C/C++, Java, Python, SQL, AI, ML, DL, NN, CNN, NLP, LLM, RAG, Git, GitHub, API, PowerBI, transformer-based deepfake detection model, deepfake detection, data engineering, data collection, data processing, Timesformer Model, Hugging-face, roop-unleashed, face-swapping model, Pi-labs.ai, ML & Data Engineering Intern.

Let me know if you'd like me to help with anything else! 

Alternatively, I can also provide the list in a more categorized format:

**Programming Languages:** C/C++, Java, Python, SQL
**Technologies & Tools:** AI, ML, DL, NN, CNN, Git, GitHub, API, PowerBI, NLP, LLM, RAG
**Domain-specific Concepts:** deepfake detection, data engineering, transformer-based deepfake detection model
**Models & Frameworks:** Timesformer Model, Hugging-face, roop-u