In [1]:
pip install faiss-cpu langchain langchain-community langchain-groq transformers sentence-transformers wikipedia wikipedia-api pymupdf arxiv


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.

In [6]:
import os
import getpass
import json
import pymupdf
from langchain.chains import StuffDocumentsChain
from langchain.chat_models import init_chat_model
from langchain.document_loaders import WikipediaLoader, ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferMemory

# Groq credentials: reuse GROQ_API_KEY when it is already set (and non-empty),
# otherwise ask for it interactively without echoing, then export it.
api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Enter API key for Groq: ")
os.environ["GROQ_API_KEY"] = api_key

# Groq-hosted chat model shared by every function below.
# No key is passed here, so the provider presumably reads GROQ_API_KEY -- confirm.
llm = init_chat_model("llama3-8b-8192", model_provider="groq")

# Conversation memory stored under the "research_history" key, returned as messages
memory = ConversationBufferMemory(return_messages=True, memory_key="research_history")

# Research Tools: Wikipedia & Arxiv with size limits
def fetch_research_data(query, max_docs_per_source=2):
    """Collect background documents for ``query`` from Wikipedia and arXiv.

    Each source is queried independently on a best-effort basis: when a lookup
    raises, a warning is printed and the remaining source is still used.

    Args:
        query: Search string handed to both loaders.
        max_docs_per_source: Upper bound on the number of documents requested
            from each source. Defaults to 2.

    Returns:
        A ``(documents, sources)`` tuple. ``documents`` holds the loaded
        documents (Wikipedia first, then arXiv). ``sources`` holds one label
        per source that returned at least one document, e.g.
        ``"Wikipedia (<query>)"``.

    Raises:
        ValueError: If neither source produced any document.
    """
    # Loaders are built lazily so that constructor failures are handled by the
    # same try/except as load() failures.
    loader_factories = (
        ("Wikipedia", lambda: WikipediaLoader(query, load_max_docs=max_docs_per_source)),
        # NOTE(review): ArxivLoader's documented result cap is `load_max_docs`;
        # the previous `max_results` keyword is not a documented option, so the
        # limit was likely not applied. Confirm against the installed
        # langchain-community version.
        ("Arxiv", lambda: ArxivLoader(query, load_max_docs=max_docs_per_source)),
    )

    sources = []
    documents = []

    for source_name, make_loader in loader_factories:
        try:
            loaded_docs = make_loader().load()
        except Exception as e:
            print(f"Warning: {source_name} lookup failed: {e}")
            continue

        # Only credit a source that actually contributed content.
        if not loaded_docs:
            continue

        sources.append(f"{source_name} ({query})")
        documents.extend(loaded_docs)

    if not documents:
        raise ValueError("No research data found for the query.")

    return documents, sources

# Process documents in chunks to avoid token limits
def process_in_chunks(query, documents, chunk_size=2000, chunk_overlap=200):
    """Split ``documents`` into chunks and ask the LLM for key insights per chunk.

    Args:
        query: Research query included in every prompt.
        documents: Documents to split; each resulting chunk must expose
            ``page_content``.
        chunk_size: Maximum chunk length in characters (measured with ``len``).
        chunk_overlap: Requested overlap between consecutive chunks. It is
            capped at half of ``chunk_size`` so that a small ``chunk_size``
            cannot produce an overlap the splitter rejects.

    Returns:
        The stripped per-chunk responses joined by blank lines. A chunk whose
        processing raised contributes a ``[Chunk N processing error: ...]``
        placeholder instead, so one failure does not abort the whole run.
        Returns ``""`` when splitting yields no chunks.
    """
    # With the defaults (2000 / 200) the cap leaves the overlap untouched.
    effective_overlap = max(0, min(chunk_overlap, chunk_size // 2))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=effective_overlap,
        length_function=len
    )

    split_docs = text_splitter.split_documents(documents)
    total_chunks = len(split_docs)
    summaries = []

    for position, chunk in enumerate(split_docs, start=1):
        try:
            prompt = (
                f"Research Query: {query}\n\n"
                f"Analyze this chunk ({position}/{total_chunks}) and extract key insights:\n\n"
                f"{chunk.page_content}"
            )
            response = llm.invoke(prompt)
            summaries.append(response.content.strip())
        except Exception as e:
            # Best effort: record the failure in place and keep going.
            print(f"Error processing chunk {position}: {str(e)}")
            summaries.append(f"[Chunk {position} processing error: {str(e)}]")

    return "\n\n".join(summaries)

# Filtering Mechanism for Relevance
def filter_relevant_info(query, synthesized_text, max_chars=3000):
    """Ask the LLM to keep only the points of ``synthesized_text`` relevant to ``query``.

    Text of at most ``max_chars`` characters is filtered with a single prompt.
    Longer text is cut into consecutive ``max_chars``-sized sections that are
    filtered one by one and joined with blank lines.

    Args:
        query: Research query included in every prompt.
        synthesized_text: Text to filter.
        max_chars: Largest number of characters sent in one prompt.

    Returns:
        The filtered text. Blank or empty input yields ``""`` without calling
        the model. If the single-prompt call fails, an
        ``"Error filtering information: ..."`` string is returned. In the
        sectioned path a failing section is replaced by a
        ``[Section N filtering error: ...]`` placeholder so the sections that
        did succeed are kept.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    # Nothing to filter: avoid prompting the model on empty content.
    if not synthesized_text or not synthesized_text.strip():
        return ""

    if len(synthesized_text) <= max_chars:
        filter_prompt = (
            f"Research Query: {query}\n\n"
            f"Extract only the most relevant points:\n\n{synthesized_text}"
        )
        try:
            response = llm.invoke(filter_prompt)
            return response.content.strip()
        except Exception as e:
            return f"Error filtering information: {str(e)}"

    sections = [
        synthesized_text[start:start + max_chars]
        for start in range(0, len(synthesized_text), max_chars)
    ]
    filtered_sections = []

    for number, section in enumerate(sections, start=1):
        filter_prompt = (
            f"Research Query: {query}\n\n"
            f"Extract only the most relevant points from this section:\n\n{section}"
        )
        try:
            response = llm.invoke(filter_prompt)
            filtered_sections.append(response.content.strip())
        except Exception as e:
            # Isolate the failure so earlier and later sections are preserved.
            print(f"Error filtering section {number}: {str(e)}")
            filtered_sections.append(f"[Section {number} filtering error: {str(e)}]")

    return "\n\n".join(filtered_sections)

# Structured Report Generation
def generate_research_report(query):
    """Run the full research pipeline for ``query`` and return it as JSON text.

    The pipeline fetches documents, summarizes them chunk by chunk, then
    filters the summary for relevance.

    Returns:
        A JSON string indented by 4 spaces. On success it has the keys
        ``query``, ``sources``, ``summary`` and ``filtered_research``. On
        failure it has a single ``error`` key: the exception message for a
        ``ValueError``, or that message prefixed with ``"Unexpected error: "``
        for any other exception.
    """
    try:
        documents, sources = fetch_research_data(query)
        summary_text = process_in_chunks(query, documents)
        relevant_text = filter_relevant_info(query, summary_text)

        payload = {
            "query": query,
            "sources": sources,
            "summary": summary_text,
            "filtered_research": relevant_text
        }
        return json.dumps(payload, indent=4)
    except Exception as err:
        detail = str(err)
        # ValueError messages are reported verbatim; anything else is flagged
        # as unexpected.
        if not isinstance(err, ValueError):
            detail = "Unexpected error: " + detail
        return json.dumps({"error": detail}, indent=4)

# Example Usage
if __name__ == "__main__":
    # Ask for a topic, build the report for it, and print the resulting JSON
    print(generate_research_report(input("Enter a research topic: ")))

Enter a research topic: cloud
{
    "query": "cloud",
    "sources": [
        "Wikipedia (cloud)",
        "Arxiv (cloud)"
    ],
    "summary": "Here are the key insights extracted from the first chunk of research on \"cloud\":\n\n**Definition**: A cloud is an aerosol consisting of visible liquid droplets, frozen crystals, or particles suspended in the atmosphere of a planetary body or space.\n\n**Composition**: Water or other chemicals can compose the droplets and crystals.\n\n**Formation**: Clouds are formed on Earth due to:\n\n1. Saturation of air when cooled to its dew point.\n2. Gaining sufficient moisture from an adjacent source to raise the dew point to the ambient temperature.\n\n**Location**: Clouds are seen in the Earth's homosphere, which includes the troposphere, stratosphere, and mesosphere.\n\n**Science**: Nephology is the science of clouds, which is a branch of meteorology known as cloud physics.\n\n**Classification**: The World Meteorological Organization uses two met