In [None]:
!pip install smolagents -q

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably needed so the InferenceClientModel used in later
# cells can reach hosted inference with your token - confirm for your setup.
notebook_login()

In [None]:
!pip install duckduckgo_search

In [None]:
from smolagents import CodeAgent, DuckDuckGoSearchTool, InferenceClientModel

# Language model that drives the agent (library defaults, no arguments)
model = InferenceClientModel()

# DuckDuckGo web search, exposed to the agent as a callable tool
search_tool = DuckDuckGoSearchTool()

# Assemble the agent from the model and its single tool
agent = CodeAgent(tools=[search_tool], model=model)

# Example usage: send one request and print whatever the agent returns
response = agent.run(
    "Search for phd ideas for a Computational Linguistics with a master degree"
)
print(response)

The agent follows this process:

Analyzes the Request: The agent identifies the key elements of the request: PhD topics suitable for a computational linguistics student with a master's degree.

Performs Retrieval: The agent leverages DuckDuckGo to search for the most relevant and up-to-date information, ensuring it aligns with the user's request.

Synthesizes Information: After gathering the results, the agent processes them into a cohesive, actionable plan for us, covering all aspects of the topics.

Stores for Future Reference: The agent stores the retrieved information for easy access when exploring PhD topics in the future, optimizing efficiency in subsequent tasks.

# Custom Knowledge base tool


A custom knowledge base is very important for specialized tasks.

We will create a tool that queries a vector database of technical documentation or specialized knowledge.

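We will use retrieval-based search so that the agent can find the information most relevant to its needs. (The example below ranks documents with BM25, a keyword-based method; a true semantic search would use an embedding-based retriever instead.)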

This approach combines predefined knowledge with semantic search to provide context-aware suggestions for choosing a PhD topic.

In [None]:
# Install the dependencies first and run!
!pip install langchain-community rank_bm25

In [None]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from smolagents import Tool
from langchain_community.retrievers import BM25Retriever
from smolagents import CodeAgent, InferenceClientModel

class PhdIdeasRetrieverTool(Tool):
  """Agent tool that looks up PhD ideas in a local document collection.

  The documents passed to the constructor are indexed once with a
  ``BM25Retriever``; each call to ``forward`` runs the query against that
  index and returns the matching ideas formatted as a single string.
  """

  name = "phd_ideas_retriever"
  # NOTE(review): the description below advertises "semantic search", but the
  # retriever built in __init__ is BM25 (keyword-based ranking). The string is
  # read by the agent's model at runtime, so it is left unchanged here.
  description = "Uses semantic search to retrieve relevant phd ideas for a computational linguist student"
  inputs = {
      "query": {
          "type": "string",
          "description": "The query to perform. This should be a query related to phd ideas",
      }
  }
  output_type = "string"

  def __init__(self, docs, k=10, **kwargs):
    """Build the BM25 index over ``docs``.

    Args:
      docs: Documents to index; passed straight to
        ``BM25Retriever.from_documents``.
      k: Number of documents the retriever returns per query. Defaults to
        10, the value that was previously hard-coded.
      **kwargs: Forwarded unchanged to ``Tool.__init__``.
    """
    super().__init__(**kwargs)
    self.retriever = BM25Retriever.from_documents(docs, k=k)

  def forward(self, query: str) -> str:
    """Return the ideas retrieved for ``query`` as one formatted string.

    Args:
      query: Free-text search query.

    Returns:
      A header line followed by one "===== Idea <n> =====" section per
      retrieved document, numbered from 0 in retrieval order.

    Raises:
      TypeError: If ``query`` is not a string.
    """
    # Explicit raise rather than assert: asserts are stripped under `python -O`.
    if not isinstance(query, str):
      raise TypeError("Your search query must be a string")

    docs = self.retriever.invoke(query)
    return "\nRetrieved ideas:\n" + "".join(
        f"\n\n===== Idea {i} =====\n{doc.page_content}"
        for i, doc in enumerate(docs)
    )

# Simulated knowledge base: each entry pairs an idea ("text") with the
# research area it belongs to ("source"). Written as (text, source) pairs and
# expanded into dicts so the two keys are spelled only once.
phd_ideas = [
    {"text": idea_text, "source": research_area}
    for idea_text, research_area in (
        (
            "Develop a multilingual transformer model optimized for low-resource languages, with emphasis on transfer learning and cross-lingual embeddings.",
            "Multilingual NLP",
        ),
        (
            "Investigate bias propagation in large language models and propose debiasing techniques using adversarial training and data augmentation.",
            "Ethics in NLP",
        ),
        (
            "Create a discourse-aware summarization system that uses rhetorical structure theory to improve coherence in generated summaries.",
            "Text Summarization",
        ),
        (
            "Design a computational framework to detect and model code-switching in bilingual corpora using sequence tagging and syntactic cues.",
            "Sociolinguistics & NLP",
        ),
        (
            "Explore the use of graph neural networks for semantic role labeling by encoding predicate-argument structures in dependency graphs.",
            "Semantic Parsing",
        ),
        (
            "Build an explainable sentiment analysis model that identifies linguistic markers and provides interpretable explanations for predictions.",
            "Sentiment Analysis & Explainability",
        ),
        (
            "Analyze the evolution of meaning in language using diachronic word embeddings trained on historical corpora across centuries.",
            "Computational Historical Linguistics",
        ),
        (
            "Study the interaction between prosody and syntax in spoken corpora using multimodal transformers that integrate audio and text.",
            "Speech & Prosody",
        ),
        (
            "Propose a benchmarking framework for evaluating robustness of NLP models against linguistic perturbations like syntactic reordering or lexical substitution.",
            "NLP Evaluation",
        ),
        (
            "Implement a human-in-the-loop annotation pipeline using active learning to reduce labeling effort for complex NLP tasks.",
            "Annotation & Data Collection",
        ),
    )
]

# Wrap each idea in a Document, keeping its research area as metadata
source_docs = [
    Document(page_content=idea["text"], metadata={"source": idea["source"]})
    for idea in phd_ideas
]

# Chunk the documents before indexing them. Separators are tried from
# coarsest (paragraph break) to finest (single character).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    strip_whitespace=True,
    add_start_index=True,
)
docs_processed = text_splitter.split_documents(source_docs)

# Index the chunks inside the custom retriever tool
phd_ideas_retriever = PhdIdeasRetrieverTool(docs_processed)

# Give a fresh agent the retriever as its only tool
agent = CodeAgent(model=InferenceClientModel(), tools=[phd_ideas_retriever])

# Example usage: ask for ideas and print the agent's answer
response = agent.run(
    "Find ideas for a phd ideas, including Linguistis, Computer Science, Artificial Intelligence, Machine Learning."
)

print(response)



This enhanced agent can:

1. First check the documentation for relevant information
2. Combine insights from the knowledge base
3. Maintain conversation context in memory