## Auto Gen Tutorial
Note book written by John Adeojo
Founder, and Chief Data Scientist at [Data-centric Solutions](https://www.data-centric-solutions.com/)


In [76]:
import os
import yaml
import openai

script_dir = "C:/Users/johna/OneDrive/Documents/api_keys/"

# Mount Google Drive
# drive.mount('/content/drive')

def get_apikey(script_dir=script_dir):
    """
    Reads API key from a configuration file.

    This function opens a configuration file named "apikeys.yml", reads the API key for OpenAI

    Returns:
    api_key (str): The OpenAI API key.
    """
    # Update the script_dir path to point to the correct location in your Google Drive
    script_dir = script_dir
    file_path = os.path.join(script_dir, "apikeys.yml")

    with open(file_path, 'r') as yamlfile:
        loaded_yamlfile = yaml.safe_load(yamlfile)
        API_KEY = loaded_yamlfile['openai']['api_key']

    return API_KEY

# Call the function to get the API key
openai.api_key = get_apikey()

In [74]:
# Tool 1: Does a query based search for Wikipages
from typing import Any, List
import wikipedia
from llama_index import download_loader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import get_default_text_splitter
import openai
from pydantic import BaseModel
from llama_index.program import OpenAIPydanticProgram
from utils import get_apikey
from typing import Callable, Dict, Optional, Union, List, Tuple, Any
from llama_hub.wikipedia.base import WikipediaReader

def search_wikipedia(
        query: str, lang: str = "en", results_limit: int = 5, **load_kwargs: Any
    ) -> List[List]:
    import wikipedia
    wikipedia.set_lang(lang)
    return wikipedia.search(query, results=results_limit)

def create_wikidocs(query: str) -> list:  # Added type hint
    loader = WikipediaReader()  # Removed redefinition of WikipediaReader
    documents = loader.load_data(pages=query)
    return documents

def index_wikipedia_pages(wikipageslist: list) -> VectorStoreIndex:  # Added type hint
    documents = create_wikidocs(wikipageslist)
    text_splits = get_default_text_splitter(chunk_size=150, chunk_overlap=45)
    parser = SimpleNodeParser.from_defaults(text_splitter=text_splits)
    service_context = ServiceContext.from_defaults(node_parser=parser)
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index

In [77]:
import autogen 

config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-3.5-turbo-16k"],
    },
)
llm_config = {
    "functions": [
        {
            "name": "search_wikipedia",
            "description": "Use this to search for relevant Wikipedia pages",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "A query to search and identify relevant Wikipedia pages",
                    }
                },
                "required": ["query"],
            },
        },
        
        {
            "name": "index_wikipedia_pages",
            "description": "Use this to index wikipedia pages",
            "parameters": {
                "type": "object",
                "properties": {
                    "wikipageslist": {
                        "type": "list",
                        "description": "A list of wikipedia pages to index",
                    }
                },
                "required": ["wikipageslist"],
            },
        },
    ],
    "config_list": config_list,
    "request_timeout": 120,
}



In [102]:
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from llama_index import download_loader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import get_default_text_splitter
import openai
from llama_index.program import OpenAIPydanticProgram
from utils import get_apikey
from typing import Callable, Dict, Optional, Union, List, Tuple, Any
from llama_hub.wikipedia.base import WikipediaReader


class WikipediaRetrieverUserProxyAgent(RetrieveUserProxyAgent):
    def __init__(self, index, **kwargs):
        super().__init__(**kwargs)  # Fixed superclass constructor call
        # Set the OpenAI API key when the class is instantiated
        openai.api_key = get_apikey()
        self.index = index

    def retrieve_docs(
            self,
            query_texts: List[str],
            n_results: int = 10,
            search_string: str = "",
            **kwargs,
    ) -> Dict[str, Union[List[str], List[List[str]]]]:  # Added type hint
        query_engine = self.index.as_query_engine(
            response_mode="compact", verbose=True, similarity_top_k=n_results
        )
        nodes = query_engine.query(search_string)
        results = {
            "ids": [],
            "text": []
        }
        for node in nodes:
            results["ids"].append(node.node.id_)
            results["text"].append(node.node.text)  # Assuming the 'text' attribute exists in 'node.node'
        return results


In [104]:
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"),
    human_input_mode="NEVER",
    max_consecutive_auto_reply=2,
    code_execution_config={"work_dir": "coding"},
)

wikisearch_agent = autogen.AssistantAgent(
    name="wikisearch_agent",
    system_message="Search for relevant Wikipedia pages",
    llm_config=llm_config,
)

searchqa_agent = autogen.AssistantAgent(
    name="searchqa_agent",
    system_message="Inpsect the Wikipedia search results for relevance",
    llm_config=config_list,
)

retirve_agent = WikipediaRetrieverUserProxyAgent(
    name="retrieve_agent",
    system_message="Index and retrieve information from wikipages using the pages identified",
    llm_config=llm_config
)

user_proxy.register_function(
    function_map={
        "search_wikipedia": search_wikipedia,
        "index_wikipedia_pages":index_wikipedia_pages
    }
)

groupchat = autogen.GroupChat(agents=[user_proxy, wikisearch_agent, searchqa_agent, retirve_agent], messages=[], max_round=2)
manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)

user_proxy.initiate_chat(manager, message="Write a 500 word summary of the 2023 banking crisis")


[33muser_proxy[0m (to chat_manager):

Write a 500 word summary of the 2023 banking crisis

--------------------------------------------------------------------------------








[33mwikisearch_agent[0m (to chat_manager):

I'm sorry, as of now, there is no information available about a 2023 banking crisis as it is a future event and cannot be predicted. Nonetheless, I can provide you with information about past banking crises. Would that be helpful?

--------------------------------------------------------------------------------


In [70]:
# from llama_hub.wikipedia.base import WikipediaReader
# from llama_index.readers.schema.base import Document
# from typing import Any, List

# class WikipediaSearchReader(WikipediaReader):
#     def load_data(
#         self, query: str, lang: str = "en", results_limit: int = 5, **load_kwargs: Any
#     ) -> List[Document]:
#         import wikipedia

#         wikipedia.set_lang(lang)
#         search_results = wikipedia.search(query, results=results_limit)
#         # Now call the parent class's load_data method with the search results
#         return super().load_data(pages=search_results, lang=lang, **load_kwargs), search_results

# loader = WikipediaSearchReader()
# documents, search_results = loader.load_data(query="2023 Bank crisis")
# search_results

['2023 United States banking crisis',
 '2022–2023 Pakistani economic crisis',
 '2023 Nigerien crisis',
 '2007–2008 financial crisis',
 'First Republic Bank']

In [101]:
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from llama_index import download_loader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import get_default_text_splitter
import openai
from pydantic import BaseModel
from llama_index.program import OpenAIPydanticProgram
from utils import get_apikey
from typing import Callable, Dict, Optional, Union, List, Tuple, Any
from llama_hub.wikipedia.base import WikipediaReader


# define the data model in pydantic
class WikiPageList(BaseModel):
    "Data model for WikiPageList"
    pages: list

class WikipediaRetrieverUserProxyAgent(RetrieveUserProxyAgent):
    def __init__(self, **kwargs):
        super().__init__
        # Set the OpenAI API key when the class is instantiated
        openai.api_key = get_apikey()
        self.index = None

    def create_wikidocs(self, query):
        WikipediaReader = download_loader("WikipediaReader")
        loader = WikipediaReader()
        documents = loader.load_data(pages=query)
        return documents

    def create_index(self, documents):
        text_splits = get_default_text_splitter(chunk_size=150, chunk_overlap=45)
        parser = SimpleNodeParser.from_defaults(text_splitter=text_splits)
        service_context = ServiceContext.from_defaults(node_parser=parser)
        index = VectorStoreIndex.from_documents(documents, service_context=service_context)
        return index

    def retrieve_docs(self, query):
        wikipage_requests = self.wikipage_list(query)
        documents = self.create_wikidocs(wikipage_requests)
        self.index = self.create_index(documents)
        return self.query_vector_db(query)

    def query_vector_db(
            self,
            query_texts: List[str],
            n_results: int = 10,
            search_string: str = "",
            **kwargs,
    ) -> Dict[str, Union[List[str], List[List[str]]]]:
        query_engine = self.index.as_query_engine(
            response_mode="compact", verbose=True, similarity_top_k=n_results
        )
        nodes = query_engine.query(search_string)
        results = {
            "ids": [],
            "text": []
        }
        for node in nodes:
            results["ids"].append(node.node.id_)
            results["text"].append(node.node.text)  # Assuming the 'text' attribute exists in 'node.node'
        return results

In [None]:
import autogen

config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-3.5-turbo-16k"],
    },
)

user_proxy = WikipediaRetrieverUserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    is_termination_msg=lambda x: x.get("content", "").rstrip().endswith("TERMINATE"),
    # code_execution_config={
    #     "work_dir": "coding",
    #     "use_docker": False,  # set to True or image name like "python:3" to use docker
    # },
)

math_problem = "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation."
WikipediaRetrieverUserProxyAgent.initiate_chat(assistant, problem=math_problem)

In [4]:
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from llama_index import download_loader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import get_default_text_splitter
import openai
from pydantic import BaseModel
from llama_index.program import OpenAIPydanticProgram
from utils import get_apikey
from typing import Callable, Dict, Optional, Union, List, Tuple, Any

# define the data model in pydantic
class WikiPageList(BaseModel):
    "Data model for WikiPageList"
    pages: list

class WikipediaRetrieverUserProxyAgent(RetrieveUserProxyAgent):
    def __init__(self, wikipages, query):
        self.wikipages = wikipages
        index = self.create_index(query)
        
    def wikipage_list(self, query):
        openai.api_key = get_apikey()

        prompt_template_str = """
        Given the input {query}, 
        extract the Wikipedia pages mentioned after 
        "please index:" and return them as a list.
        If only one page is mentioned, return a single
        element list.
        """
        program = OpenAIPydanticProgram.from_defaults(
            output_cls=WikiPageList,
            prompt_template_str=prompt_template_str,
            verbose=True,
        )

        wikipage_requests = program(query=query)

        return wikipage_requests

    def create_wikidocs(self, wikipage_requests):
        WikipediaReader = download_loader("WikipediaReader")
        loader = WikipediaReader()
        documents = loader.load_data(pages=wikipage_requests)
        return documents

    def create_index(self, query):
        wikipage_requests = self.wikipage_list(query)
        documents = self.create_wikidocs(wikipage_requests)
        text_splits = get_default_text_splitter(chunk_size=150, chunk_overlap=45)
        parser = SimpleNodeParser.from_defaults(text_splitter=text_splits)
        service_context = ServiceContext.from_defaults(node_parser=parser)
        index = VectorStoreIndex.from_documents(documents, service_context=service_context)
        return index, text_splits

    def wikisearch_engine(self, index, n_results):
        query_engine = index.as_query_engine(
            response_mode="compact", verbose=True, similarity_top_k=n_results
        )
        
        return query_engine

    def query_vector_db(
                    self,
                    index,
                    query_texts: List[str],
                    n_results: int = 10,
                    search_string: str = "",
                    **kwargs,
                ) -> Dict[str, Union[List[str], List[List[str]]]]:
        
        query_engine = index.as_query_engine(
            response_mode="compact", verbose=True, similarity_top_k=n_results
        )

        nodes = query_engine.query(search_string)
        results = {
            "ids": [],
            "text": []
        }
    
        for node in nodes:
            results["ids"].append(node.node.id_)
            results["text"].append(node.node.text)  # Assuming the 'text' attribute exists in 'node.node'

            return results

In [57]:
from llama_hub.tools.wikipedia import WikipediaToolSpec
from llama_index.agent import OpenAIAgent

tool_spec = WikipediaToolSpec()

agent = OpenAIAgent.from_tools(tool_spec.to_tool_list())

agent.chat('What is the population of London?')

InvalidRequestError: [] is too short - 'messages'

In [62]:
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from llama_index import download_loader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import get_default_text_splitter
import openai
from pydantic import BaseModel
from llama_index.program import OpenAIPydanticProgram
from utils import get_apikey
from typing import Callable, Dict, Optional, Union, List, Tuple, Any

# def wikipage_list(query):
#     openai.api_key = get_apikey()

#     prompt_template_str = """
#     Given the input {query}, 
#     extract the Wikipedia pages mentioned after 
#     "please index:" and return them as a list.
#     If only one page is mentioned, return a single
#     element list.
#     """
#     program = OpenAIPydanticProgram.from_defaults(
#         output_cls=WikiPageList,
#         prompt_template_str=prompt_template_str,
#         verbose=True,
#     )

#     wikipage_requests = program(query=query)

#     return wikipage_requests

def create_wikidocs(wikipage_requests):
    WikipediaReader = download_loader("WikipediaReader")
    loader = WikipediaReader()
    documents = loader.load_data(pages=wikipage_requests)
    return documents

def create_index():
    openai.api_key = get_apikey()
    wikipage_requests = "Anne Arundel County"
    documents = create_wikidocs(wikipage_requests)
    text_splits = get_default_text_splitter(chunk_size=150, chunk_overlap=45)
    parser = SimpleNodeParser.from_defaults(text_splitter=text_splits)
    service_context = ServiceContext.from_defaults(node_parser=parser)
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index, documents

def wikisearch_engine(index, n_results):
    query_engine = index.as_query_engine(
        response_mode="compact", verbose=True, similarity_top_k=n_results
    )
    
    return query_engine

# def query_vector_db(
#                 index,
#                 query_texts: List[str],
#                 n_results: int = 10,
#                 search_string: str = "",
#                 # **kwargs,
#             ) -> Dict[str, Union[List[str], List[List[str]]]]:
    
#     query_engine = index.as_query_engine(
#         response_mode="compact", verbose=True, similarity_top_k=n_results
#     )

#     nodes = query_engine.query(search_string)
#     results = {
#         "ids": [],
#         "text": []
#     }
    
#     for node in nodes:
#         results["ids"].append(node.node.id_)
#         results["text"].append(node.node.text)  # Assuming the 'text' attribute exists in 'node.node'

#         return results

In [63]:
index, text_splits = create_index()
# test = wikisearch_engine(index, n_results=10)
# nodes = test.query("Population of London").source_nodes

PageError: Page id " " does not match any pages. Try another id!

In [33]:
text_splits

[Document(id_='880a0169-bc00-4784-980c-e3e38f801e29', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0f44c3c18e41cc2ae75560c5e660256f7d1b96b0b04a2205de2babd736a21e85', text='L, or l, is the twelfth letter in the Latin alphabet, used in the modern English alphabet, the alphabets of other western European languages and others worldwide. Its name in English is el (pronounced ), plural els.\n\n\n== History ==\nLamedh may have come from a pictogram of an ox goad or cattle prod. Some have suggested a shepherd\'s staff.\n\n\n== Use in writing systems ==\n\n\n=== Phonetic and phonemic transcription ===\nIn phonetic and phonemic transcription, the International Phonetic Alphabet uses ⟨l⟩ to represent the lateral alveolar approximant.\n\n\n=== English ===\nIn English orthography, ⟨l⟩ usually represents the phoneme , which can have several sound values, depending on the speaker\'s accent, and whether it occurs before or after a

In [19]:
test = len(nodes)

results = {
"ids": [],
"text": []
}

for node in nodes:
        results["ids"].append(node.node.id_)
        results["text"].append(node.node.text)  # Assuming the 'text' attribute exists in 'node.node'

results

{'ids': ['6572ce9a-ed09-4212-a7f8-f62beefcada8',
  'a8a9b2fc-31b0-4c01-bc17-28a2383296e5',
  '326d6ef2-c088-40e1-b922-81e35719be0a',
  '1abb9136-7a17-49d7-90a6-d0849ec4eaf1',
  '38ab9574-f6dc-443b-93c5-602bf0649a2f',
  'a01c6194-486c-4b46-bcbc-729e2e700c6e',
  'bdee30c5-f62a-4f27-a348-ef3b871ffc79',
  'c94ca1d3-9028-482b-bf8b-e01de949c071',
  '97a87c35-8278-4546-af59-72ec55667d6a',
  '27ff9081-4af1-41c7-add4-f1894bd7e0e1'],
 'text': ["London's continuous urban area extends beyond Greater London and numbered 9,787,426 people in 2011, while its wider metropolitan area had a population of 12–14 million, depending on the definition used. According to Eurostat, London is the second most populous metropolitan area in Europe. A net 726,000 immigrants arrived there in the period 1991–2001.The region covers 610 square miles (1,579 km2), giving a population density of 13,410 inhabitants per square mile (5,177/km2) more than ten times that of any other British region. In population terms, London 

In [None]:
from pydantic import BaseModel
from llama_index import OpenAIPydanticProgram

# define the data model in pydantic
class WikiPageList(BaseModel):
    "Data model for WikiPageList"
    pages: list


def wikipage_list(query):
    openai.api_key = get_apikey()

    prompt_template_str = """
    Given the input {query}, 
    extract the Wikipedia pages mentioned after 
    "please index:" and return them as a list.
    If only one page is mentioned, return a single
    element list.
    """
    program = OpenAIPydanticProgram.from_defaults(
        output_cls=WikiPageList,
        prompt_template_str=prompt_template_str,
        verbose=True,
    )

    wikipage_requests = program(query=query)

    return wikipage_requests


def create_wikidocs(wikipage_requests):
    WikipediaReader = download_loader("WikipediaReader")
    loader = WikipediaReader()
    documents = loader.load_data(pages=wikipage_requests)
    return documents


def create_index(query):
    global index
    wikipage_requests = wikipage_list(query)
    documents = create_wikidocs(wikipage_requests)
    text_splits = get_default_text_splitter(chunk_size=150, chunk_overlap=45)
    parser = SimpleNodeParser.from_defaults(text_splitter=text_splits)
    service_context = ServiceContext.from_defaults(node_parser=parser)
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)

    return index


def wikisearch_engine(index):
    query_engine = index.as_query_engine(
        response_mode="compact", verbose=True, similarity_top_k=10
    )
    return query_engine


def create_react_agent(MODEL):
    query_engine_tools = [
        QueryEngineTool(
            query_engine=wikisearch_engine(index),
            metadata=ToolMetadata(
                name="Wikipedia Search",
                description="Useful for performing searches on the wikipedia knowledgebase",
            ),
        )
    ]

In [6]:
from llama_index import download_loader

WikipediaReader = download_loader("WikipediaReader")

loader = WikipediaReader()
documents = loader.load_data(pages=['Berlin'])
documents

[Document(id_='2de8a71f-9acc-487b-8fee-11ba932fe66f', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='266bfbce0e5a7588d75610b6e40dca0c6d3566232f92ad41ccb3b261f05dbd16', text='Berlin ( bur-LIN, German: [bɛʁˈliːn] ) is the capital and largest city of Germany by both area and population. Its more than 3.85 million inhabitants make it the European Union\'s most populous city, according to population within city limits. One of Germany\'s sixteen constituent states, Berlin is surrounded by the State of Brandenburg and contiguous with Potsdam, Brandenburg\'s capital. Berlin\'s urban area, which has a population of around 4.5 million, is the most populous urban area in Germany. The Berlin-Brandenburg capital region has around 6.2 million inhabitants and is Germany\'s second-largest metropolitan region after the Rhine-Ruhr region.Berlin straddles the banks of the Spree, which flows into the Havel (a tributary of the Elbe) in t

## 1. Set Up Configurations

In [5]:
import autogen

config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model":["gpt-3.5-turbo-16k"]
    },
)

