## This Notebook will demonstrate different configurations and executions of LLM-Analyst

### Environment Variables

* For proper execution, one or more of the environment variables below may be required.
* The examples in this notebook require `OPENAI_API_KEY`.

```bash
export OPENAI_API_KEY=""
export TAVILY_API_KEY=""
export SERPER_API_KEY=""
export SERP_API_KEY=""
export HUGGINGFACEHUB_API_TOKEN=""
export LANGCHAIN_API_KEY=""
export GROQ_API_KEY=""
export GOOGLE_CX_KEY=""
export GOOGLE_API_KEY=""
export BING_API_KEY=""
export NCBI_API_KEY=""
export ORCID_ACCESS_TOKEN=""
export ORCID_REFRESH_TOKEN=""
export PYPI_API_TOKEN=""
export DOCKERHUB_API_TOKEN=""
```


In [2]:
# System level imports
import sys
import os

# Set USER_AGENT in the environment to fix this warning from langchain_community code:
# WARNING:langchain_community.utils.user_agent:USER_AGENT
user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0")
os.environ['USER_AGENT'] = user_agent

# PATH TO WHERE THE CODE WAS "GIT CLONED"
# Export LLM_ANALYST_BASE_DIR to point at your own clone so the notebook runs
# on any machine; the original author's location is kept as the fallback.
llm_analyst_base_dir = os.environ.get('LLM_ANALYST_BASE_DIR',
                                      '/Users/dan/Code/LLM/llm_analyst')

# Put the clone first on sys.path so the `llm_analyst` package imports resolve to it.
sys.path.insert(0, llm_analyst_base_dir)


In [3]:
# Let's import llm_analyst content in one cell to make the rest of the code cleaner.
from llm_analyst.core.config import Config, DataSource
from llm_analyst.core.research_analyst import LLMAnalyst
from llm_analyst.core.research_editor import LLMEditor
from llm_analyst.core.research_publisher import LLMPublisher
from llm_analyst.core.config import Config


DEBUG:pyvirtualdisplay:version=3.0


### Demonstrate running LLM-Analyst on Local Data

In [5]:
## A simple research report against a set of local documents (Published Papers)
## needs three inputs:
##   1. An active research topic
##   2. A defined path to the local data to research against
##   3. The data source to use (LOCAL_STORE, WEB, SELECT_URLS)

# Input 1: the question the research should answer.
research_topic = "I would like to better understand how the metabolism of S-adenosylmethionine is linked to lipid metabolism and stress-responsive gene expression."

# Input 2: start with the search/LLM settings, then add the directories that
# are derived from the clone location.
config_params = {
    "internet_search": "ddg_search",
    "llm_provider": "openai",
    "llm_model": "gpt-4o-2024-05-13",
}
config_params["local_store_dir"] = f"{llm_analyst_base_dir}/tests/resources/tst_documents"
config_params["report_out_dir"] = f"{llm_analyst_base_dir}/notebooks/data"

# Apply the overrides to a new Config and print the resulting settings.
config = Config()
config.set_values_for_config(config_params)
print(config)


internet_search=<function ddg_search at 0x10cb15260>
embedding_provider=client=<openai.resources.embeddings.Embeddings object at 0x168be2c10> async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x168d39550> model='text-embedding-ada-002' dimensions=None deployment='text-embedding-ada-002' openai_api_version='' openai_api_base=None openai_api_type='' openai_proxy='' embedding_ctx_length=8191 openai_api_key=SecretStr('**********') openai_organization=None allowed_special=None disallowed_special=None chunk_size=1000 max_retries=2 request_timeout=None headers=None tiktoken_enabled=True tiktoken_model_name=None show_progress_bar=False model_kwargs={} skip_empty=False default_headers=None default_query=None retry_min_seconds=4 retry_max_seconds=20 http_client=None http_async_client=None check_embedding_ctx_length=True
llm_provider=<class 'llm_analyst.chat_models.openai.OPENAI_Model'>
llm_model=gpt-4o-2024-05-13
llm_token_limit=4000
llm_temperature=0.25
browse_chunk_max_length

In [None]:
# Now that we has set things up lets get down to conducting the research!
# To execute preliminary research we use the LLMAnalyst 
# Request the analysts conducts research and then writes a report

llm_analyst = LLMAnalyst(active_research_topic = research_topic, 
                         data_source = DataSource.LOCAL_STORE, 
                         config = config)

await llm_analyst.conduct_research()
research_state = await llm_analyst.write_report()


# Once the report is written we can ask the LLMPublisher to make a pdf
llm_publisher = LLMPublisher(**research_state.dump(), config = config)
published_research_path = await llm_publisher.publish_to_pdf_file()
published_research_path

In [None]:
## Let's build on the first research project and now build a "detailed report"
## A "detailed report" requires over site therefore we will use an Editor v.s. an Analyst
## The key difference between an Editor and the Analyst is that
## the Editor will coordinate the efforts of multiple Analysts and 
## will utilize a specialized Report Writer to pull the final report together

## Inputs are the same as the Research Analyst Report above

## But we will explicitly define the publish directory.
## FYI this is the default so not actually required.
config_params["report_out_dir"] = "~/llm_analyst_out"

llm_editor = LLMEditor(active_research_topic = research_topic, 
                       data_source = DataSource.LOCAL_STORE,
                       config = config)

research_state = await llm_editor.create_detailed_report()

llm_publisher = LLMPublisher(**research_state.dump(), config = config)
published_research_path = await llm_publisher.publish_to_pdf_file()


### Demonstrate Running LLM-Analyst on Web Scraped Data

In [None]:
## A simple research report against the internet.
## The only required input is an active research topic.

# Input 1: the question the research should answer.
research_topic = "How does DAF-19 regulate transcription of regeneration associated genes?"

# Search and LLM settings only; no local paths are set for this run.
config_params = {
    "internet_search": "ddg_search",
    "llm_provider": "openai",
    "llm_model": "gpt-4o-2024-05-13",
}

# Apply the settings to a new Config.
config = Config()
config.set_values_for_config(config_params)

In [None]:
# To execute some preliminary research we use the LLMAnalyst 

llm_analyst = LLMAnalyst(active_research_topic = research_topic, 
                         config = config)

await llm_analyst.conduct_research()
research_state = await llm_analyst.write_report()


# Once the report is written we can ask the LLMPublisher to mak a pdf
llm_publisher = LLMPublisher(**research_state.dump(), config = config)
published_research_path = await llm_publisher.publish_to_pdf_file()

In [None]:
## Let's build on the first research project and now build a "detailed report"
## A "detailed report" requires oversite therefore we will use an Editor v.s. an Analyst
## The key difference between an Editor and the Analyst is that
## the Editor will coordinate the efforts of multiple Analysts and 
## will utilize a specilized Report Writer to pull the final report together

## Inputs are the same as the Research Analyst Report above

## But we will explicitly define the publish directory.
## FYI this is the default so not actually required.
config_params["report_out_dir"] = "~/llm_analyst_out"

llm_editor = LLMEditor(active_research_topic = research_topic, 
                       data_source = DataSource.LOCAL_STORE,
                       config = config)

research_state = await llm_editor.create_detailed_report()

llm_publisher = LLMPublisher(**research_state.dump(), config = config)
published_research_path = await llm_publisher.publish_to_pdf_file()