In [8]:
# !pip install dspy langchain langchain_community
# !pip install rank_bm25

In [1]:
from src.database import build_dcids_database, load_database
build_dcids_database()
dcid_collection = load_database()

In [1]:
from src.get_place_dcids import place_dcid

In [2]:
import os
from langchain.schema import Document
import ast

docs = []
for stat_files in os.listdir("src/STATS"):
    stat_file_name = ".".join(stat_files.split("_"))
    with open(os.path.join("src/STATS",stat_files), "r") as f:
        content = f.read()
    content = ast.literal_eval(content)
    for stat in content:
        docs.append(Document(page_content=stat['node_name'],metadata={'dcid': stat['node_dcid'],'link': stat['node_link'],'data_source':stat_file_name}))

In [36]:
import dspy
from langchain_community.retrievers import BM25Retriever
from dotenv import load_dotenv,find_dotenv
import concurrent.futures
import datacommons_pandas as dc

load_dotenv(find_dotenv(),override=True)

class PlaceKeywordSignature(dspy.Signature):
    """Returns the places that the question is talking about separated by semicolon (;) and also only the noun keywords relevant to the question in a list
    Make sure that you are only outputing the noun keywords and not other things"""
    question = dspy.InputField(prefix="Question: ",desc="Question asked by the user")
    places = dspy.OutputField(prefix="Places: ",desc="places like countries, states, towns, etc mentioned in the question separated by semicolon (;)")
    keywords = dspy.OutputField(prefix="Keywords: ",desc="noun keywords relevant to the question in a list. DON'T include the place names and be precise")

class SelectDCIDSignature(dspy.Signature):
    """Based on the dcid and their descriptions, select the dcid(s) that are most relevant to the question. Return the relevant dcids separated by semicolon (;)
    Don't output anything else, just output the relevant dcid(s). You have to output only from the given dcids, don't output any other dcids"""
    dcids_list = dspy.InputField(prefix="DCID and Description List: ",desc="DCIDs and its corresponding description")
    relevant_dcids = dspy.OutputField(prefix="Relevant DCIDs: ",desc="relevant dcids only separated by semicolon (;)")

llm = dspy.OpenAI(model="gpt-3.5-turbo")
dspy.settings.configure(lm=llm)

class DataCommonsDSPy(dspy.Module):
    def __init__(self):
        self.bm25_retriever = BM25Retriever.from_documents(
                docs, k=5, preprocess_func=(lambda x: x.lower())
            )
        self.place_keyword_llm = dspy.ChainOfThought(PlaceKeywordSignature)
        self.relevant_dcid_llm = dspy.ChainOfThought(SelectDCIDSignature)
    def __call__(self,question,**kwargs):
        return self.forward(question,**kwargs)

    def forward(self,question:str):
        llm_answer = self.place_keyword_llm(question=question)
        print(llm_answer)
        places = llm_answer.places.split(";")
        places = [pl.strip() for pl in places]
        keywords = llm_answer.keywords.split(",")

        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            results = executor.map(place_dcid,places) 
        place_dcids = []
        for res in results:
            place_dcids.append(res)
        print(place_dcids)
        bm25_docs = self.bm25_retriever.invoke(question.lower())
        for key in keywords:
            curr_keyword_docs = self.bm25_retriever.invoke(key.lower())
            bm25_docs.extend(curr_keyword_docs)
        dcid_desc_str = ""
        for doc in bm25_docs:
            curr_dcid = doc.metadata['dcid']
            curr_desc = doc.metadata['data_source'] + doc.page_content
            dcid_desc_str += f"{curr_dcid}: {curr_desc}\n\n"
        relevant_dcid_result = self.relevant_dcid_llm(dcids_list=dcid_desc_str)
        relevant_dcid_list = relevant_dcid_result.relevant_dcids.split(";")
        relevant_dcid_list = [rdl.strip() for rdl in relevant_dcid_list]
        print(relevant_dcid_list)
        result_df = dc.build_multivariate_dataframe(place_dcids,relevant_dcid_list)
        return result_df

In [37]:
all_dcids_agent = DataCommonsDSPy()

In [27]:
all_dcids_agent(question="What is the number of patients recovered in COVID-19 from United States and Qatar?")

Prediction(
    rationale='produce the keywords. We need to identify the number of patients recovered in COVID-19 from the United States and Qatar.',
    places='United States; Qatar',
    keywords='number, patients, recovered, COVID-19'
)
['country/USA', 'country/QAT']


Unnamed: 0_level_0,InterestRate_TreasurySecurity_1Month_ConstantMaturity,Count_MedicalConditionIncident_COVID_19_PatientInICU
place,Unnamed: 1_level_1,Unnamed: 2_level_1
country/USA,5.51,1602


In [30]:
dcid = ['InterestRate_TreasurySecurity_1Month_ConstantMaturity', 'Count_MedicalConditionIncident_COVID_19_PatientInICU']
place_dcids = ['country/USA', 'country/QAT']

dc.build_multivariate_dataframe(place_dcids,dcid)

Unnamed: 0_level_0,Count_MedicalConditionIncident_COVID_19_PatientInICU,InterestRate_TreasurySecurity_1Month_ConstantMaturity
place,Unnamed: 1_level_1,Unnamed: 2_level_1
country/USA,1602,5.51


In [38]:
all_dcids_agent(question="What is the Nonfinancial Commercial Paper Interest Rate in USA?")

Prediction(
    rationale='produce the keywords. We are looking for the interest rate of nonfinancial commercial paper in the USA.',
    places='USA',
    keywords='nonfinancial commercial paper, interest rate'
)
['country/USA']
['InterestRate_FinancialInstrument_30Day_NonfinancialCommercialPaperAA', 'InterestRate_FinancialInstrument_90Day_NonfinancialCommercialPaperAA', 'InterestRate_FinancialInstrument_60Day_NonfinancialCommercialPaperAA', 'InterestRate_FinancialInstrument_1Year_PrimaryCredit', 'InterestRate_TreasurySecurity_1Year_ConstantMaturity', '']


Unnamed: 0_level_0,InterestRate_FinancialInstrument_30Day_NonfinancialCommercialPaperAA,InterestRate_FinancialInstrument_90Day_NonfinancialCommercialPaperAA,InterestRate_FinancialInstrument_60Day_NonfinancialCommercialPaperAA,InterestRate_FinancialInstrument_1Year_PrimaryCredit,InterestRate_TreasurySecurity_1Year_ConstantMaturity
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
country/USA,5.32,5.33,5.33,5.5,5.02


In [39]:
all_dcids_agent(question="What is the annual consumption of Lignite coal in India?")

Prediction(
    rationale='produce the keywords. We are looking for the annual consumption of Lignite coal in India.',
    places='India',
    keywords='annual consumption, Lignite coal'
)
['geoId/1836003']
['Count_Person_NotHispanicOrLatino_FoodInSecure_White_AsAFractionOf_Count_Person_NotHispanicOrLatino_White']


ValueError: No data for any of specified Places and StatisticalVariables.

## EXTRACT STATISTICAL VARIABLES

In [10]:
import requests
from bs4 import BeautifulSoup

stats_page = requests.get("https://datacommons.org/browser/memberList")
stats_soup = BeautifulSoup(stats_page.content,'lxml')

In [11]:
table_pages = stats_soup.find_all(class_="table-page-section")

In [13]:
table_pages

[]