In [4]:
!pip install datacommons_pandas

Collecting datacommons_pandas
  Downloading datacommons_pandas-0.0.3-py3-none-any.whl.metadata (2.3 kB)
Downloading datacommons_pandas-0.0.3-py3-none-any.whl (45 kB)
   ---------------------------------------- 0.0/45.8 kB ? eta -:--:--
   ---------------------------------------- 45.8/45.8 kB ? eta 0:00:00
Installing collected packages: datacommons_pandas
Successfully installed datacommons_pandas-0.0.3




In [4]:
import datacommons_pandas as dc

In [6]:
dc.get_stat_all(places=['country/USA'],stat_vars=['Percent_Person_Obesity'])

{'country/USA': {'Percent_Person_Obesity': {'sourceSeries': [{'val': {'2021': 33,
      '2020': 32,
      '2018': 30.9},
     'measurementMethod': 'AgeAdjustedPrevalence',
     'observationPeriod': 'P1Y',
     'importName': 'CDC500',
     'provenanceDomain': 'cdc.gov',
     'provenanceUrl': 'https://www.cdc.gov/places/index.html'},
    {'val': {'2018': 30.9, '2020': 31.9, '2021': 33},
     'measurementMethod': 'CrudePrevalence',
     'observationPeriod': 'P1Y',
     'importName': 'CDC500',
     'provenanceDomain': 'cdc.gov',
     'provenanceUrl': 'https://www.cdc.gov/places/index.html'}]}}}

In [6]:
city_dcids = dc.get_property_values(["CDC500_City"], "member", limit=500)["CDC500_City"]
city_dcids[:5]

['geoId/0107000',
 'geoId/0135896',
 'geoId/0137000',
 'geoId/0150000',
 'geoId/0151000']

In [7]:
data = dc.build_multivariate_dataframe(city_dcids,
                                       ["Percent_Person_Obesity", # Prevalence of obesity from CDC
                                        "Percent_Person_WithHighBloodPressure", # Prevalence of high blood pressure from CDC
                                        "UnemploymentRate_Person", # Unemployment rate from BLS
                                        "Count_Person_BelowPovertyLevelInThePast12Months", # Persons living below the poverty line from Census
                                        "Count_Person", # Total population from Census
                                       ]
                                      )
# Display the first five rows.
data.head(5)

Unnamed: 0_level_0,Percent_Person_Obesity,Percent_Person_WithHighBloodPressure,UnemploymentRate_Person,Count_Person_BelowPovertyLevelInThePast12Months,Count_Person
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
geoId/0107000,44.9,45.6,3.3,49921,200431
geoId/0135896,32.5,33.0,2.0,4965,91995
geoId/0137000,43.7,37.9,2.4,28809,215025
geoId/0150000,41.3,40.6,3.7,37379,186316
geoId/0151000,37.9,40.7,2.9,40625,199819


In [8]:
dc.build_multivariate_dataframe(["country/USA", "geoId/1714000", "geoId/06085"],["CumulativeCount_MedicalConditionIncident_COVID_19_ConfirmedOrProbableCase"])

Unnamed: 0_level_0,CumulativeCount_MedicalConditionIncident_COVID_19_ConfirmedOrProbableCase
place,Unnamed: 1_level_1
country/USA,103910034
geoId/06085,342015


## COLLECTION THE STATISTICAL VALUES

## AIM FOR TODAY

1. Extract all the statistical variables
2. Find the dcids of the places
3. Agentic Build up

## EXTRACT ALL THE STATISTICAL VARIABLES

In [16]:
import requests
from bs4 import BeautifulSoup
stats_vars_page = requests.get("https://datacommons.org/tools/statvar")
stats_vars_soup = BeautifulSoup(stats_vars_page.content,'html.parser')

In [18]:
stats_vars_soup.find(class_="dataset-selector-label")

In [34]:
# write data_source_outer_html to a file and read it
with open('data_source_html.txt','r') as f:
    data_source_outer_html = f.read()
data_source_html = BeautifulSoup(data_source_outer_html,'lxml')

In [70]:
data_sources_dict = {}
for opt in data_source_html.find(class_="dataset-selector-custom-input custom-select").find_all('option'):
    opt_str = str(opt)
    opt_str = opt_str[opt_str.find("value=")+6:opt_str.find(">")][1:-1]
    data_source_link = "https://datacommons.org/tools/statvar#s="+"%2F".join(opt_str.split("/"))
    data_sources_dict.update({opt.text:data_source_link})

In [73]:
data_sources_dict['Brazil INPE - National Institute for Space Research']

'https://datacommons.org/tools/statvar#s=dc%2Fs%2FBrazilInpe-NationalInstituteForSpaceResearch'

In [160]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from webdriver_manager.chrome import ChromeDriverManager

# Set up the WebDriver (you can use any driver like ChromeDriver, GeckoDriver, etc.)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Define a function to recursively click elements with the class name 'title'
def recursively_click_titles(driver):
    # Using a set to keep track of clicked elements to avoid clicking the same element twice
    clicked_titles = set()

    def click_title_elements():
        # Find all elements with the class 'title'
        left_scroll = False
        elements = driver.find_elements(By.CLASS_NAME, 'title')
        for element in elements:
            # if element.tag_name !='span':
            #     continue
            # Check if the element has already been clicked to avoid re-clicking
            if element not in clicked_titles:
                try:
                    # Scroll into view and click the element
                    # ActionChains(driver).move_to_element(element).perform()
                    element.click()
                    
                    
                    # Add the element to the set of clicked titles
                    clicked_titles.add(element)
                    
                    # Wait for a moment to allow any dynamic content to load
                    actions = ActionChains(driver)

                    # Move the cursor away (e.g., 100 pixels to the right and down from the element)
                    if left_scroll:
                        actions.move_to_element_with_offset(element, 20, 20).perform()
                        left_scroll = False
                    else:
                        actions.move_to_element_with_offset(element, -20, -20).perform()
                        left_scroll = True
                    # Recursively call the function to handle new elements
                    click_title_elements()
                except Exception as e:
                    print(f"Error clicking element: {e}")
                    actions.move_to_element_with_offset(element,30 , -20).perform()
                    # Move the cursor away (e.g., 100 pixels to the right and down from the element)
                    continue
    
    # Start the recursive clicking process
    click_title_elements()


# Navigate to the webpage
# driver.get(data_sources_dict['Brazil INPE - National Institute for Space Research'])
driver.get(data_sources_dict['Google'])
# driver.get(data_sources_dict['India National Sample Survey'])
driver.maximize_window()
# Wait for the page to fully load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "title"))
)
recursively_click_titles(driver)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
base_stats_link = "https://datacommons.org/browser/"
node_titles = soup.find_all(class_="node-title")
node_label_list = []
for curr_node_title in node_titles:
    node_label = curr_node_title.find('label')
    if node_label is not None:
        node_label_str = str(node_label)
        node_label_str = node_label_str[node_label_str.find("for=")+4:node_label_str.find("/")][1:-2]
        node_label_list.append({"node_name":node_label.text,"node_dcid":node_label_str,"node_link":base_stats_link+node_label_str})

Error clicking element: Message: element click intercepted: Element <span class="title">...</span> is not clickable at point (164, 282). Other element would receive the click: <div id="tree-widget-tooltip" style="visibility: visible; left: 121px; top: 177px;">...</div>
  (Session info: chrome=125.0.6422.142)
Stacktrace:
	GetHandleVerifier [0x003CB8E3+45827]
	(No symbol) [0x0035DCC4]
	(No symbol) [0x0025150F]
	(No symbol) [0x00298052]
	(No symbol) [0x002963D8]
	(No symbol) [0x0029425B]
	(No symbol) [0x00293823]
	(No symbol) [0x002885EF]
	(No symbol) [0x002B2DFC]
	(No symbol) [0x00288075]
	(No symbol) [0x002B3094]
	(No symbol) [0x002CC034]
	(No symbol) [0x002B2B96]
	(No symbol) [0x00286998]
	(No symbol) [0x0028751D]
	GetHandleVerifier [0x00684513+2899763]
	GetHandleVerifier [0x006D793D+3240797]
	GetHandleVerifier [0x004513B4+593364]
	GetHandleVerifier [0x004582DC+621820]
	(No symbol) [0x003670A4]
	(No symbol) [0x003637A8]
	(No symbol) [0x00363947]
	(No symbol) [0x003559FE]
	BaseThreadIni

StaleElementReferenceException: Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=125.0.6422.142)
Stacktrace:
	GetHandleVerifier [0x003CB8E3+45827]
	(No symbol) [0x0035DCC4]
	(No symbol) [0x0025150F]
	(No symbol) [0x0025660F]
	(No symbol) [0x00258298]
	(No symbol) [0x00258310]
	(No symbol) [0x002977AA]
	(No symbol) [0x00296D25]
	(No symbol) [0x002D35B4]
	(No symbol) [0x002B2DFC]
	(No symbol) [0x002CC034]
	(No symbol) [0x002B2B96]
	(No symbol) [0x00286998]
	(No symbol) [0x0028751D]
	GetHandleVerifier [0x00684513+2899763]
	GetHandleVerifier [0x006D793D+3240797]
	GetHandleVerifier [0x004513B4+593364]
	GetHandleVerifier [0x004582DC+621820]
	(No symbol) [0x003670A4]
	(No symbol) [0x003637A8]
	(No symbol) [0x00363947]
	(No symbol) [0x003559FE]
	BaseThreadInitThunk [0x760A7BA9+25]
	RtlInitializeExceptionChain [0x7764BE3B+107]
	RtlClearBits [0x7764BDBF+191]


In [161]:
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
base_stats_link = "https://datacommons.org/browser/"
node_titles = soup.find_all(class_="node-title")
node_label_list = []
for curr_node_title in node_titles:
    node_label = curr_node_title.find('label')
    if node_label is not None:
        node_label_str = str(node_label)
        node_label_str = node_label_str[node_label_str.find("for=")+4:node_label_str.find("/")][1:-2]
        node_label_list.append({"node_name":node_label.text,"node_dcid":node_label_str,"node_link":base_stats_link+node_label_str})

In [162]:
len(node_label_list)

6

In [166]:
import json
with open('STATS/Google.json','w', encoding='utf-8') as f:
    json.dump(node_label_list, f, indent=4,ensure_ascii=True)

## GET PLACE DCID

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from webdriver_manager.chrome import ChromeDriverManager

place_url = "https://datacommons.org/place"
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get(place_url)
driver.maximize_window()

In [5]:
place_autocomplete = driver.find_element(value='place-autocomplete')

In [7]:
place_autocomplete.send_keys("United States")

In [8]:
from selenium.webdriver.common.keys import Keys

place_autocomplete.send_keys(Keys.ARROW_DOWN)
# Then send the Enter key to choose the selected option
place_autocomplete.send_keys(Keys.ENTER)


In [23]:
url = driver.current_url

In [24]:
url

'https://datacommons.org/place/country/USA'

In [25]:
# url = "https://datacommons.org/place/geoId/1805860"

# Split the URL by "/" and get the last part
# extracted_part = url.split('/')[-1]

# Join the last two parts to get 'geoId/1805860'
extracted_full = '/'.join(url.split('/')[-2:])

print(extracted_full)  # Output: geoId/1805860


country/USA


In [1]:
from src.get_place_dcids import place_dcid

place_dcid("United States")

'country/USA'

In [2]:
place_dcid("Bloomington Indiana")

'geoId/1805860'

In [3]:
place_dcid("bloomington Illinois")

'geoId/1706613'