In [1]:
import pandas as pd 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime

In [50]:
def _childText(node, tag, className, missingMessage):
  '''
  Return the stripped text of the first `tag` with CSS class `className`
  found under `node`.
  args:
    node: BeautifulSoup element to search in
    tag: tag name to look for
    className: CSS class the tag must carry
    missingMessage: message printed when no matching tag exists
  returns:
    str with the stripped text, or None when the tag is absent
  '''
  element = node.find(tag, class_=className)
  if element is None:
    print(missingMessage)
    return None
  return element.text.strip()


def _parseResult(div):
  '''
  Convert one 'browse2-result-content' div into a record dict.
  Every field that cannot be located is stored as None.
  args:
    div: BeautifulSoup element for a single search result
  returns:
    dict with keys title, link, dataSetCode, dataSetType, description, updateDate
  '''
  title = None
  link = None
  dataSetCode = None
  
  # get Header Details
  # find() returns None (it does not raise) when the header is missing
  header = div.find('h2', class_='browse2-result-name')
  if header is None:
    print("Could Not Find Header")
  else:
    title = header.text.strip()
    anchor = header.find('a')
    if anchor is not None:
      link = anchor.get("href")
    if link is not None:
      # the dataset code is the last path segment of the link
      dataSetCode = link.split('/')[-1].strip()
  
  return {
    "title": title,
    "link": link,
    "dataSetCode": dataSetCode,
    "dataSetType": _childText(
      div, 'div', 'browse2-result-type', "Could Not Determine DataSet Type"),
    "description": _childText(
      div, 'div', 'browse2-result-description', "Could Not Find Description"),
    "updateDate": _childText(
      div, 'div', 'browse2-result-timestamp-value', "Could Not Find Update Date"),
  }


def extractData():
  '''
  Data Extraction function to extract CDC Chronic Diseases data.
  Opens the browse page in Chrome, parses every result on the page and
  follows the "nextLink" element until it can no longer be found.
  returns:
    df: pandas DataFrame with all the records (columns: title, link,
        dataSetCode, dataSetType, description, updateDate)
  '''
  # imported here so the cell stays self-contained
  from selenium.common.exceptions import TimeoutException, WebDriverException
  
  # URL to Scan
  url = "https://data.cdc.gov/browse?category=Chronic+Diseases"
  
  # Chrome driver
  driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
  
  data = []
  try:
    # load the URL
    driver.get(url)
    
    print("------------Starting Page Scanning------------")
    
    count = 1
    while True:
      print(f"Scanning Page {count}")
      # Get page source and parse it with BeautifulSoup; the parser is named
      # explicitly so results do not depend on which parsers are installed
      soup = BeautifulSoup(driver.page_source, "html.parser")
      
      for div in soup.find_all('div', class_='browse2-result-content'):
        data.append(_parseResult(div))
      
      try:
        next_button = WebDriverWait(driver, 10).until(
          EC.presence_of_element_located((By.CLASS_NAME, "nextLink"))
        )
        next_button.click()
      except TimeoutException:
        # no next link appeared within 10 seconds: this was the last page
        print("All Pages Viewed")
        break
      except WebDriverException as error:
        # best effort: keep what was collected, but do not claim completion
        print(f"Stopped after page {count}: could not open next page ({error})")
        break
      count += 1
    
    print("------------All Pages Scanned------------")
  finally:
    # quit() ends the whole browser session even when scanning failed
    driver.quit()
  
  # dataFrame to return
  df = pd.DataFrame(data)
  
  return df

In [51]:
# Run the scraper; `data` is the DataFrame of all scanned records used by the cells below
data = extractData()

------------Starting Page Scanning------------
Scanning Page 1
Scanning Page 2
Scanning Page 3
Scanning Page 4
Scanning Page 5
Scanning Page 6
Scanning Page 7
Scanning Page 8
Scanning Page 9
Scanning Page 10
Scanning Page 11
Scanning Page 12
Scanning Page 13
Scanning Page 14
Scanning Page 15
Scanning Page 16
Scanning Page 17
Scanning Page 18
Scanning Page 19
Scanning Page 20
Scanning Page 21
Scanning Page 22
Scanning Page 23
Scanning Page 24
All Pages Viewed
------------All Pages Scanned------------


In [8]:
def formatDate(date):
  '''
  Convert a date written like "July 27, 2023" into "27/07/2023".
  args:
    date: str in "%B %d, %Y" format, or a missing value (None / NaN)
  returns:
    str in "%d/%m/%Y" format, or None when the input is missing
  raises:
    ValueError: if the text does not match the "%B %d, %Y" format
  '''
  # the scraper stores None when no timestamp was found on the page
  if pd.isna(date):
    return None
  date_object = datetime.strptime(date.strip(), "%B %d, %Y")
  return date_object.strftime("%d/%m/%Y")

In [10]:
# Rewrite every update date via formatDate (the column is overwritten in place,
# so this cell is meant to be run once per scrape)
data["updateDate"] = data["updateDate"].apply(formatDate)

In [11]:
# Preview the records after the date conversion
data

Unnamed: 0,title,link,dataSetCode,dataSetType,description,updateDate
0,DASH YRBSS - HS Cigar Use (by total grade/sex/...,https://data.cdc.gov/Youth-Risk-Behaviors/DASH...,bedg-mmpy,Filtered View,2015-2017. High School Dataset – Including Sex...,27/07/2023
1,Graph of Tobacco Use 3 Months Before and Last ...,https://data.cdc.gov/Maternal-Child-Health/Gra...,mbvg-apdj,Chart,2011. Centers for Disease Control and Prevent...,27/07/2023
2,"CDC Nutrition, Physical Activity, and Obesity ...",https://data.cdc.gov/Nutrition-Physical-Activi...,nxst-x9p4,Dataset,This dataset contains policy data for 50 US st...,25/08/2023
3,"Nutrition, Physical Activity, and Obesity - Be...",https://data.cdc.gov/Nutrition-Physical-Activi...,hn4x-zwk7,Dataset,"This dataset includes data on adult's diet, ph...",07/12/2023
4,Map of HS Cigar Use,https://data.cdc.gov/Youth-Risk-Behaviors/Map-...,a5cb-a2ww,Map,2015-2017. High School Dataset – Including Sex...,27/07/2023
...,...,...,...,...,...,...
235,"PLACES: ZCTA Data (GIS Friendly Format), 2021 ...",https://data.cdc.gov/500-Cities-Places/PLACES-...,9xb7-9z99,Dataset,This dataset contains model-based ZIP Code Tab...,25/08/2023
236,Stroke Mortality Data Among US Adults (35+) by...,https://data.cdc.gov/Heart-Disease-Stroke-Prev...,ua33-yiiu,Dataset,"2017 to 2019, 3-year average. Rates are age-st...",24/08/2023
237,"PLACES: Local Data for Better Health, ZCTA Dat...",https://data.cdc.gov/500-Cities-Places/PLACES-...,fbbf-hgkc,Dataset,This dataset contains model-based ZIP Code tab...,25/08/2023
238,"PLACES: Place Data (GIS Friendly Format), 2021...",https://data.cdc.gov/500-Cities-Places/PLACES-...,cj8b-94cj,Dataset,This dataset contains model-based place (incor...,25/08/2023


In [12]:
# Persist the records without the index; note the file is tab-delimited
# despite the .csv extension, so readers must pass sep="\t" as well
data.to_csv("../data/cdc_data.csv", sep="\t", index=False)