In [1]:
from bs4 import BeautifulSoup
# import pandas as pd
import csv
import os

In [2]:
# Web Scraped Data: 
# [KVUE Allergy Calendar](https://kvue.com/allergy)
# Scrape Monthly Allergy Calendar data as PDFs 
# save each to html
# loop through html files to make a list 

files = os.listdir('resources/html_files/cal_v2')
url = 'resources/html_files/cal_v2'

file_list = []
for f in files:
    file_list.append(url + "/" + f)  
file_list

['resources/html_files/cal_v2/09_sep.html',
 'resources/html_files/cal_v2/04_apr.html',
 'resources/html_files/cal_v2/08_aug.html',
 'resources/html_files/cal_v2/10_oct.html',
 'resources/html_files/cal_v2/07_jul.html',
 'resources/html_files/cal_v2/01_jan.html',
 'resources/html_files/cal_v2/06_jun.html',
 'resources/html_files/cal_v2/05_may.html',
 'resources/html_files/cal_v2/03_mar.html',
 'resources/html_files/cal_v2/02_feb.html']

In [3]:
# create list to put all the data into
raw_data = []

In [4]:
# loop through each html file from KVUE to scrap
for file in file_list:
    print(f'---- Processing file {file} -----')
    
    # scrape file
    soup = BeautifulSoup(open(file), "lxml")
    
    # get month & date
    month_date = soup.find('p', class_='s2').text
    # print(month_date)
    
    # get all the days (which are td's in the table)
    days = soup.find('table').find_all('td')
    
    #  loop through all the days and gather info for the month
    counter = 0
    calc_day = 0
    
    for day in days:
        try:
            data = day.find_all('p')

            # check to make sure we have two data elements. if two elements, we know it's actual day
            # even if the day element is empty/missing
            if len(data) > 1:
                # need a date calculator as bad html doesn't always produce a day number in the td
                calc_day +=1

                day = data[0].text
                allergens = data[1].text

                print(f'Counter {counter} Day: {day} Calc Day: {calc_day} Allergens: {allergens}')
                raw_data_dict = {}
                raw_data_dict["MonthYear"] = month_date
                raw_data_dict["Day"] = calc_day
                raw_data_dict["Allergens"] = allergens
                raw_data.append(raw_data_dict)
                
            else:
                # does not have two data elements. see if the day element has a value
                day = data[0].text
                if day != '':
                    # check to see if we already have this day in the dicitonary. 
                    # if so, skip it. just bad html formatting.
                    # if not, let's add it cuz it means no allergen data found for that day
                    if not any(data['MonthYear'] == month_date and data['Day'] == int(day) for data in raw_data):
                        print(f'Missing Allergen. Counter {counter} Day: {day}')
                        raw_data_dict = {}
                        raw_data_dict["MonthYear"] = month_date
                        raw_data_dict["Day"] = int(day)
                        raw_data_dict["Allergens"] = ''
                        raw_data.append(raw_data_dict)  

                        calc_day +=1
                    
        except:
            print(f'Issue for counter {counter}')
        finally:
            counter += 1

---- Processing file resources/html_files/cal_v2/09_sep.html -----
Counter 0 Day:  Calc Day: 1 Allergens: Mold Medium.
Counter 2 Day: 2 Calc Day: 2 Allergens: Mold Medium.
Counter 3 Day:  Calc Day: 3 Allergens: Molds High with Alternaria.
Counter 5 Day:  Calc Day: 4 Allergens: Mold Medium
Counter 7 Day:  Calc Day: 5 Allergens: Fall Elm Low 7 gr/m3, Grass Medium 20 gr/m3, and Mold Medium with Alternaria
Counter 9 Day:  Calc Day: 6 Allergens: Mold Medium.
Counter 11 Day:  Calc Day: 7 Allergens: Mold Medium.
Counter 13 Day:  Calc Day: 8 Allergens: Grass Medium 13 gr/m3, and Mold Medium with Alternaria
Counter 15 Day: 9 Calc Day: 9 Allergens: Ragweed medium 13 gr/m3, Mold Medium with alternaria
Counter 16 Day:  Calc Day: 10 Allergens: Ragweed Low 7 gr/m3, Mold Medium
Counter 18 Day:  Calc Day: 11 Allergens: Ragweed Low 7 gr/m3, Mold Medium
Counter 20 Day: 12 Calc Day: 12 Allergens: Ragweed Medium 27 gr/m3, Mold Medium with Alternaria.
Counter 21 Day: 13 Calc Day: 13 Allergens: Ragweed Medi

In [5]:
# export data to csv
keys = raw_data[0].keys()
# print(keys)

with open('resources/csv_files/kvue_monthly_allergens.csv', 'a') as output_file:
    dict_writer = csv.DictWriter(output_file, restval="-", fieldnames=keys, delimiter=',')
    dict_writer.writeheader()
    dict_writer.writerows(raw_data)