In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os
import numpy as np
from datetime import datetime

# Scraping KVUE Html Files

In [2]:
# Web Scraped Data: 
# [KVUE Allergy Calendar](https://kvue.com/allergy)
# Scrape Monthly Allergy Calendar data as PDFs 
# save each to html
# loop through html files to make a list 

files = os.listdir('resources/html_files/cal_v2')
url = 'resources/html_files/cal_v2'

file_list = []
for f in files:
    file_list.append(url + "/" + f)  
file_list

['resources/html_files/cal_v2/09_sep.html',
 'resources/html_files/cal_v2/04_apr.html',
 'resources/html_files/cal_v2/08_aug.html',
 'resources/html_files/cal_v2/10_oct.html',
 'resources/html_files/cal_v2/07_jul.html',
 'resources/html_files/cal_v2/01_jan.html',
 'resources/html_files/cal_v2/06_jun.html',
 'resources/html_files/cal_v2/05_may.html',
 'resources/html_files/cal_v2/03_mar.html',
 'resources/html_files/cal_v2/02_feb.html']

In [3]:
# create list to put all the data into
raw_data = []

In [4]:
# loop through each html file from KVUE to scrap
for file in file_list:
    print(f'---- Processing file {file} -----')
    
    # scrape file
    soup = BeautifulSoup(open(file), "lxml")
    
    # get month & date
    month_date = soup.find('p', class_='s2').text
    month = month_date.split(' ')[0]
    year = month_date.split(' ')[1]
    
    # print(month_date)
    
    # get all the days (which are td's in the table)
    days = soup.find('table').find_all('td')
    
    #  loop through all the days and gather info for the month
    counter = 0
    calc_day = 0
    
    for day in days:
        try:
            data = day.find_all('p')

            # check to make sure we have two data elements. if two elements, we know it's actual day
            # even if the day element is empty/missing
            if len(data) > 1:
                # need a date calculator as bad html doesn't always produce a day number in the td
                calc_day +=1

                day = data[0].text
                allergens = data[1].text

                print(f'Counter {counter} Day: {day} Calc Day: {calc_day} Allergens: {allergens}')
                raw_data_dict = {}
                raw_data_dict["MonthYear"] = month_date
                raw_data_dict["Month"] = month
                raw_data_dict["Year"] = year
                raw_data_dict["Day"] = calc_day
                raw_data_dict["Allergens"] = allergens
                raw_data.append(raw_data_dict)
                
            else:
                # does not have two data elements. see if the day element has a value
                day = data[0].text
                if day != '':
                    # check to see if we already have this day in the dicitonary. 
                    # if so, skip it. just bad html formatting.
                    # if not, let's add it cuz it means no allergen data found for that day
                    if not any(data['MonthYear'] == month_date and data['Day'] == int(day) for data in raw_data):
                        print(f'Missing Allergen. Counter {counter} Day: {day}')
                        raw_data_dict = {}
                        raw_data_dict["MonthYear"] = month_date
                        raw_data_dict["Month"] = month
                        raw_data_dict["Year"] = year
                        raw_data_dict["Day"] = int(day)
                        raw_data_dict["Allergens"] = ''
                        raw_data.append(raw_data_dict)  

                        calc_day +=1
                    
        except:
            print(f'Issue for counter {counter}')
        finally:
            counter += 1

---- Processing file resources/html_files/cal_v2/09_sep.html -----
Counter 0 Day:  Calc Day: 1 Allergens: Mold Medium.
Counter 2 Day: 2 Calc Day: 2 Allergens: Mold Medium.
Counter 3 Day:  Calc Day: 3 Allergens: Molds High with Alternaria.
Counter 5 Day:  Calc Day: 4 Allergens: Mold Medium
Counter 7 Day:  Calc Day: 5 Allergens: Fall Elm Low 7 gr/m3, Grass Medium 20 gr/m3, and Mold Medium with Alternaria
Counter 9 Day:  Calc Day: 6 Allergens: Mold Medium.
Counter 11 Day:  Calc Day: 7 Allergens: Mold Medium.
Counter 13 Day:  Calc Day: 8 Allergens: Grass Medium 13 gr/m3, and Mold Medium with Alternaria
Counter 15 Day: 9 Calc Day: 9 Allergens: Ragweed medium 13 gr/m3, Mold Medium with alternaria
Counter 16 Day:  Calc Day: 10 Allergens: Ragweed Low 7 gr/m3, Mold Medium
Counter 18 Day:  Calc Day: 11 Allergens: Ragweed Low 7 gr/m3, Mold Medium
Counter 20 Day: 12 Calc Day: 12 Allergens: Ragweed Medium 27 gr/m3, Mold Medium with Alternaria.
Counter 21 Day: 13 Calc Day: 13 Allergens: Ragweed Medi

# Transforming KVUE Data

In [5]:
# put the list of dictionary data into a dataframe
df = pd.DataFrame(raw_data)
df.head()

Unnamed: 0,Allergens,Day,Month,MonthYear,Year
0,Mold Medium.,1,September,September 2019,2019
1,Mold Medium.,2,September,September 2019,2019
2,Molds High with Alternaria.,3,September,September 2019,2019
3,Mold Medium,4,September,September 2019,2019
4,"Fall Elm Low 7 gr/m3, Grass Medium 20 gr/m3, a...",5,September,September 2019,2019


In [6]:
# clean up some of the data

# replace Fall Elm with just Elm
df['Allergens'].replace({'Fall Elm':'Elm'}, regex=True, inplace=True)

# # get rid of the word 'and'
df['Allergens'].replace({' and ':''}, regex=True, inplace=True)



In [7]:
df.head()

Unnamed: 0,Allergens,Day,Month,MonthYear,Year
0,Mold Medium.,1,September,September 2019,2019
1,Mold Medium.,2,September,September 2019,2019
2,Molds High with Alternaria.,3,September,September 2019,2019
3,Mold Medium,4,September,September 2019,2019
4,"Elm Low 7 gr/m3, Grass Medium 20 gr/m3,Mold Me...",5,September,September 2019,2019


In [8]:
# create a new formatted Date column combining Month, Day and Year fields
df['Date'] = df.apply(lambda x:datetime.strptime("{0} {1} {2}".format(x['Year'],x['Month'], x['Day']), "%Y %B %d"),axis=1)

# drop the Month, MonthYear, Year and Day columns
df.drop(['Month', 'MonthYear', 'Year', 'Day'], axis=1, inplace=True)
df.head()

Unnamed: 0,Allergens,Date
0,Mold Medium.,2019-09-01
1,Mold Medium.,2019-09-02
2,Molds High with Alternaria.,2019-09-03
3,Mold Medium,2019-09-04
4,"Elm Low 7 gr/m3, Grass Medium 20 gr/m3,Mold Me...",2019-09-05


In [9]:
# rearrange columns, order by date, reset index
cols = ['Date', 'Allergens']
df = df.reindex(columns=cols).sort_values('Date')
df.reset_index(drop=True, inplace=True)


In [10]:
df.head()

Unnamed: 0,Date,Allergens
0,2019-01-01,"Cedar Medium 346 gr/m3, Mold Medium"
1,2019-01-02,Mold Low.
2,2019-01-03,Mold Low.
3,2019-01-04,Mold Low.
4,2019-01-05,"Cedar Low 20 gr/m3, Mold Medium"


In [11]:
# export data to csv
df.to_csv('resources/csv_files/kvue_daily_allergens.csv', index=False)

## Parse out allergens into seperate data frame with one allergen per row

In [12]:
# Create function that will parse out the string of allergens into a list of dictionaries
def parse_allergen(date, raw_allergen):
    '''
    Pass in index, date, and allergen, parse out what we got!
    '''
    allergens = raw_allergen.split(',')
#     allergens

    parsed_allergens = []
    for allergen in allergens:
        try:
    #         print(f'Processing allergen: {allergen}')
            # set defaults
            p_allergen = np.nan
            p_severity = np.nan
            p_measure = np.nan

            raw = allergen.strip().split(' ')
    #         print(raw)

            if len(raw) == 2:
    #             print('length of 2')
                if raw[1].find('gr/m3') == -1:
                    # not a bad parse
                    p_allergen = raw[0].replace('.', '')
                    p_severity = raw[1].replace('.', '')

            elif len(raw) == 4:
    #             print('length of 4')
                p_allergen = raw[0].replace('.', '')
                p_severity = raw[1].replace('.', '')
                if raw[3].find('gr/m3') != -1:
                    p_measure = raw[2]

            parsed = {
                'Date': date,
                'Allergen': p_allergen,
                'Severity': p_severity,
                'Measure': p_measure
            }

    #         print(f'In loop: {parsed}')

            parsed_allergens.append(parsed)
        except:
            print(f'Error parsing index {index} and allergen {allergen}')
    return parsed_allergens

In [13]:
# loop through daily allergen data frame, parse allergen data, add to list
daily_allergens = []
for index, row in df.iterrows():
#     print(index)
#     print(row['Allergens'])
    
    if not pd.isna(row['Allergens']):
        parsed_allergens = parse_allergen(row['Date'], row['Allergens'])

#         print(parsed_allergens)
        for p in parsed_allergens:
            daily_allergens.append(p)

#     else:
#         print('has nan')

In [14]:
daily_allergens

[{'Date': Timestamp('2019-01-01 00:00:00'),
  'Allergen': 'Cedar',
  'Severity': 'Medium',
  'Measure': '346'},
 {'Date': Timestamp('2019-01-01 00:00:00'),
  'Allergen': 'Mold',
  'Severity': 'Medium',
  'Measure': nan},
 {'Date': Timestamp('2019-01-02 00:00:00'),
  'Allergen': 'Mold',
  'Severity': 'Low',
  'Measure': nan},
 {'Date': Timestamp('2019-01-03 00:00:00'),
  'Allergen': 'Mold',
  'Severity': 'Low',
  'Measure': nan},
 {'Date': Timestamp('2019-01-04 00:00:00'),
  'Allergen': 'Mold',
  'Severity': 'Low',
  'Measure': nan},
 {'Date': Timestamp('2019-01-05 00:00:00'),
  'Allergen': 'Cedar',
  'Severity': 'Low',
  'Measure': '20'},
 {'Date': Timestamp('2019-01-05 00:00:00'),
  'Allergen': 'Mold',
  'Severity': 'Medium',
  'Measure': nan},
 {'Date': Timestamp('2019-01-06 00:00:00'),
  'Allergen': 'Cedar',
  'Severity': 'Low',
  'Measure': '20'},
 {'Date': Timestamp('2019-01-06 00:00:00'),
  'Allergen': 'Mold',
  'Severity': 'Medium',
  'Measure': nan},
 {'Date': Timestamp('2019-0

In [15]:
# put daily allergens into dataframe
df_allergens_speciated = pd.DataFrame(daily_allergens)
df_allergens_speciated.head()

Unnamed: 0,Allergen,Date,Measure,Severity
0,Cedar,2019-01-01,346.0,Medium
1,Mold,2019-01-01,,Medium
2,Mold,2019-01-02,,Low
3,Mold,2019-01-03,,Low
4,Mold,2019-01-04,,Low


In [16]:
# rearrange columns, order by date, reset index
cols = ['Date', 'Allergen', 'Severity', 'Measure']
df_allergens_speciated = df_allergens_speciated.reindex(columns=cols).sort_values('Date')
df_allergens_speciated.reset_index(drop=True, inplace=True)
df_allergens_speciated.head()

Unnamed: 0,Date,Allergen,Severity,Measure
0,2019-01-01,Cedar,Medium,346.0
1,2019-01-01,Mold,Medium,
2,2019-01-02,Mold,Low,
3,2019-01-03,Mold,Low,
4,2019-01-04,Mold,Low,


In [17]:
# export data to csv
df_allergens_speciated.to_csv('resources/csv_files/kvue_daily_allergens_speciated.csv', index=False)