In [97]:
import csv
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import itertools as it
from multiprocessing import Queue
# Path variables go here. csv_filename is the CSV file that get_links()
# is called with further down to read the 'link' column.
csv_filename = './meetings.csv'


## BUILDING A BASIC SCRAPER:
<hr>

In [98]:
# CREATE A CSV PARSER TO GET LINKS FROM CSV FILE:
def csv_parser(csv_reader, header: str):
    """Return every value found in the column named *header*.

    :param csv_reader: an iterator of rows (lists of strings), such as the
        object returned by ``csv.reader``. The first row is consumed as the
        header row.
    :param header: name of the column to extract.
    :return: list with one entry per non-blank data row, in file order.
        An input with no rows at all yields an empty list.
    :raises ValueError: if *header* is not present in the header row.
    :raises IndexError: if a non-blank row is too short to hold the column.
    """
    # Default of None keeps an empty file from leaking StopIteration.
    header_row = next(csv_reader, None)
    if header_row is None:
        return []

    column = header_row.index(header)
    # csv.reader yields [] for blank lines; skip those instead of crashing.
    return [line[column] for line in csv_reader if line]

# CREATE A CSV WRITER TO WRITE ROW DATA TO A NEW CSV FILE WITH MEETING DETAILS:
def csv_writer(csv_writer, data):
    """Write one meeting-details dictionary to ``new_meeting_data.csv``.

    The file is opened in ``'w'`` mode, so every call replaces its previous
    contents with a header row (the keys of *data*) and a single data row.

    :param csv_writer: unused; kept only so existing positional calls keep
        working. (The original body rebound this name without reading it.)
    :param data: dictionary mapping column names to the values of one row.
    """
    # newline='' lets the csv module control line endings itself.
    with open('new_meeting_data.csv', 'w', newline='') as f:
        # The keyword is ``fieldnames``; ``field_names`` raises TypeError.
        writer = csv.DictWriter(f, fieldnames=list(data.keys()))
        writer.writeheader()
        writer.writerow(data)


In [99]:
# create func to get list of links:
def get_links(csv_filename):
    """Return the values of the 'link' column of the given CSV file.

    :param csv_filename: path of the CSV file to read; its first row must
        contain a 'link' header.
    :return: list of the 'link' values, as produced by ``csv_parser``.
    """
    # The csv module expects files opened with newline='' so that newlines
    # embedded in quoted fields are handled correctly.
    with open(csv_filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        # csv_parser returns a fully built list, so the file can close here.
        return csv_parser(csv_reader, 'link')

# create func to get list of addresses:
def get_addresses(csv_filename):
    """Return the values of the 'address' column of the given CSV file.

    :param csv_filename: path of the CSV file to read; its first row must
        contain an 'address' header.
    :return: list of the 'address' values, as produced by ``csv_parser``.
    """
    # The csv module expects files opened with newline='' so that newlines
    # embedded in quoted fields are handled correctly.
    with open(csv_filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        # csv_parser returns a fully built list, so the file can close here.
        return csv_parser(csv_reader, 'address')

In [100]:
def fetch_soup_data(link, timeout=30):
    """Download *link* and return its HTML parsed as a BeautifulSoup object.

    :param link: URL of the page to fetch.
    :param timeout: seconds to wait for the server before giving up. Without
        a timeout ``requests.get`` can block indefinitely on a stalled
        connection, which would hang a whole scraping run.
    :return: the page text parsed with the "lxml" parser.
    :raises requests.exceptions.RequestException: on connection problems or
        when the timeout expires.
    """
    # STEP 1: Get the webpage response obj from requests
    page = requests.get(link, timeout=timeout)
    # STEP 2: Get Soup object for html parsing
    return bs(page.text, "lxml")

In [101]:
# CREATE A FUNCTION THAT WILL EXTRACT ADDRESS DATA FROM EACH LINK IN 
# LINK LIST:
def _content_or_placeholder(tag, index, placeholder):
    """Return ``tag.contents[index]``, or *placeholder* if it cannot be read."""
    try:
        return tag.contents[index]
    except (AttributeError, IndexError) as err:
        # AttributeError: the tag was not found (tag is None).
        # IndexError: the tag exists but has fewer children than expected.
        print(f"Missing '{placeholder}' data: {err}")
        return placeholder


def get_address_data(soup):
    """Extract the meeting name, address, city and state from a parsed page.

    Each field is looked up independently. When a field cannot be found its
    own key name is used as a placeholder value, so one missing element no
    longer discards the fields that were found.

    :param soup: parsed page exposing ``.address`` and ``.find(...)``, with
        tags exposing ``.contents``, ``.find(...)`` and ``.find_next(...)``
        (a BeautifulSoup object in this file).
    :return: dict with the keys 'name', 'address', 'city' and 'state'.
    """
    # A failed lookup returns None here, so every step is guarded rather
    # than letting an AttributeError escape to the caller.
    card = soup.find('div', class_='fui-card-body')
    name_tag = card.find(class_='weight-300') if card is not None else None
    city_tag = name_tag.find_next('a') if name_tag is not None else None
    state_tag = city_tag.find_next('a') if city_tag is not None else None

    return {
        'name': _content_or_placeholder(name_tag, 1, 'name'),
        'address': _content_or_placeholder(soup.address, 1, 'address'),
        'city': _content_or_placeholder(city_tag, 0, 'city'),
        'state': _content_or_placeholder(state_tag, 0, 'state'),
    }

In [102]:
# CREATE A FUNCTION THAT WILL EXTRACT ALL THE TABLE DATA FROM EACH LINK
# IN THE LINK LIST. THE TABLE DATA WILL THEN NEED TO BE PARSED AND 
# CLEANED IF THERE ARE MULTIPLE ITEMS:
def get_table_data(soup):
    """Extract the 'day', 'time' and 'info' columns of the meeting table.

    The table is located with ``soup.find('table', class_='table fui-table')``.
    Column names come from the lower-cased text of its ``<th>`` cells, and
    every ``<tr>`` after the first contributes one row of ``<td>`` texts.

    :param soup: parsed page exposing ``.find(...)`` (a BeautifulSoup object
        in this file).
    :return: dict with the keys 'day', 'time' and 'info', each mapping to a
        list of strings with one entry per table row. When the table or one
        of the three columns cannot be read, each list holds just its own
        key name as a placeholder. The values are always lists, so callers
        can ``' | '.join`` them without splitting a placeholder into
        characters.
    """
    wanted = ('day', 'time', 'info')
    try:
        info_table = soup.find('table', class_='table fui-table')
        # obtain all the columns from <th>
        headers = [th.text.lower() for th in info_table.find_all('th')]
        # Position of each wanted column; ValueError if one is absent.
        positions = {key: headers.index(key) for key in wanted}

        # a row is under the <tr> tags and data under the <td> tags;
        # the first <tr> is skipped (the header row).
        rows = [
            [td.text for td in tr.find_all('td')]
            for tr in info_table.find_all('tr')[1:]
        ]
        return {key: [row[idx] for row in rows]
                for key, idx in positions.items()}

    except (AttributeError, ValueError, IndexError) as err:
        # AttributeError: no table found (find returned None).
        # ValueError: a wanted header is missing.
        # IndexError: a row has too few cells.
        print(f"Could not read meeting table: {err}")
        return {key: [key] for key in wanted}

In [103]:
# CREATE A FUNCTION THAT WILL PARSE THE ROW DATA AND STORE IT 
# IN A DICTIONARY. THAT DICTIONARY CAN THEN BE INSERTED INTO 
# A LIST OF DICTIONARIES CALLED ROW_LIST BUT TO DO THIS THE PARSER
# HAS TO JOIN LIST ITEMS INTO ONE LONG STRING SO EACH ROW HAS THE 
# SAME NUMBER OF COLUMNS:
# THIS WAS INSERTED INTO meeting_data_scraper

# VERSION 1
def row_parser1(item0, item1):
    """Merge address data and meeting details into one flat row dictionary.

    :param item0: the address data in a dictionary with the keys
        'name' - 'address' - 'city' - 'state'. Anything that is not a
        dictionary (e.g. ``None``) is treated as having no data.
    :param item1: the meeting details in a dictionary with the keys
        'day' - 'time' - 'info'. Each value is either a list of strings or
        a single string.
    :return: dictionary with all seven keys. List values from *item1* are
        joined into one string with ' | ' between the items so they can be
        split again when the data is retrieved. A missing address field is
        replaced by its own key name.
    :raises KeyError: if *item1* lacks one of its three keys.
    """
    if not isinstance(item0, dict):
        print(f"No address data to parse: {item0!r}")
        item0 = {}

    # Fall back per field, so one missing key does not wipe out the rest.
    row = {key: item0.get(key, key)
           for key in ('name', 'address', 'city', 'state')}

    # now add item1 to the row data
    for key in ('day', 'time', 'info'):
        values = item1[key]
        # Joining a plain string would put ' | ' between its characters
        # ('day' -> 'd | a | y'), so strings are stored unchanged.
        row[key] = values if isinstance(values, str) else ' | '.join(values)

    # now return the row data dictionary
    return row

# VERSION 2: 
def row_parser2(item0, item1):
    """Merge address data and meeting details into one flat row dictionary.

    Unlike ``row_parser1`` this version never raises for missing data: any
    field that is absent or ``None`` in either input is replaced by its own
    key name.

    :param item0: the address data in a dictionary with the keys
        'name' - 'address' - 'city' - 'state'.
    :param item1: the meeting details in a dictionary with the keys
        'day' - 'time' - 'info'. Each value is either a list of strings or
        a single string.
    :return: dictionary with all seven keys. List values from *item1* are
        joined into one string with ' | ' between the items so they can be
        split again when the data is retrieved.
    """
    address = item0 if isinstance(item0, dict) else {}
    details = item1 if isinstance(item1, dict) else {}

    row = {}
    for key in ('name', 'address', 'city', 'state'):
        value = address.get(key)
        # Write the placeholder into the dict itself; rebinding a loop
        # variable (as the original did) never changes the row.
        row[key] = key if value is None else value

    # now add item1 to the row data
    for key in ('day', 'time', 'info'):
        value = details.get(key)
        if value is None:
            row[key] = key
        elif isinstance(value, str):
            # Joining a string would separate its individual characters.
            row[key] = value
        else:
            row[key] = ' | '.join(value)

    # now return the row data dictionary
    return row


In [104]:
# THIS IS THE 'MAIN LOGICAL FUNCTION'. IT COMBINES THE fetch_soup_data,
# get_address_data, get_table_data, AND row_parser1 FUNCTIONS SO THAT
# ALL OF THEM CAN BE EXECUTED FOR ONE LINK IN A SINGLE CALL.
def meeting_data_scraper(link):
    """Fetch one meeting page and return its data as a single row dict.

    :param link: URL handed to ``fetch_soup_data``.
    :return: the result of ``row_parser1`` applied to the page's address
        data and table data.
    """
    page_soup = fetch_soup_data(link)

    # Address data is extracted first, then the table data, and both are
    # merged into one row.
    return row_parser1(get_address_data(page_soup), get_table_data(page_soup))


In [105]:
def flatten(list_of_lists):
    """Chain the items of every inner iterable into one iterator.

    Only a single level of nesting is removed; deeper nesting is left as is.
    """
    join_one_level = it.chain.from_iterable
    return join_one_level(list_of_lists)

In [112]:
# Every link found in the CSV file named by csv_filename.
link_list = get_links(csv_filename)

# Shorter prefixes of the full list: the first 1000, 500 and 100 links.
links_1000, links_500, links_100 = (
    link_list[:limit] for limit in (1000, 500, 100)
)


In [107]:
def scrape(link_list):
    """Fetch every link and return the parsed pages as a list.

    :param link_list: iterable of URLs, each passed to ``fetch_soup_data``.
    :return: list of the objects returned by ``fetch_soup_data``, in input
        order, or ``None`` if an ``IndexError`` was raised while fetching.

    NOTE(review): iterating over the links does not itself raise
    IndexError, so the handler can only fire if ``fetch_soup_data`` does.
    Any other exception propagates to the caller.
    """
    try:
        return [fetch_soup_data(url) for url in link_list]
    except IndexError as e:
        print(f"{e}: List Exhausted. No More Items to Scrape..")
        return None

In [108]:
# Fetch and parse the pages for the first 100 links (scrape() calls
# fetch_soup_data() once per link).
soup_data = scrape(links_100)


In [109]:
# BATCH VERSION OF THE 'MAIN LOGICAL FUNCTION': THIS FUNCTION RUNS
# get_address_data, get_table_data, AND row_parser1 ON EVERY SOUP OBJECT
# IN A LIST AND COLLECTS THE RESULTING ROWS.
def soup_data_scraper(soup_data):
    """Turn a list of parsed pages into a list of row dictionaries.

    :param soup_data: iterable of parsed pages (as returned by ``scrape``).
    :return: list with one ``row_parser1`` result per page, in input order,
        or ``None`` if an ``IndexError`` escaped from one of the helpers.
    """
    try:
        # For each page: address data first, then table data, merged into
        # one row.
        return [
            row_parser1(get_address_data(page), get_table_data(page))
            for page in soup_data
        ]
    except IndexError as e:
        print(f"{e}: List Exhausted. No More Soup Items.")
        return None

In [110]:
# Convert the parsed pages fetched above into one row dictionary per page.
rows = soup_data_scraper(soup_data)

In [116]:
def get_rows(link_list):
    """Fetch every link and return the extracted rows.

    :param link_list: iterable of URLs passed to ``scrape``.
    :return: the list produced by ``soup_data_scraper``, or ``None`` when an
        exception interrupted the run (the exception is printed, not raised).
    """
    try:
        soup_data = scrape(link_list)
        return soup_data_scraper(soup_data)

    # The specific handler must come first: RequestException is a subclass
    # of Exception, so placed after ``except Exception`` it could never run.
    except requests.exceptions.RequestException as req_err:
        print(f"Exception Raised: --> |{req_err}|")

    except Exception as e:
        print(f"Exception {e}: Continue running..")

    return None

In [117]:
# Run the whole pipeline on the full link list. get_rows() prints and
# returns None if an exception interrupts the run.
row_data = get_rows(link_list)

In [None]:
# Presumably the output path to pass to csv_writer() below — no call is shown.
# NOTE(review): this rebinds the csv_filename name that held the input path
# './meetings.csv' earlier in the file, so a later get_links(csv_filename)
# call would read this file instead; consider a distinct name.
csv_filename = './meeting_details.csv'
def csv_writer(row_data, csv_filename):
    """Write the row dictionaries to a CSV file, one line per row.

    The first line of the file is a header made of the dictionary keys, in
    order of first appearance across all rows. A row lacking one of the
    columns gets an empty cell there. An empty *row_data* produces an empty
    file.

    :param row_data: iterable of dictionaries (column name -> value).
    :param csv_filename: path of the file to create or overwrite.
    :raises TypeError: if *row_data* is not iterable (e.g. ``None``); this is
        raised before the file is opened, so an existing file is untouched.
    """
    rows = list(row_data)
    # dict.fromkeys keeps first-seen order while dropping duplicate keys.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    # newline='' lets the csv module control line endings itself.
    with open(csv_filename, 'w', newline='') as f:
        if not rows:
            return
        # Passing a string to writerow() would split it into one character
        # per column; DictWriter writes each dictionary as one line.
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(rows)