In [6]:
import csv
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from multiprocessing.pool import Pool, ThreadPool
from multiprocessing.pool import MaybeEncodingError
import multiprocessing
import itertools as it

# Configuration: file and directory path variables for the notebook go here.
# csv_filename is the input CSV that get_links() opens to read the 'link' column.
csv_filename = './meetings.csv'


## BUILDING A BASIC SCRAPER:
<hr>

In [7]:
# CREATE A CSV PARSER TO GET LINKS FROM CSV FILE:
def csv_parser(csv_reader, header: str):
    """Collect every value of one named column from a CSV reader.

    :param csv_reader: iterator of rows (lists of strings); its first row is
        consumed as the header row.
    :param header: name of the column to extract.
    :return: list with the value of that column for each remaining row, in order.
    :raises ValueError: if `header` is not in the header row.
    """
    # The first row holds the column names; locate the requested one.
    column_names = next(csv_reader)
    column = column_names.index(header)
    # Every row left in the reader is a data row.
    return [record[column] for record in csv_reader]

# CREATE A CSV WRITER TO WRITE ROW DATA TO A NEW CSV FILE WITH MEETING DETAILS:
def csv_writer(csv_writer, data, filename='new_meeting_data.csv'):
    """Write one row of meeting data, preceded by a header row, to a CSV file.

    :param csv_writer: unused; kept only so existing two-argument calls keep
        working (the function always builds its own writer).
    :param data: dict mapping column name -> value for the single row.
    :param filename: path of the CSV file to create. The file is opened in
        'w' mode, so an existing file is overwritten.
    """
    fieldnames = list(data.keys())
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='') as f:
        # The keyword is `fieldnames`; `field_names` raises TypeError.
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerow(data)


In [3]:
# create func to get list of links:
def get_links(csv_filename):
    """Return every value of the 'link' column of a CSV file.

    :param csv_filename: path of the CSV file; its first row must be a header
        row containing a 'link' column.
    :return: list of the 'link' values, in file order.
    """
    # newline='' is required by the csv module so that newlines embedded in
    # quoted fields are parsed correctly.
    with open(csv_filename, 'r', newline='') as f:
        return csv_parser(csv.reader(f), 'link')

# Load the 'link' column of the CSV named by csv_filename into a list.
links = get_links(csv_filename)

In [424]:
# CREATE A FUNCTION THAT WILL EXTRACT ADDRESS DATA FROM EACH LINK IN 
# LINK LIST:
def get_address_data(soup):
    """Extract the meeting name, street address, city and state from a page.

    Each field is looked up independently. A field whose tag or child node is
    missing is replaced by a placeholder equal to its own key ('name',
    'address', 'city' or 'state'), so the fields that were found are kept and
    the result always has all four keys.

    :param soup: parsed page exposing the BeautifulSoup API used here
        (`.address`, `.find`, `.find_next`, `.contents`).
    :return: dict with the keys 'name', 'address', 'city' and 'state'.
    """
    missing = []

    def _call(tag, method, *args, **kwargs):
        # None-safe method call: a failed lookup upstream yields None instead
        # of raising AttributeError.
        return None if tag is None else getattr(tag, method)(*args, **kwargs)

    def _content(tag, index, placeholder):
        # Child number `index` of `tag`, or the placeholder when the tag or
        # that child does not exist.
        try:
            return tag.contents[index]
        except (AttributeError, IndexError, TypeError):
            missing.append(placeholder)
            return placeholder

    address_tag = getattr(soup, 'address', None)
    card_body = _call(soup, 'find', 'div', class_='fui-card-body')
    meeting_name = _call(card_body, 'find', class_='weight-300')
    # City and state are the next two links after the meeting name.
    city_tag = _call(meeting_name, 'find_next', 'a')
    state_tag = _call(city_tag, 'find_next', 'a')

    data = {
        'name': _content(meeting_name, 1, 'name'),
        'address': _content(address_tag, 1, 'address'),
        'city': _content(city_tag, 0, 'city'),
        'state': _content(state_tag, 0, 'state'),
    }
    if missing:
        print(f"Missing address fields, placeholders used: {missing}")
    return data

In [425]:
# CREATE A FUNCTION THAT WILL EXTRACT ALL THE TABLE DATA FROM EACH LINK
# IN THE LINK LIST. THE TABLE DATA WILL THEN NEED TO BE PARSED AND 
# CLEANED IF THERE ARE MULTIPLE ITEMS:
def get_table_data(soup):
    """Extract the 'day', 'time' and 'info' columns of the meeting table.

    The table is the first <table class="table fui-table">. Column names are
    the lower-cased, stripped <th> texts; the first <tr> is treated as the
    header row and skipped. Rows whose number of <td> cells differs from the
    number of headers are ignored.

    :param soup: parsed page exposing the BeautifulSoup `find` API.
    :return: dict with the keys 'day', 'time' and 'info'. Each value is the
        list of cell texts of that column. When the table is missing, every
        value is the placeholder string equal to its key; when only one
        column is missing, that column's value is its placeholder string.
    """
    wanted = ('day', 'time', 'info')
    try:
        info_table = soup.find('table', class_='table fui-table')
        headers = [th.text.lower().strip() for th in info_table.find_all('th')]

        # Plain lists replace the DataFrame that was grown one row at a time.
        columns = {key: [] for key in wanted if key in headers}
        for tr in info_table.find_all('tr')[1:]:
            cells = [td.text for td in tr.find_all('td')]
            if len(cells) != len(headers):
                # Spacer or malformed row: it cannot be mapped onto the headers.
                continue
            record = dict(zip(headers, cells))
            for key, values in columns.items():
                values.append(record[key])

        return {key: columns.get(key, key) for key in wanted}

    except AttributeError as ae:
        # Raised when soup.find returned None, i.e. the page has no table.
        print(f"info_table.find_all('tr')[1:] raised error: {ae}")
        return {'day': 'day', 'time': 'time', 'info': 'info'}

In [426]:
# CREATE A FUNCTION THAT WILL PARSE THE ROW DATA AND STORE IT 
# IN A DICTIONARY. THAT DICTIONARY CAN THEN BE INSERTED INTO 
# A LIST OF DICTIONARIES CALLED ROW_LIST BUT TO DO THIS THE PARSER
# HAS TO JOIN LIST ITEMS INTO ONE LONG STRING SO EACH ROW HAS THE 
# SAME NUMBER OF COLUMNS:
# THIS WAS INSERTED INTO meeting_data_scraper
def meeting_row_parser(item0, item1):
    """
    Combine the address data and the meeting details into one flat row.

    :param item0: This is the address data in a dictionary. Use the following keys to access
    the data -> Keys: 'name' - 'address' - 'city' - 'state'
    :param item1: This is the meeting details data in a dictionary. Use the following keys to
    access the data -> Keys: 'day' - 'time' - 'info'
    :return: dict with the seven keys above. List values of item1 are joined
    with ' | ' so the string can be split again when the data is retrieved.
    A value that is missing (or a whole argument that is None) is replaced by
    a placeholder equal to its key, so a row is always returned.
    """
    def _value(source, key):
        # Defensive lookup: the source may be None or may lack the key.
        try:
            value = source[key]
        except (KeyError, IndexError, TypeError):
            return key
        return key if value is None else value

    def _joined(source, key):
        value = _value(source, key)
        if isinstance(value, str):
            # Already a single string (e.g. a placeholder); joining it would
            # separate its characters ('d | a | y').
            return value
        try:
            return ' | '.join(str(part) for part in value)
        except TypeError:
            # Not iterable: fall back to the placeholder.
            return key

    row = {key: _value(item0, key) for key in ('name', 'address', 'city', 'state')}
    # now add item1 to the row data
    row.update((key, _joined(item1, key)) for key in ('day', 'time', 'info'))
    return row


In [427]:
# THIS IS THE 'MAIN LOGICAL FUNCTION' THIS FUNCTION WILL COMBINE THE 
# get_address_data, get_table_data, and meeting_row_parser FUNCTIONS.
# THAT WAY I CAN EXECUTE ALL OF THE FUNCTIONS IN ONE CALL.
def meeting_data_scraper(link, timeout=None):
    """Download one meeting page and return its data as a single flat row.

    Combines the three steps (address extraction, table extraction, row
    building) so the whole scrape of one link is a single call.

    :param link: URL of the meeting page.
    :param timeout: passed to `requests.get`; None (the default) waits
        indefinitely, exactly as a call without a timeout does.
    :return: dict with the keys 'name', 'address', 'city', 'state', 'day',
        'time' and 'info'. A value that cannot be extracted is replaced by a
        placeholder equal to its key.
    """

    def get_address_data(soup):
        # Each field is looked up independently so one missing tag does not
        # discard the fields that were found.
        def _call(tag, method, *args, **kwargs):
            return None if tag is None else getattr(tag, method)(*args, **kwargs)

        def _content(tag, index, placeholder):
            try:
                return tag.contents[index]
            except (AttributeError, IndexError, TypeError):
                return placeholder

        address_tag = getattr(soup, 'address', None)
        card_body = _call(soup, 'find', 'div', class_='fui-card-body')
        meeting_name = _call(card_body, 'find', class_='weight-300')
        city_tag = _call(meeting_name, 'find_next', 'a')
        state_tag = _call(city_tag, 'find_next', 'a')
        return {
            'name': _content(meeting_name, 1, 'name'),
            'address': _content(address_tag, 1, 'address'),
            'city': _content(city_tag, 0, 'city'),
            'state': _content(state_tag, 0, 'state'),
        }

    def get_table_data(soup):
        # Returns lists of cell texts per column, or the placeholder string
        # for a column (or the whole table) that is missing.
        wanted = ('day', 'time', 'info')
        try:
            info_table = soup.find('table', class_='table fui-table')
            headers = [th.text.lower().strip() for th in info_table.find_all('th')]
            columns = {key: [] for key in wanted if key in headers}
            # The first <tr> is the header row.
            for tr in info_table.find_all('tr')[1:]:
                cells = [td.text for td in tr.find_all('td')]
                if len(cells) != len(headers):
                    continue  # spacer or malformed row
                record = dict(zip(headers, cells))
                for key, values in columns.items():
                    values.append(record[key])
            return {key: columns.get(key, key) for key in wanted}
        except AttributeError as ae:
            print(f"info_table.find_all('tr')[1:] raised error: {ae}")
            return {'day': 'day', 'time': 'time', 'info': 'info'}

    def meeting_row_parser(item0, item1):
        # item0: address dict; item1: table dict. Lists are joined with ' | '
        # so the string can be split again when the data is retrieved.
        def _value(source, key):
            try:
                value = source[key]
            except (KeyError, IndexError, TypeError):
                return key
            return key if value is None else value

        def _joined(source, key):
            value = _value(source, key)
            if isinstance(value, str):
                # A placeholder string must not be joined character by character.
                return value
            try:
                return ' | '.join(str(part) for part in value)
            except TypeError:
                return key

        row = {key: _value(item0, key) for key in ('name', 'address', 'city', 'state')}
        row.update((key, _joined(item1, key)) for key in ('day', 'time', 'info'))
        return row

    # STEP 1: Get the webpage response obj from requests
    page = requests.get(link, timeout=timeout)

    # STEP 2: Get Soup object for html parsing
    soup = bs(page.text, "lxml")

    # STEP 3: Extract both dicts and merge them into one row
    return meeting_row_parser(get_address_data(soup), get_table_data(soup))


In [430]:
def flatten(list_of_lists):
    """Flatten one level of nesting.

    :param list_of_lists: iterable of iterables.
    :return: lazy `itertools.chain` iterator over the items of each inner
        iterable, in order.
    """
    chained = it.chain.from_iterable(list_of_lists)
    return chained

In [5]:
# whole list of links
link_list = get_links(csv_filename)

# shortened list: first 1000
# (a slice end is exclusive, so [:1000] yields exactly 1000 links)
links_1000 = link_list[:1000]

# shortened list: first 500
links_500 = link_list[:500]


In [4]:
# Rebind the input path and reload the full list of links from it.
# This repeats the path assignment and the get_links() call of the earlier cells.
csv_filename = './meetings.csv'
link_list = get_links(csv_filename)

In [434]:
# TODO: CREATE FUNCTION THAT WILL GET THE PAGE CONTENTS AND CREATE SOUP OBJ
def fetch_soup_data(link):
    """Download `link` and return its body parsed with the 'lxml' parser.

    :param link: URL of the page to fetch.
    :return: the parsed page object built from the response text.
    """
    response = requests.get(link)
    return bs(response.text, 'lxml')

# TODO: CREATE FUNCTION THAT WILL RETRIEVE THE ADDRESS DATA AND RETURN DICT
def get_address_data(soup):
    """Return the meeting's name, street address, city and state as a dict.

    Every field is resolved on its own: when a tag or one of its child nodes
    cannot be found, that field gets a placeholder equal to its key and the
    remaining fields are still returned. The result therefore always has the
    keys 'name', 'address', 'city' and 'state'.

    :param soup: parsed page exposing the BeautifulSoup API used here
        (`.address`, `.find`, `.find_next`, `.contents`).
    :return: dict with the keys 'name', 'address', 'city' and 'state'.
    """
    missing = []

    def _call(tag, method, *args, **kwargs):
        # Propagate None instead of raising AttributeError on a failed lookup.
        return None if tag is None else getattr(tag, method)(*args, **kwargs)

    def _content(tag, index, placeholder):
        # tag.contents[index], or the placeholder if the tag/child is absent.
        try:
            return tag.contents[index]
        except (AttributeError, IndexError, TypeError):
            missing.append(placeholder)
            return placeholder

    address_tag = getattr(soup, 'address', None)
    card_body = _call(soup, 'find', 'div', class_='fui-card-body')
    meeting_name = _call(card_body, 'find', class_='weight-300')
    # City and state are the next two links after the meeting name.
    city_tag = _call(meeting_name, 'find_next', 'a')
    state_tag = _call(city_tag, 'find_next', 'a')

    data = {
        'name': _content(meeting_name, 1, 'name'),
        'address': _content(address_tag, 1, 'address'),
        'city': _content(city_tag, 0, 'city'),
        'state': _content(state_tag, 0, 'state'),
    }
    if missing:
        print(f"Missing address fields, placeholders used: {missing}")
    return data

# TODO: CREATE FUNCTION THAT WILL RETRIEVE THE DETAILS DATA FROM AN HTML TABLE
def get_table_data(soup):
    """Return the 'day', 'time' and 'info' columns of the page's meeting table.

    The table is the first <table class="table fui-table">. Column names come
    from the lower-cased, stripped <th> texts, and the first <tr> is skipped
    as the header row. A row whose <td> count does not match the header count
    is ignored.

    :param soup: parsed page exposing the BeautifulSoup `find` API.
    :return: dict with the keys 'day', 'time' and 'info'. Each value is the
        list of cell texts for that column. A missing column, or a missing
        table, yields the placeholder string equal to the key instead.
    """
    wanted = ('day', 'time', 'info')
    try:
        info_table = soup.find('table', class_='table fui-table')
        headers = [th.text.lower().strip() for th in info_table.find_all('th')]

        # Collect the cells directly into lists; no DataFrame is needed.
        columns = {key: [] for key in wanted if key in headers}
        for tr in info_table.find_all('tr')[1:]:
            cells = [td.text for td in tr.find_all('td')]
            if len(cells) != len(headers):
                # Cannot be aligned with the headers (e.g. an empty row).
                continue
            record = dict(zip(headers, cells))
            for key, values in columns.items():
                values.append(record[key])

        return {key: columns.get(key, key) for key in wanted}

    except AttributeError as ae:
        # soup.find returned None: the page has no such table.
        print(f"info_table.find_all('tr')[1:] raised error: {ae}")
        return {'day': 'day', 'time': 'time', 'info': 'info'}

# TODO: PASS THOSE TWO DICTS TO A FUNCTION THAT WILL PARSE AND COMBINE THEM:
def meeting_row_parser(item0, item1):
    """
    Merge the address data and the meeting details into one flat row.

    :param item0: This is the address data in a dictionary. Use the following keys to access
    the data -> Keys: 'name' - 'address' - 'city' - 'state'
    :param item1: This is the meeting details data in a dictionary. Use the following keys to
    access the data -> Keys: 'day' - 'time' - 'info'
    :return: dict that always has all seven keys. List values of item1 are
    joined with '|' so the string can be split again when the data is
    retrieved. Each missing value is replaced individually by a placeholder
    equal to its key.
    """
    def _value(source, key):
        # The source may be None or may lack the key.
        try:
            value = source[key]
        except (KeyError, IndexError, TypeError):
            return key
        return key if value is None else value

    def _joined(source, key):
        value = _value(source, key)
        if isinstance(value, str):
            # A plain string (e.g. the 'day' placeholder) is kept whole;
            # joining it would produce 'd|a|y'.
            return value
        try:
            return '|'.join(str(part) for part in value)
        except TypeError:
            return key

    row = {key: _value(item0, key) for key in ('name', 'address', 'city', 'state')}
    # now add item1 to the row data
    row.update((key, _joined(item1, key)) for key in ('day', 'time', 'info'))
    return row


# TODO: CREATE A FUNCTION THAT WILL WRITE EACH ITEM IN THE ROW_DATA LIST TO A CSV FILE
# SAVING THE ROW VALUES IN THEIR KEY'S COLUMN:
def convert_row_data_to_csv(row_data, filename='meeting_details.csv'):
    """Write a list of row dicts to a CSV file, one column per key.

    The header is the union of all row keys in first-seen order. A row that
    lacks a key gets an empty cell in that column.

    :param row_data: iterable of dicts (column name -> value).
    :param filename: path of the CSV file to create; an existing file is
        overwritten.
    """
    rows = list(row_data)
    # dict.fromkeys keeps first-seen order while removing duplicate keys.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='') as csvfile:
        if not fieldnames:
            # Nothing to write: leave an empty file.
            return
        # One writer for the whole file; the keyword is `fieldnames`
        # (`field_names` raises TypeError).
        csvwriter = csv.DictWriter(
            csvfile, delimiter=',', fieldnames=fieldnames, restval='')
        csvwriter.writeheader()
        csvwriter.writerows(rows)

# TODO: GET THE LIST OF LINKS FROM THE CSV FILE IN meetings.csv
# (same path as the value assigned to csv_filename in the first cell)
csv_filename = './meetings.csv'

# create func to get list of links:
def get_links(csv_filename):
    """Read a CSV file and return the values of its 'link' column.

    :param csv_filename: path of the CSV file; the first row must be a header
        row that contains a 'link' column.
    :return: list of the 'link' values, in file order.
    """
    # The csv module requires newline='' so that newlines embedded in quoted
    # fields are parsed correctly.
    with open(csv_filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        return csv_parser(csv_reader, 'link')

# TODO: RETRIEVE THAT COMBINED DICT, AND APPEND TO AN EMPTY LIST CALLED ROW DATA


In [435]:
# TODO: CREATE FUNCTION THAT WILL GET THE PAGE CONTENTS AND CREATE SOUP OBJ
def fetch_soup_data(link):
    """Fetch the page at `link` and parse its text with the 'lxml' parser.

    :param link: URL of the page to download.
    :return: the parsed page object.
    """
    html = requests.get(link).text
    parsed = bs(html, 'lxml')
    return parsed

In [443]:
def main(csv_filename='./meetings.csv', limit=100):
    """Scrape the meeting pages listed in a CSV file and save them as CSV.

    :param csv_filename: CSV file whose 'link' column lists the pages.
    :param limit: number of links to scrape from the start of the list
        (default 100); None scrapes every link.
    """
    # Get Links
    links = get_links(csv_filename)
    selected = links if limit is None else links[:limit]

    # Build one row per link. Each page is parsed as soon as it is fetched,
    # so only one soup object is held in memory at a time.
    row_data = []
    for link in selected:
        soup = fetch_soup_data(link)
        address_data = get_address_data(soup)
        detail_data = get_table_data(soup)
        row_data.append(meeting_row_parser(address_data, detail_data))

    convert_row_data_to_csv(row_data)


In [8]:
class Meeting:
    """One scraped meeting page: address data, schedule table and merged row.

    Creating an instance downloads the page, extracts the data and stores
    the merged row in `self.row`. Instances are numbered by `rowId`, taken
    from the class-wide counter `COUNT`.
    """
    # Id given to the next instance; incremented after each successful build.
    COUNT = 1
    # Class-level list; nothing in this class reads or writes it.
    LINKS = []

    def __init__(self, link):
        """
        :param link: URL of the meeting page to scrape.
        """
        self.new_csv_path = '~/projects/github/web-scraper/scraped_meeting_data/'
        # COUNT is a class attribute, so it must be reached through the class;
        # a bare `COUNT` inside a method is an unbound local name.
        self.rowId = Meeting.COUNT
        self.soup = self._fetch_soup_data(link)
        self.address = self._address_data(self.soup)
        self.info = self._table_data(self.soup)
        self.row = self._row_parser(self.address, self.info)
        Meeting.COUNT += 1

    def _fetch_soup_data(self, link):
        """Download `link` and return its text parsed with the 'lxml' parser."""
        page = requests.get(link)
        soup = bs(page.text, 'lxml')
        return soup

    def _address_data(self, soup):
        """Extract 'name', 'address', 'city' and 'state' from the page.

        Each field is looked up independently; a field that cannot be found
        is replaced by a placeholder equal to its key, so all four keys are
        always present.
        """
        def _call(tag, method, *args, **kwargs):
            # None-safe method call for chained lookups.
            return None if tag is None else getattr(tag, method)(*args, **kwargs)

        def _content(tag, index, placeholder):
            try:
                return tag.contents[index]
            except (AttributeError, IndexError, TypeError):
                return placeholder

        address_tag = getattr(soup, 'address', None)
        card_body = _call(soup, 'find', 'div', class_='fui-card-body')
        meeting_name = _call(card_body, 'find', class_='weight-300')
        # City and state are the next two links after the meeting name.
        city_tag = _call(meeting_name, 'find_next', 'a')
        state_tag = _call(city_tag, 'find_next', 'a')
        return {
            'name': _content(meeting_name, 1, 'name'),
            'address': _content(address_tag, 1, 'address'),
            'city': _content(city_tag, 0, 'city'),
            'state': _content(state_tag, 0, 'state'),
        }

    def _table_data(self, soup):
        """
        This extracts the 'day', 'time' and 'info' columns of the meeting table.

        Values are lists of cell texts. A missing column, or a missing table,
        yields the placeholder string equal to the key. Rows whose cell count
        does not match the header count are ignored.
        """
        wanted = ('day', 'time', 'info')
        try:
            info_table = soup.find('table', class_='table fui-table')
            # obtain all the columns from <th>
            headers = [th.text.lower().strip() for th in info_table.find_all('th')]

            # a row is under the <tr> tags and data under the <td> tags;
            # the first <tr> is the header row
            columns = {key: [] for key in wanted if key in headers}
            for tr in info_table.find_all('tr')[1:]:
                cells = [td.text for td in tr.find_all('td')]
                if len(cells) != len(headers):
                    continue
                record = dict(zip(headers, cells))
                for key, values in columns.items():
                    values.append(record[key])

            return {key: columns.get(key, key) for key in wanted}

        except AttributeError as ae:
            # soup.find returned None: the page has no such table.
            print(f"info_table.find_all('tr')[1:] raised error: {ae}")
            return {'day': 'day', 'time': 'time', 'info': 'info'}

    def _row_parser(self, item0, item1):
        """
        :param item0: This is the address data in a dictionary. Use the following keys to access
        the data -> Keys: 'name' - 'address' - 'city' - 'state'
        :param item1: This is the meeting details data in a dictionary. Use the following keys to
        access the data -> Keys: 'day' - 'time' - 'info'

        Build the final dictionary that is stored as one row. List items are
        joined into one string with a | separating each item so the string
        can be split when retrieving the data. Every missing value is
        replaced by a placeholder equal to its key, so the row always has
        all seven keys.
        """
        def _value(source, key):
            try:
                value = source[key]
            except (KeyError, IndexError, TypeError):
                return key
            return key if value is None else value

        def _joined(source, key):
            value = _value(source, key)
            if isinstance(value, str):
                # Keep a placeholder string whole instead of joining its characters.
                return value
            try:
                return '|'.join(str(part) for part in value)
            except TypeError:
                return key

        row = {key: _value(item0, key) for key in ('name', 'address', 'city', 'state')}
        row.update((key, _joined(item1, key)) for key in ('day', 'time', 'info'))
        return row


In [None]:
class MeetingScraperInterface(Meeting):
    """Subclass of Meeting that can also be created without a link.

    NOTE(review): the intended role of this class is not visible in the
    notebook; the behaviour below only repairs the initializer.
    """

    def __init__(self, link=None):
        """
        :param link: optional URL. When given, the page is scraped by
            `Meeting.__init__`. When omitted, nothing is downloaded and only
            `rowId` is set, so the previous no-argument call still works.
        """
        if link is not None:
            # super() with no arguments runs Meeting.__init__;
            # super(Meeting, self) would skip it and call object.__init__.
            super().__init__(link)
        else:
            # Store the id on the instance rather than in a discarded local.
            self.rowId = Meeting.COUNT

In [395]:
# Run both extractors over every item of soup_data, then print the table data.
# d is rebound on each iteration, so only the result for the LAST item is
# printed; earlier results are discarded.
# NOTE(review): soup_data is not defined in any visible cell - presumably a
# list of parsed pages left over from an earlier kernel session; confirm.
for each in soup_data:
    address_data = get_address_data(each)
    detail_data = get_table_data(each)
    d = [address_data, detail_data]
print(d[1])

{'day': 0    Saturday
Name: day, dtype: object, 'time': 0    10:00 am - 11:00 am
Name: time, dtype: object, 'info': 0    Discussion, Open Meeting
Name: info, dtype: object}


In [None]:
def add_row_data_to_list(data: dict):
    """Split a dict into a list of single-entry dicts.

    :param data: dictionary to split.
    :return: list holding one `{key: value}` dict per item of `data`, in the
        dict's iteration order.
    """
    return [{key: value} for key, value in data.items()]
