This is used to clear unwanted code output

In [1]:
from IPython.display import clear_output

Importing all the packages used for the notebook

In [2]:
import ast
import csv
import json
import multiprocessing
import os

import pandas
import scrapy
from scrapy.crawler import CrawlerProcess

Creating folders needed for organization

In [3]:
# Create the working folders used by the later cells. exist_ok=True removes
# the check-then-create race of the original and keeps the cell re-runnable.
for folder in ('csv/', 'json_lines/'):
    os.makedirs(folder, exist_ok=True)

First class defining a web spider and its behavior

In [4]:
class BONAPSpider(scrapy.Spider):
    """Spider for a family listing page.

    Walks the title elements of the page and yields one item per family,
    pairing each family title with the links found in the table that sits at
    the same position among the elements matched by ``TABLE_SELECTOR``.
    """
    name = 'bonap_spider'

    def parse(self, response):
        """Yield one item per family title found in ``response``.

        Titles listed in ``ACCEPTED_TITLES`` are section headings: they set
        the tag attached to the items that follow and produce no item
        themselves. Every other title is treated as a family name and is
        paired, by running index, with the n-th matched table.

        Yields:
            dict: ``ID`` (running counter starting at 0), ``TITLE TAG`` (the
            most recent section heading with ``\\r\\n`` removed, or ``None``
            when no heading has been seen yet), ``FAMILY`` (the raw title
            text) and ``URI ARRAY`` (hrefs of the links inside the table).

        Raises:
            IndexError: If there are more family titles than matched tables.
        """
        ID_COUNT = 0
        # None until the first accepted heading is seen; without this default
        # a family title appearing before any heading raised UnboundLocalError.
        TITLE_TAG = None
        TITLE_SELECTOR = '.page .h11 ::text'
        TABLE_SELECTOR = '.page .tl'
        HREF_SELECTOR = 'a ::attr(href)'
        A_SELECTOR = './/tr/td/a'
        ACCEPTED_TITLES = [
            '\r\n(US County-Level Species Maps: List by Traditional Family)\r\n',
            '\r\n(US County-Level Species Maps: List by Modern Family)\r\n',
            '\r\n(State-Level Species Maps: List by Traditional Family)\r\n',
            '\r\n(State-Level Species Maps: List by Modern Family)\r\n']

        # Extracted once up front: the original re-ran this selector and
        # re-serialised every table for each family title.
        table_markup = response.css(TABLE_SELECTOR).extract()

        for title in response.css(TITLE_SELECTOR):
            title_text = title.extract()

            if title_text in ACCEPTED_TITLES:
                TITLE_TAG = title_text.replace('\r\n', '')
                continue

            # Re-parse the table's markup on its own so the link query only
            # sees anchors that belong to this table.
            table = scrapy.Selector(text=table_markup[ID_COUNT])

            yield {
                'ID': ID_COUNT,
                'TITLE TAG': TITLE_TAG,
                'FAMILY': title_text,
                'URI ARRAY': table.xpath(A_SELECTOR).css(HREF_SELECTOR).extract()
            }
            ID_COUNT += 1

Second class defining a web spider and its behavior

In [5]:
class BONAPMapSpider(scrapy.Spider):
    """Spider that emits one item per (text, link) pair found under ``._tfixed``."""
    name = 'bonap_map_spider'

    def parse(self, response):
        """Pair the text nodes and hrefs of the page by position.

        The two selector results are matched index by index; if they differ
        in length, the surplus entries of the longer one are ignored.

        Yields:
            dict: ``PLANT NAME`` (text node), ``ROOT URL`` (URL of the page
            being parsed) and ``URI`` (href value).
        """
        plant_names = response.css('._tfixed div ::text').extract()
        map_links = response.css('._tfixed a ::attr(href)').extract()

        # Stop at the shorter of the two lists, exactly like zip() would.
        pair_count = min(len(plant_names), len(map_links))
        for position in range(pair_count):
            item = {
                'PLANT NAME': plant_names[position],
                'ROOT URL': response.url,
                'URI': map_links[position]
            }
            yield item

Function for defining a crawler process for multiprocessing

In [6]:
def spider_process(FEED_URI, spider, urls):
    """Run one Scrapy crawl to completion and export its items as JSON lines.

    Args:
        FEED_URI: Destination of the exported feed (handed to Scrapy's
            ``FEED_URI`` setting). If it names an existing local file, that
            file is removed first so the export starts empty.
        spider: Name of the spider to run, ``'BONAPSpider'`` or
            ``'BONAPMapSpider'``.
        urls: Start URLs passed to the spider as ``start_urls``.

    Raises:
        ValueError: If ``spider`` is not one of the two known names.
    """
    # Validate before doing anything else: the original silently started a
    # crawler process with nothing scheduled when the name was misspelled.
    if spider == 'BONAPSpider':
        spider_class = BONAPSpider
    elif spider == 'BONAPMapSpider':
        spider_class = BONAPMapSpider
    else:
        raise ValueError('Unknown spider: {0!r}'.format(spider))

    # Scrapy's local file feed storage appends to an existing file, so
    # re-running the notebook used to duplicate every exported row.
    if os.path.isfile(FEED_URI):
        os.remove(FEED_URI)

    # NOTE(review): newer Scrapy releases prefer the FEEDS setting over
    # FEED_URI / FEED_FORMAT - confirm against the installed version before
    # migrating.
    process = CrawlerProcess(settings={
        'LOG_FILE': 'scrapy.log',
        'FEED_URI': FEED_URI,
        'FEED_FORMAT': 'jsonlines',
    })
    process.crawl(spider_class, start_urls=urls)
    # Blocks until the crawl has finished.
    process.start()

Function for prepending site urls to uris

In [7]:
def prepend(list, str):
    """Return a new list in which ``str`` is placed in front of every item.

    The prefix is inserted as a plain value rather than used as a format
    template, so a prefix containing ``{`` or ``}`` is kept literally instead
    of raising or being misinterpreted.

    Args:
        list: Iterable of items; each one is rendered with ``format``.
        str: Prefix to put in front of every item.

    Returns:
        A new list of strings; the input iterable is left untouched.
    """
    # The parameter names shadow the built-ins but are kept unchanged so that
    # existing positional and keyword callers keep working.
    return ['{0}{1}'.format(str, item) for item in list]

First chunk of optimised web crawling using multiprocessing

In [8]:
# One crawl per family listing page: (output feed, start URLs).
_family_crawls = (
    ('json_lines/county_traditional_family_maps.jsonl',
     ['http://bonap.net/NAPA/Family/Traditional/County']),
    ('json_lines/county_modern_family_maps.jsonl',
     ['http://bonap.net/NAPA/Family/Modern/County']),
    ('json_lines/state_traditional_family_maps.jsonl',
     ['http://bonap.net/NAPA/Family/Traditional/State']),
    ('json_lines/state_modern_family_maps.jsonl',
     ['http://bonap.net/NAPA/Family/Modern/State']),
)

process_1, process_2, process_3, process_4 = [
    multiprocessing.Process(
        target=spider_process, args=(feed_path, 'BONAPSpider', start_urls))
    for feed_path, start_urls in _family_crawls
]
_workers = (process_1, process_2, process_3, process_4)

# Launch all four crawls first, then wait for each of them in turn.
for _worker in _workers:
    _worker.start()
for _worker in _workers:
    _worker.join()

Creating start urls for the next web crawl process

In [9]:
def _start_urls_from_feed(feed_path, site_root='http://www.bonap.net'):
    """Collect every 'URI ARRAY' entry of a JSON-lines feed as absolute URLs.

    Args:
        feed_path: Path of the JSON-lines file to read.
        site_root: Prefix put in front of every collected URI.

    Returns:
        A flat list with the prefixed URIs of all records, in file order.
    """
    uris = []
    with open(feed_path, 'r', encoding='utf-8') as feed:
        for line in feed:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing inside json.loads.
            if not line.strip():
                continue
            uris += json.loads(line)['URI ARRAY']
    return prepend(uris, site_root)


# Start URLs for the second crawl, one list per feed written by the first.
process_1_urls = _start_urls_from_feed(
    'json_lines/county_traditional_family_maps.jsonl')
process_2_urls = _start_urls_from_feed(
    'json_lines/county_modern_family_maps.jsonl')
process_3_urls = _start_urls_from_feed(
    'json_lines/state_traditional_family_maps.jsonl')
process_4_urls = _start_urls_from_feed(
    'json_lines/state_modern_family_maps.jsonl')

Second chunk of optimised web crawling using multiprocessing

In [10]:
# One crawl per URL list built in the previous cell: (output feed, start URLs).
_image_crawls = (
    ('json_lines/county_traditional_family_images.jsonl', process_1_urls),
    ('json_lines/county_modern_family_images.jsonl', process_2_urls),
    ('json_lines/state_traditional_family_images.jsonl', process_3_urls),
    ('json_lines/state_modern_family_images.jsonl', process_4_urls),
)

process_1, process_2, process_3, process_4 = [
    multiprocessing.Process(
        target=spider_process, args=(feed_path, 'BONAPMapSpider', start_urls))
    for feed_path, start_urls in _image_crawls
]
_workers = (process_1, process_2, process_3, process_4)

# Launch all four crawls first, then wait for each of them in turn.
for _worker in _workers:
    _worker.start()
for _worker in _workers:
    _worker.join()

Creating CSV files for pandas (county-level data)

In [11]:
# Flatten the county-level JSON-lines feeds into CSV files.
# Each entry below is (file stem shared by the .jsonl input and the .csv
# output, header row written at the top of the CSV).
_image_header = ['PLANT NAME', 'ROOT URL', 'URI']
# NOTE(review): the spider emits the key 'TITLE TAG' while this header says
# 'TAG TITLE'. The label is left unchanged here because rows are written by
# position, not by key - confirm which spelling consumers expect.
_map_header = ['ID', 'TAG TITLE', 'FAMILY', 'URI ARRAY']

for _stem, _header in (
        ('county_traditional_family_images', _image_header),
        ('county_modern_family_images', _image_header),
        ('county_traditional_family_maps', _map_header),
        ('county_modern_family_maps', _map_header)):
    # newline='' is what the csv module expects for its output file (without
    # it, extra blank rows appear on Windows). utf-8 is set explicitly so the
    # file matches pandas.read_csv's default encoding on every platform.
    with open('json_lines/{0}.jsonl'.format(_stem), 'r', encoding='utf-8') as open_file, \
            open('csv/{0}.csv'.format(_stem), 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(_header)
        for line in open_file:
            data = json.loads(line)
            # Values are written in the record's own key order, which has to
            # line up with the header above.
            writer.writerow(list(data.values()))

Creating CSV files for pandas (state-level data)

In [12]:
# Flatten the state-level JSON-lines feeds into CSV files.
# Each entry below is (file stem shared by the .jsonl input and the .csv
# output, header row written at the top of the CSV).
_image_header = ['PLANT NAME', 'ROOT URL', 'URI']
# NOTE(review): the spider emits the key 'TITLE TAG' while this header says
# 'TAG TITLE'. The label is left unchanged here because rows are written by
# position, not by key - confirm which spelling consumers expect.
_map_header = ['ID', 'TAG TITLE', 'FAMILY', 'URI ARRAY']

for _stem, _header in (
        ('state_traditional_family_images', _image_header),
        ('state_modern_family_images', _image_header),
        ('state_traditional_family_maps', _map_header),
        ('state_modern_family_maps', _map_header)):
    # newline='' is what the csv module expects for its output file (without
    # it, extra blank rows appear on Windows). utf-8 is set explicitly so the
    # file matches pandas.read_csv's default encoding on every platform.
    with open('json_lines/{0}.jsonl'.format(_stem), 'r', encoding='utf-8') as open_file, \
            open('csv/{0}.csv'.format(_stem), 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(_header)
        for line in open_file:
            data = json.loads(line)
            # Values are written in the record's own key order, which has to
            # line up with the header above.
            writer.writerow(list(data.values()))

reading csv data in pandas

In [13]:
# Load the eight intermediate CSV files: the four county tables first, then
# the four state tables, each group ordered images, images, maps, maps.
(county_traditional_family_images,
 county_modern_family_images,
 county_traditional_family_maps,
 county_modern_family_maps,
 state_traditional_family_images,
 state_modern_family_images,
 state_traditional_family_maps,
 state_modern_family_maps) = [
    pandas.read_csv(csv_path)
    for csv_path in (
        'csv/county_traditional_family_images.csv',
        'csv/county_modern_family_images.csv',
        'csv/county_traditional_family_maps.csv',
        'csv/county_modern_family_maps.csv',
        'csv/state_traditional_family_images.csv',
        'csv/state_modern_family_images.csv',
        'csv/state_traditional_family_maps.csv',
        'csv/state_modern_family_maps.csv',
    )
]

printing resulting dataframe

In [14]:
# Print the first rows of every loaded table, in load order.
_loaded_tables = (
    county_traditional_family_images,
    county_modern_family_images,
    county_traditional_family_maps,
    county_modern_family_maps,
    state_traditional_family_images,
    state_modern_family_images,
    state_traditional_family_maps,
    state_modern_family_maps,
)
for _table in _loaded_tables:
    print(_table.head())

            PLANT NAME                                           ROOT URL  \
0            Asystasia  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
1  Asystasia gangetica  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
2             Acanthus  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
3      Acanthus mollis  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
4          Dyschoriste  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   

                                          URI  
0      /MapGallery/County/Genus/Asystasia.png  
1  /MapGallery/County/Asystasia gangetica.png  
2       /MapGallery/County/Genus/Acanthus.png  
3      /MapGallery/County/Acanthus mollis.png  
4    /MapGallery/County/Genus/Dyschoriste.png  
            PLANT NAME                                           ROOT URL  \
0            Avicennia  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
1  Avicennia germinans  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
2     Avicennia ma

Molding data: building a 'URL ARRAY' column of absolute URLs from each 'URI ARRAY'

In [15]:
def _absolute_urls(uri_array_text):
    """Parse one stored 'URI ARRAY' cell and prefix every URI with the site root.

    Args:
        uri_array_text: Text of a list of URI strings as read back from the
            CSV file.

    Returns:
        A list of absolute URL strings.
    """
    # literal_eval parses the list text directly. The previous approach
    # swapped single for double quotes before json.loads, which corrupts any
    # URI that itself contains a quote character.
    return prepend(ast.literal_eval(uri_array_text), 'http://www.bonap.net')


for _maps in (county_traditional_family_maps,
              county_modern_family_maps,
              state_traditional_family_maps,
              state_modern_family_maps):
    # Assign the whole column in one step. Writing element by element through
    # frame['URL ARRAY'][index] is chained assignment: it triggers
    # SettingWithCopyWarning and is not guaranteed to reach the frame.
    _maps['URL ARRAY'] = _maps['URI ARRAY'].map(_absolute_urls)

# Kept from the original cell: clears whatever this cell has displayed.
clear_output()

Function to help mold data: looks up the family name for a row's root URL

In [16]:
def family(index, column, maps):
    """Return the family whose URL entry contains the value at ``column[index]``.

    Args:
        index: Key used to pick the value to look up from ``column``.
        column: Indexable collection of root URLs.
        maps: Mapping or frame with parallel ``'URL ARRAY'`` and ``'FAMILY'``
            entries.

    Returns:
        The ``'FAMILY'`` value of the first row for which
        ``column[index] in <URL ARRAY entry>`` holds, or ``''`` when no row
        matches.
    """
    # Looked up once; the value does not change while scanning the rows.
    root_url = column[index]

    for url, family_name in zip(maps['URL ARRAY'], maps['FAMILY']):
        if root_url in url:
            return family_name

    # Give up only after every row has been checked. The original returned
    # '' from inside the loop, so only the first row was ever examined.
    return ''

Molding more data: adding a FAMILY column to each images table

In [17]:
# Give every images table a FAMILY column by looking up each row's root URL
# in the maps table that belongs to it.
_table_pairs = (
    (county_traditional_family_images, county_traditional_family_maps),
    (county_modern_family_images, county_modern_family_maps),
    (state_traditional_family_images, state_traditional_family_maps),
    (state_modern_family_images, state_modern_family_maps),
)

for _images, _maps in _table_pairs:
    _root_urls = _images['ROOT URL']
    family_list = [
        family(_row, _root_urls, _maps) for _row in range(_root_urls.size)
    ]
    _images['FAMILY'] = family_list

printing results

In [18]:
# Print the first rows of every table again, now that they have been molded.
_molded_tables = (
    county_traditional_family_images,
    county_modern_family_images,
    county_traditional_family_maps,
    county_modern_family_maps,
    state_traditional_family_images,
    state_modern_family_images,
    state_traditional_family_maps,
    state_modern_family_maps,
)
for _table in _molded_tables:
    print(_table.head())

            PLANT NAME                                           ROOT URL  \
0            Asystasia  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
1  Asystasia gangetica  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
2             Acanthus  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
3      Acanthus mollis  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
4          Dyschoriste  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   

                                          URI       FAMILY  
0      /MapGallery/County/Genus/Asystasia.png  ACANTHACEAE  
1  /MapGallery/County/Asystasia gangetica.png  ACANTHACEAE  
2       /MapGallery/County/Genus/Acanthus.png  ACANTHACEAE  
3      /MapGallery/County/Acanthus mollis.png  ACANTHACEAE  
4    /MapGallery/County/Genus/Dyschoriste.png  ACANTHACEAE  
            PLANT NAME                                           ROOT URL  \
0            Avicennia  http://www.bonap.net/Napa/TaxonMaps/Genus/Coun...   
1  Avicennia germ

Creating the final result CSV files

In [19]:
# Write each images table to its final CSV file.
_final_outputs = (
    (county_traditional_family_images, 'csv/bonap_county_traditional_family.csv'),
    (county_modern_family_images, 'csv/bonap_county_modern_family.csv'),
    (state_traditional_family_images, 'csv/bonap_state_traditional_family.csv'),
    (state_modern_family_images, 'csv/bonap_state_modern_family.csv'),
)
for _frame, _target in _final_outputs:
    _frame.to_csv(_target, sep=',', encoding='utf-8')