# Data from Bulbapedia

[Bulbapedia](https://bulbapedia.bulbagarden.net/wiki/Main_Page) is an online Pokémon encyclopedia containing information about the show and games.
We will scrape it for Pokémon image and icon URLs.

### Required dependencies
Third-party: bs4, pandas, requests, lxml. Standard library: re, json, time, os, collections.

### Fetch the list of pokemon to get their individual page urls

In [1]:
from bs4 import BeautifulSoup as bs
import requests

BASE_URL = 'https://bulbapedia.bulbagarden.net'

# "List of Pokémon by National Pokédex number" page (URL-encoded title).
pokemon_list_url = f'{BASE_URL}/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as if it were the list.
response = requests.get(pokemon_list_url, timeout=30)
response.raise_for_status()
pokemon_list_data = response.text

# Parse the page with the lxml backend; later cells query this tree.
soup = bs(pokemon_list_data, features='lxml')

### Filter the page for the "Generation X" tables
DOM structure:
```
  h3
    span
        "Generation X"
  table
```

In [2]:
import re

def generational_span_filter(header):
    """Tell whether a heading introduces a "Generation <roman numeral>" table.

    `header` is a parsed element; its first <span> descendant is looked up and
    the span's text is matched against the generation pattern.

    Returns True on a match, False when there is no span or no match.
    """
    first_span = header.find('span')
    if first_span is None:
        # Headings without a span cannot be generation headings.
        return False
    return bool(re.match('^Generation [IXVML]+$', first_span.text))

# Keep only the <h3> headings whose <span> names a generation.
generation_headers = filter(generational_span_filter, soup.find_all('h3'))

# Pair each generation heading with the first <table> that follows it.
# find_next is the current method name; findNext is a deprecated alias of it.
html_tables = [(header, header.find_next('table')) for header in generation_headers]

### Convert to pandas data frame for ease of manipulation

In [14]:
import pandas as pd

from io import StringIO

# One frame per generation table; they are concatenated once after the loop.
generation_frames = []

for header, table in html_tables:
    # Wrap the markup in a file-like object: newer pandas releases deprecate
    # passing literal HTML strings to read_html.
    table_data = pd.read_html(StringIO(str(table)))[0]
    # pandas is unable to parse a > img > src, so collect the image src and the
    # individual page url from the raw rows (the first <tr>, presumably the
    # header row, is skipped).
    row_links = [
        (row.find('img')['src'], BASE_URL + row.find('a')['href'])
        for row in table.find_all('tr')[1:]
    ]
    # Retroactively put both values into the frame, aligned by row position.
    table_data['MS'] = [f'https:{row_links[i][0]}' for i in table_data.index]
    table_data['url'] = [row_links[i][1] for i in table_data.index]
    table_data['Ndex'] = [ndex[1:] for ndex in table_data['Ndex']]  # strip leading hash
    generation_frames.append(table_data)

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Row labels are kept as-is, matching what append produced. pd.concat rejects
# an empty list, so fall back to an empty frame when no tables were found.
df = pd.concat(generation_frames) if generation_frames else pd.DataFrame()

### Convert to JSON per agreed data shape spec

In [15]:
from collections import defaultdict
import json

# Maps an Ndex to {'sprites': <set of image urls>}; an entry is created the
# first time its Ndex is looked up.
d = defaultdict(lambda: { 'sprites': set() })

# Seed each entry with the 'MS' image url scraped from the list tables.
for _, record in df.iterrows():
    d[record['Ndex']]['sprites'].add(record['MS'])

### Fetch page level contents

In [16]:
import time

# Visit each Pokémon's category page on the Bulbagarden archives and add every
# image src found there to that Pokémon's sprite set.
for index, row in df[['Ndex', 'Pokémon']].iterrows():
    ndex = row['Ndex']
    name = row['Pokémon']
    try:
        # Bounded wait so a single stalled request cannot hang the whole crawl.
        data = requests.get(f'https://archives.bulbagarden.net/wiki/Category:{name}', timeout=30)
        # Report HTTP errors instead of silently parsing an error page.
        data.raise_for_status()
        # Use a separate name so the list-page `soup` built earlier is not
        # overwritten; re-running the table-filtering cell keeps working.
        category_soup = bs(data.text, features='lxml')
        anchors = category_soup.find_all('a', { 'class': 'image' })
        for a in anchors:
            img = a.find('img')
            # Skip anchors without a usable image rather than abandoning the page.
            if img is not None and img.get('src'):
                d[ndex]['sprites'].add(img['src'])
    except Exception as e:
        # Deliberately broad: the crawl is best-effort and one bad page must
        # not stop it. Python 3 exceptions have no `.message` attribute, so
        # format the exception object itself.
        print(f'error getting icons for {name}: {e}')
    finally:
        # try not to dos attack bulbapedia -- pause after failed requests too
        time.sleep(1)

### Export JSON

In [17]:
import os
def _sets_to_sorted_lists(o):
    """Fallback for json.dump: serialise a set as a sorted list.

    Sorting makes the exported file identical between runs; iterating a set of
    strings directly yields an order that can change from one run to the next.

    Raises TypeError for any other type, which is how a `default` hook signals
    that it cannot serialise a value.
    """
    if isinstance(o, set):
        return sorted(o)
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')


# Output location: "<prefix>data/output.json", prefix taken from the
# 'poke-prefix' environment variable and defaulting to the current directory.
# NOTE(review): a hyphenated variable name cannot be set with a plain shell
# `export`; confirm how this variable is expected to be supplied.
prefix = os.environ.get('poke-prefix', './')
with open(f'{prefix}data/output.json', 'w') as f:
    # dict(d) turns the defaultdict into a plain dict for serialisation.
    json.dump(dict(d), f, default=_sets_to_sorted_lists, indent=4)