# Getting data directly from a website
This notebook walks you through the steps for collecting data from [Bulbapedia's National Pokedex](https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number) using `requests` and `BeautifulSoup`.

### Import `requests` library
This package lets you download a web page's HTML so that you can extract data from it. Let's save the website's URL in the `URL` variable.

In [2]:
import requests
import pandas as pd
# Address of the Bulbapedia page that lists every Pokémon by National Pokédex number
URL = "https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"

### Load the page

In [3]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of letting an error page be parsed as if it were the Pokédex list.
page = requests.get(URL, timeout=30)
page.raise_for_status()

### Parse HTML data

In [4]:
from bs4 import BeautifulSoup

# Build the parse tree from the response body using the 'html.parser' backend
soup = BeautifulSoup(markup=page.content, features='html.parser')

### Find all tables that contain Pokemon details

In [5]:
# Main content <div>: the element whose id is 'mw-content-text'
poke_content = soup.find(attrs={'id': 'mw-content-text'})
# Collect every <table> found inside that element, then show how many there are
poke_tables = poke_content.find_all(name='table')
len(poke_tables)

13

### Get the list of first-generation Pokémon

In [6]:
# Position in poke_tables of the table to scrape. Index 1 is used for the
# first-generation list (presumably index 0 is a non-generation table — TODO confirm).
select_generation = 1
gen1_list = poke_tables[select_generation]

In [7]:
# Inspect the table's children to find where the Pokémon rows live.
# contents[1] is the <tbody>: it opens with the header row (Kdex, Ndex, MS,
# Pokémon, Type) and the first Pokémon entry (#001) comes right after it.
gen1_list.contents[1]

<tbody><tr>
<th style="border-top-left-radius: 5px; -moz-border-radius-topleft: 5px; -webkit-border-top-left-radius: 5px; -khtml-border-top-left-radius: 5px; -icab-border-top-left-radius: 5px; -o-border-top-left-radius: 5px; background: #64D364"><a href="/wiki/List_of_Pok%C3%A9mon_by_Kanto_Pok%C3%A9dex_number" title="List of Pokémon by Kanto Pokédex number"><span style="color:#000;">Kdex</span></a>
</th>
<th style="background: #64D364">Ndex
</th>
<th style="background: #64D364">MS
</th>
<th style="background: #64D364">Pokémon
</th>
<th colspan="2" style="border-top-right-radius: 5px; -moz-border-radius-topright: 5px; -webkit-border-top-right-radius: 5px; -khtml-border-top-right-radius: 5px; -icab-border-top-right-radius: 5px; -o-border-top-right-radius: 5px; background: #64D364">Type
</th></tr>
<tr style="background:#FFF">
<td style="font-family:monospace">#001
</td>
<td style="font-family:monospace">#001
</td>
<th><a href="/wiki/Bulbasaur_(Pok%C3%A9mon)" title="Bulbasaur"><img alt="Bu

In [8]:
# Print every child of the <tbody>, each followed by a separator line,
# to see how the rows are laid out
separator = '##################################'
for child in gen1_list.contents[1]:
    print(child, separator, sep='\n')

<tr>
<th style="border-top-left-radius: 5px; -moz-border-radius-topleft: 5px; -webkit-border-top-left-radius: 5px; -khtml-border-top-left-radius: 5px; -icab-border-top-left-radius: 5px; -o-border-top-left-radius: 5px; background: #64D364"><a href="/wiki/List_of_Pok%C3%A9mon_by_Kanto_Pok%C3%A9dex_number" title="List of Pokémon by Kanto Pokédex number"><span style="color:#000;">Kdex</span></a>
</th>
<th style="background: #64D364">Ndex
</th>
<th style="background: #64D364">MS
</th>
<th style="background: #64D364">Pokémon
</th>
<th colspan="2" style="border-top-right-radius: 5px; -moz-border-radius-topright: 5px; -webkit-border-top-right-radius: 5px; -khtml-border-top-right-radius: 5px; -icab-border-top-right-radius: 5px; -o-border-top-right-radius: 5px; background: #64D364">Type
</th></tr>
##################################


##################################
<tr style="background:#FFF">
<td style="font-family:monospace">#001
</td>
<td style="font-family:monospace">#001
</td>
<th><a hre

In [9]:
# Keep a handle on the <tbody> element inspected above.
# NOTE(review): the name `poke_info` is rebound to a DataFrame by the Strategy 1
# cell below, which also re-reads gen1_list.contents itself.
poke_info = gen1_list.contents[1]

# Strategy 1 - Tuple loop + DataFrame conversion + saving to JSON

In [13]:
extracted_poke_info = []
info_start = 1
# <tbody> of the selected table: header row first, then the Pokémon rows
info_row = gen1_list.contents[info_start]

# Pokémon rows are stored at the even, non-zero positions of the <tbody>
# children (position 0 is the header row), hence the [2::2] slice.
for pokemon_info_values in info_row.contents[2::2]:
    # Whitespace-separated tokens of the row's visible text
    pokemon_raw_info = pokemon_info_values.text.strip().split()

    if len(pokemon_raw_info) == 5:
        # Kdex, Ndex, name and two types
        kan_dex, nat_dex, name, type_1, type_2 = pokemon_raw_info
        cat = "Regular"

    elif len(pokemon_raw_info) == 4:
        kan_dex = pokemon_raw_info[0]

        if pokemon_raw_info[1].replace("#", "").isdigit():
            # Second token is a dex number: Kdex, Ndex, name and a single type
            nat_dex, name, type_1 = pokemon_raw_info[1:]
            type_2 = None
            cat = "Regular"
        else:
            # No Ndex token: Kdex, name and two types
            nat_dex = None
            name, type_1, type_2 = pokemon_raw_info[1:]
            cat = "Variant"

    else:
        # Unrecognised layout (any token count other than 4 or 5). Skip the row:
        # appending here would silently repeat the previous row's values, or
        # raise NameError if this happened on the very first row.
        print('Check out elements containing ' + str(len(pokemon_raw_info)) + ' elements')
        continue

    extracted_poke_info.append((kan_dex, nat_dex, name, type_1, type_2, cat))

labels = ['kan_dex', 'nat_dex', 'name', 'type_1', 'type_2', 'cat']
poke_info = pd.DataFrame(extracted_poke_info, columns=labels)
poke_info.head(30)

Unnamed: 0,kan_dex,nat_dex,name,type_1,type_2,cat
0,#001,#001,Bulbasaur,Grass,Poison,Regular
1,#002,#002,Ivysaur,Grass,Poison,Regular
2,#003,#003,Venusaur,Grass,Poison,Regular
3,#004,#004,Charmander,Fire,,Regular
4,#005,#005,Charmeleon,Fire,,Regular
5,#006,#006,Charizard,Fire,Flying,Regular
6,#007,#007,Squirtle,Water,,Regular
7,#008,#008,Wartortle,Water,,Regular
8,#009,#009,Blastoise,Water,,Regular
9,#010,#010,Caterpie,Bug,,Regular


In [None]:
extracted_poke_info = []
info_start = 1
# <tbody> of the selected table: header row first, then the Pokémon rows
info_row = gen1_list.contents[info_start]

# Number of '\n'-separated text fields in a row -> where each value sits:
# (Ndex position, name position, type-1 position, type-2 position or None, category).
# The Kdex value is always field 0.
row_layouts = {
    7: (1, 4, 6, None, 'Other Form Single Type'),   # other regional form, 1 type
    8: (1, 4, 6, 7, 'Other Form Multi Type'),       # other regional form, 2 types
    9: (2, 6, 8, None, 'Orig Form Single Type'),    # original entry, 1 type
    10: (2, 6, 8, 9, 'Orig Form Multi Type'),       # original entry, 2 types
}

# Pokémon rows are stored at the even, non-zero positions of the <tbody>
# children (position 0 is the header row), hence the [2::2] slice.
for pokemon_info_values in info_row.contents[2::2]:
    pokemon_raw_info = pokemon_info_values.text.strip().split('\n')

    layout = row_layouts.get(len(pokemon_raw_info))
    if layout is None:
        print('Check out elements containing ' + str(len(pokemon_raw_info)) + ' elements')
        # Skip the row: appending here would silently repeat the previous row's
        # values, or raise NameError if this happened on the very first row.
        continue

    ndex_pos, name_pos, type1_pos, type2_pos, categ = layout
    kdex = pokemon_raw_info[0]
    ndex = pokemon_raw_info[ndex_pos]
    poke_name = pokemon_raw_info[name_pos]
    type1 = pokemon_raw_info[type1_pos]
    # Single-type rows have no second type field; store an empty string
    type2 = pokemon_raw_info[type2_pos] if type2_pos is not None else ''

    # Saving as a tuple
    extracted_poke_info.append((kdex, ndex, poke_name, type1, type2, categ))

In [None]:
# Name the columns at construction time so the frame has its six columns even
# when no rows were extracted; assigning `.columns` afterwards to an empty,
# column-less frame raises a length-mismatch ValueError.
df_pokemon_list = pd.DataFrame(
    extracted_poke_info,
    columns=['Kdex', 'Ndex', 'Pokemon', 'Type 1', 'Type 2', 'Category'],
)

In [None]:
# Label the columns in the order the tuples were built:
# (kdex, ndex, poke_name, type1, type2, categ)
df_pokemon_list.columns = ['Kdex', 'Ndex', 'Pokemon', 'Type 1', 'Type 2', 'Category']

In [None]:
# Write the table to disk as JSON keyed by row label: {index: {column: value}}
df_pokemon_list.to_json(path_or_buf='Extracted Pokemon List Json.json', orient='index')

# Strategy 2 - Building JSON-style records inside the loop

In [None]:
gen1_json = []

info_start = 1
# <tbody> of the selected table: header row first, then the Pokémon rows
info_row = gen1_list.contents[info_start]

# Number of '\n'-separated text fields in a row -> where each value sits:
# (Ndex position, name position, type-1 position, type-2 position or None, category).
# The Kdex value is always field 0.
row_layouts = {
    7: (1, 4, 6, None, 'Other Form Single Type'),   # other regional form, 1 type
    8: (1, 4, 6, 7, 'Other Form Multi Type'),       # other regional form, 2 types
    9: (2, 6, 8, None, 'Orig Form Single Type'),    # original entry, 1 type
    10: (2, 6, 8, 9, 'Orig Form Multi Type'),       # original entry, 2 types
}

# Pokémon rows are stored at the even, non-zero positions of the <tbody>
# children (position 0 is the header row), hence the [2::2] slice.
for pokemon_info_values in info_row.contents[2::2]:
    pokemon_raw_info = pokemon_info_values.text.strip().split('\n')

    layout = row_layouts.get(len(pokemon_raw_info))
    if layout is None:
        print('Check out elements containing ' + str(len(pokemon_raw_info)) + ' elements')
        # Skip the row: appending here would silently repeat the previous row's
        # values, or raise NameError if this happened on the very first row.
        continue

    ndex_pos, name_pos, type1_pos, type2_pos, categ = layout
    kdex = pokemon_raw_info[0]
    ndex = pokemon_raw_info[ndex_pos]
    poke_name = pokemon_raw_info[name_pos]
    type1 = pokemon_raw_info[type1_pos]
    # Single-type rows have no second type field; store an empty string
    type2 = pokemon_raw_info[type2_pos] if type2_pos is not None else ''

    # Saving as a dict: one JSON-ready record per row
    gen1_json.append({"kdex" : kdex,
                      "ndex" : ndex,
                      "poke_name" : poke_name,
                      "type1" : type1,
                      "type2" : type2,
                      "category" : categ})
        

In [None]:
# Display the list of per-row dicts collected by the loop above
gen1_json