In [74]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [39]:
# Page to scrape: Bulbapedia's list of Pokémon by base Egg cycles.
url = 'https://bulbapedia.bulbagarden.net/wiki/List_of_Pokémon_by_base_Egg_cycles'

# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

In [40]:
# Parse the downloaded markup into a searchable tree with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_content, features='html.parser')

In [41]:
# Absolute, position-based path from <html> down to the second <table> under the target <div>.
# NOTE(review): every step is positional, so a layout change on the page will make this stop matching.
css_selector = 'html > body > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(6) > div:nth-of-type(4) > div > table:nth-of-type(2)'

# select() returns a list of all matching elements; it is empty when nothing matches.
table = soup.select(selector=css_selector)

In [42]:
# `table` is the list returned by soup.select(); an empty list means the
# selector matched nothing. Stop with a clear error in that case, because every
# later cell needs `df` and would otherwise fail (or silently reuse a stale one).
if not table:
    raise ValueError("Table not found.")

# Serialise the first matched <table> back to HTML and let pandas parse it.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
table_string = str(table[0])
df = pd.read_html(StringIO(table_string))[0]
print(df)

           # Unnamed: 1_level_0       Pokémon         Egg Group 1 Egg Group 2  \
           # Unnamed: 1_level_1       Pokémon         Egg Group 1 Egg Group 2   
0        1.0                NaN     Bulbasaur             Monster       Grass   
1        2.0                NaN       Ivysaur             Monster       Grass   
2        3.0                NaN      Venusaur             Monster       Grass   
3        4.0                NaN    Charmander             Monster      Dragon   
4        5.0                NaN    Charmeleon             Monster      Dragon   
...      ...                ...           ...                 ...         ...   
1006  1007.0                NaN      Koraidon  No Eggs Discovered           —   
1007  1008.0                NaN      Miraidon  No Eggs Discovered           —   
1008  1009.0                NaN  Walking Wake  No Eggs Discovered           —   
1009  1010.0                NaN   Iron Leaves  No Eggs Discovered           —   
1010     NaN                

In [43]:
# The parsed table has a two-level header, so each column label is a tuple.
# Flatten every tuple into one space-separated string; plain labels pass through.
collapsed_headers = [
    ' '.join(header) if isinstance(header, tuple) else header
    for header in df.columns
]

# Install the flattened labels and print them as a quick sanity check.
df.columns = collapsed_headers
print(df.columns)

Index(['# #', 'Unnamed: 1_level_0 Unnamed: 1_level_1', 'Pokémon Pokémon',
       'Egg Group 1 Egg Group 1', 'Egg Group 2 Egg Group 2', 'Cycles Cycles',
       'Steps Gen II', 'Steps Gen III', 'Steps Gen IV', 'Steps Gen V-VI',
       'Steps Gen VII', 'Steps Gen VIII-IX'],
      dtype='object')


In [44]:
# Drop the placeholder column whose header pandas filled in as "Unnamed: ..."
# (it shows only NaN in the preview). Selecting it by name rather than by
# position keeps this cell safe to re-run: a second run finds nothing left to
# drop instead of removing the Pokémon name column.
unnamed_columns = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)

In [45]:
# Flattening the two header levels left labels such as "Pokémon Pokémon".
# dict.fromkeys keeps the first occurrence of each word in its original order,
# so repeated words collapse while distinct ones (e.g. "Steps Gen II") survive.
deduplicated_columns = [
    ' '.join(dict.fromkeys(col.split()))
    for col in df.columns
]

df.columns = deduplicated_columns

In [49]:
# The scraped table uses an em dash as its "no value" marker; store it as NaN.
df = df.replace(to_replace='—', value=np.nan)

In [51]:
# The scraped table ends with one extra row whose "#" is NaN (not a Pokémon).
# Filter on that condition instead of dropping "whatever row is last", so
# re-running this cell cannot delete a real Pokémon. The index is rebuilt so it
# stays a contiguous 0..n-1 range.
df = df.dropna(subset=['#']).reset_index(drop=True)

In [53]:
# Turn the "No Eggs Discovered" marker into NaN, then give the second column
# the ASCII-only name 'pokemon'. The column is looked up by position, and
# replace() leaves the labels unchanged, so the lookup sees the same label.
df = (
    df.replace(to_replace='No Eggs Discovered', value=np.nan)
      .rename(columns={df.columns[1]: 'pokemon'})
)

In [59]:
# Normalise every column label to lower case.
df.columns = df.columns.map(str.lower)

In [62]:
# Swap spaces for underscores so the labels read as snake_case identifiers
# (hyphens, as in "steps_gen_v-vi", are left as they are).
df.columns = df.columns.map(lambda label: label.replace(' ', '_'))

In [64]:
# Print a summary of the frame: number of entries plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   #                  1010 non-null   float64
 1   pokémon            1010 non-null   object 
 2   egg_group_1        869 non-null    object 
 3   egg_group_2        275 non-null    object 
 4   cycles             1010 non-null   float64
 5   steps_gen_ii       251 non-null    object 
 6   steps_gen_iii      386 non-null    object 
 7   steps_gen_iv       493 non-null    object 
 8   steps_gen_v-vi     721 non-null    object 
 9   steps_gen_vii      809 non-null    object 
 10  steps_gen_viii-ix  1010 non-null   float64
dtypes: float64(3), object(8)
memory usage: 86.9+ KB


In [69]:

# Final identifier names: the second column becomes 'pokemon' and '#' becomes 'pokemon_id'.
column_renames = {df.columns[1]: 'pokemon', '#': 'pokemon_id'}
df = df.rename(columns=column_renames)

In [72]:
# These three columns were parsed as float64 but have no missing values,
# so they can be stored as plain 64-bit integers.
integer_columns = ['pokemon_id', 'cycles', 'steps_gen_viii-ix']
dtype_dict = dict.fromkeys(integer_columns, 'int64')

df = df.astype(dtype_dict)

In [75]:
# Save the cleaned table to ../data (relative to the working directory).
# The folder is created first so a fresh checkout without it does not fail on
# save; exist_ok=True makes that a no-op when the folder is already there.
output_path = os.path.join('..', 'data', 'pokemon_egg_group.csv')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

In [76]:
# Preview the first five rows of the finished table (head() defaults to 5).
df.head()

Unnamed: 0,pokemon_id,pokemon,egg_group_1,egg_group_2,cycles,steps_gen_ii,steps_gen_iii,steps_gen_iv,steps_gen_v-vi,steps_gen_vii,steps_gen_viii-ix
0,1,Bulbasaur,Monster,Grass,20,5120,5376,5355,5140,5120,2560
1,2,Ivysaur,Monster,Grass,20,5120,5376,5355,5140,5120,2560
2,3,Venusaur,Monster,Grass,20,5120,5376,5355,5140,5120,2560
3,4,Charmander,Monster,Dragon,20,5120,5376,5355,5140,5120,2560
4,5,Charmeleon,Monster,Dragon,20,5120,5376,5355,5140,5120,2560
