In [1]:
import os
import pickle
import time
from collections import defaultdict
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [8]:
# Scraper settings, followed by the accumulator the scraping loop fills in.

# True  -> write every sprite to imgs/<name>_<number>.png;
# False -> copy every sprite into the in-memory image array instead.
save_img = False

# Number of Pokedex pages to walk; also the first dimension of the image array.
n_pokemon = 890

# "<name>_<number>" -> description text; unknown keys default to "".
pokemon_description = defaultdict(str)

In [9]:
# Walk the Pokedex starting at Bulbasaur, following each page's "next" link.
# For every page we collect the sprite (saved to disk or stored in
# `pokemon_array`, depending on `save_img`) and the active description text
# (stored in `pokemon_description`). The image array is pickled at the end.
pokemon_url = "https://www.pokemon.com/us/pokedex/bulbasaur"
pokemon_domain = "https://www.pokemon.com"


def _require_tag(tag, what, url):
    """Return ``tag`` unchanged, or raise if the lookup found nothing.

    ``BeautifulSoup.find`` signals "no match" by returning ``None`` rather
    than raising, so a try/except around ``find`` never fires. Checking the
    result here gives a clear error at the point of failure.

    Raises:
        RuntimeError: if ``tag`` is ``None``.
    """
    if tag is None:
        raise RuntimeError("{} not found on {}".format(what, url))
    return tag


# Make sure the output directories exist up front, so a missing folder cannot
# discard a complete scrape at the final write.
os.makedirs("data", exist_ok=True)
if save_img:
    os.makedirs("imgs", exist_ok=True)

# NOTE(review): float64 zeros of shape (890, 475, 475, 4) need roughly 6.4 GB,
# and the array is allocated (and pickled below) even when save_img is True.
# Kept as-is so the pickled artefact is unchanged; consider float32.
pokemon_array = np.zeros((n_pokemon, 475, 475, 4))

for i in range(n_pokemon):
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops us from parsing an HTTP error page.
    pokedex_request = requests.get(pokemon_url, timeout=30)
    pokedex_request.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    pokedex_bs = BeautifulSoup(pokedex_request.text, "html.parser")

    # Page title: assumed to read "<name> #<number>" -- the number is taken
    # from the last whitespace-separated token, the name is everything before.
    pokemon_number = _require_tag(
        pokedex_bs.find("div", {"class": "pokedex-pokemon-pagination-title"}),
        "Title",
        pokemon_url,
    )
    name_number = pokemon_number.text.split()
    if len(name_number) < 2:
        raise RuntimeError(
            "Unexpected title {!r} on {}".format(pokemon_number.text, pokemon_url)
        )
    # Join all name tokens so multi-word names ("Mr. Mime") are kept whole.
    file_name = " ".join(name_number[:-1]) + "_" + name_number[-1].lstrip("#")

    # Finding the image
    pokemon_imgs = _require_tag(
        pokedex_bs.find("img", {"class": "active"}),
        "Image",
        pokemon_url,
    )
    source = pokemon_imgs['src']

    # Close the HTTP response as soon as the image has been decoded.
    with urlopen(source, timeout=30) as img_source:
        img = plt.imread(img_source)

    if save_img:
        img_path = "imgs/" + file_name + ".png"
        plt.imsave(img_path, img)
    else:
        # Raises ValueError if the sprite is not 475x475 RGBA.
        pokemon_array[i, :, :, :] = img

    # Finding the description
    pokemon_txt = _require_tag(
        pokedex_bs.find("p", {"class": "active"}),
        "Text",
        pokemon_url,
    )
    pokemon_description[file_name] = pokemon_txt.text

    # Finding next
    pokemon_next = pokedex_bs.find("a", {"class": "next"})
    if pokemon_next is not None:
        pokemon_url = pokemon_domain + pokemon_next['href']
    elif i + 1 < n_pokemon:
        # Only fatal while there are still pages left to visit.
        raise RuntimeError("Next not found on {}".format(pokemon_url))

    # Fraction of pages processed so far (parenthesised: `i + 1 / n` was wrong).
    done = (i + 1) / n_pokemon

    # Be polite to the server between requests.
    time.sleep(.4)

# `with` guarantees the file is closed even if pickling fails part-way.
with open("data/pokemon_imgs.pkl", "wb") as filehandler:
    pickle.dump(pokemon_array, filehandler, protocol=4)

In [10]:
# Turn the scraped {"<name>_<number>": description} mapping into a table with
# one row per entry, then split the key into its two halves: the part before
# the underscore stays in `pokemon`, the part after goes to
# `number_nationaldex`.
pokemon_df = (
    pd.DataFrame(pokemon_description, index=['description'])
    .T
    .reset_index()
    .rename(columns={'index': 'pokemon'})
    .assign(
        # Both lambdas read the still-unsplit key column: adding the number
        # column first does not modify `pokemon`.
        number_nationaldex=lambda frame: frame['pokemon'].str.split("_", expand=True)[1],
        pokemon=lambda frame: frame['pokemon'].str.split("_", expand=True)[0],
    )
)

# Persist the table (the default integer index is written as the first column).
pokemon_df.to_csv('data/pokemon_descriptions.csv')