In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re

In [2]:
PATH_LINKS = 'data/input/links.txt'  # Source list with the links to the pokemon pages, one per line
PATH_TMP_LINKS = 'data/tmp/links_left_to_scrape.txt'  # Working copy holding the links not scraped yet

# Make sure the working directory exists before the working copy is created.
os.makedirs(os.path.dirname(PATH_TMP_LINKS), exist_ok=True)

# Seed the working copy from 'links.txt' only when it does not exist yet, so
# links already removed from an existing working copy are not scraped again.
if not os.path.exists(PATH_TMP_LINKS):
    # The destination is opened once (not once per line). It is also created
    # when 'links.txt' is empty, so the read below cannot fail on a missing file.
    with open(PATH_LINKS, 'r') as f, open(PATH_TMP_LINKS, 'w') as f2:
        f2.writelines(f)

# Read the pending links, skipping blank lines so no empty URL is ever requested.
with open(PATH_TMP_LINKS, 'r') as f:
    URLS = [line.strip() for line in f if line.strip()]

In [3]:
def remove_japanese_characters(input_string):
    """Return ``input_string`` with every disallowed character dropped.

    A character is kept only when it lies in the code point ranges
    U+0000-U+05C0 or U+2100-U+214F. Despite the function name, anything
    outside those ranges is removed, not just Japanese script.

    Args:
        input_string: Text to filter; it is examined one character at a time.

    Returns:
        The kept characters joined in their original order.
    """
    disallowed = re.compile("[^\u0000-\u05C0\u2100-\u214F]+")
    kept = []
    for char in input_string:
        # A character survives only when the negated class does not match it.
        if disallowed.search(char) is None:
            kept.append(char)
    return ''.join(kept)

def erase_file(filepath, URL):
    """Remove every line equal to ``URL`` from the text file at ``filepath``.

    Lines and ``URL`` are compared after stripping surrounding whitespace.
    This matches how the links are normalised when they are loaded, so a line
    with trailing spaces or a stray carriage return is still erased.

    Args:
        filepath: Path of the text file to rewrite in place.
        URL: The link whose line(s) must be dropped.

    Raises:
        OSError: If the file cannot be read or rewritten.
    """
    target = URL.strip()

    with open(filepath, 'r') as f:
        lines = f.readlines()

    # Rewrite the file with every non-matching line kept verbatim.
    with open(filepath, 'w') as f:
        f.writelines(line for line in lines if line.strip() != target)

In [4]:
def get_data_from_webpage(URL, timeout=30):
    """Scrape the names linked from one page and store them in data/tmp.

    The page is downloaded and parsed, and the text of the selected links is
    filtered through ``remove_japanese_characters``. The non-empty results are
    written one per line to ``data/tmp/<webpage_name>.txt``. On success the
    link is removed from the pending-links file (``PATH_TMP_LINKS``).

    Args:
        URL: Address of the page to scrape. The output file name is the last
            path segment of the URL up to its first dot.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        None. On a failed request a message is printed and the link is left
        in the pending-links file.
    """
    # Send a GET request to the URL. Network failures are reported the same
    # way as HTTP errors, so one bad page does not abort the whole run.
    try:
        response = requests.get(URL, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to retrieve data from {URL}. Error: {error}")
        return

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve data from {URL}. Status code: {response.status_code}")
        return

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract data based on the structure of the webpage
    webpage_name = URL.split('/')[-1].split('.')[0]
    print(f'Getting data from {webpage_name}...')

    # The 'transferonly' page uses a different table/column than the others.
    if webpage_name == 'transferonly':
        selector = 'table:nth-of-type(1) tr td:nth-of-type(2) a'
    else:
        selector = 'table:nth-of-type(2) tr td:nth-of-type(3) a'

    names = [remove_japanese_characters(e_row.text) for e_row in soup.select(selector)]

    # Open the output once: 'w' truncates any previous content, then every
    # non-empty name is written on its own line.
    with open(f'data/tmp/{webpage_name}.txt', 'w') as f:
        f.writelines(f'{name}\n' for name in names if name != '')

    # The page is done: drop its link from the pending-links file.
    erase_file(PATH_TMP_LINKS, URL)

In [5]:
# Scrape every pending link in turn. Each successful call writes
# data/tmp/<page>.txt and removes the link from the pending-links file.
for URL in URLS:
    get_data_from_webpage(URL)

Getting data from paldeapokedex...
Getting data from kitakamipokedex...
Getting data from transferonly...


In [6]:
# Delete the file 'links_left_to_scrape.txt'
# Guarded so that re-running this cell after the file is gone does not raise.
if os.path.exists(PATH_TMP_LINKS):
    os.remove(PATH_TMP_LINKS)

In [None]:
# Merge the scraped per-page lists into one de-duplicated file with the
# Pokemon valid in Regulation E.

# Only regular '.txt' files (the kind the scraper writes) are merged and later
# deleted, so directories and unrelated files in data/tmp are left alone.
# Sorting makes the result reproducible: os.listdir returns entries in
# arbitrary order.
webpages = sorted(
    entry for entry in os.listdir('data/tmp')
    if entry.endswith('.txt') and os.path.isfile(os.path.join('data/tmp', entry))
)

# Create a dict with keys as Pokemon names and values as the number of times
# they appear in all the files. Dicts keep insertion order, so the names stay
# in first-seen order.
pokemons = {}
for webpage in webpages:
    with open(f'data/tmp/{webpage}', 'r') as f:
        for line in f:
            name = line.rstrip('\n')
            if name:  # ignore blank lines
                pokemons[name] = pokemons.get(name, 0) + 1

# Write every distinct name once, straight from memory, with no intermediate
# concatenated file to write and re-read.
os.makedirs('data/output', exist_ok=True)
with open('data/output/pokemon_regE.txt', 'w') as f:
    f.writelines(f'{pokemon}\n' for pokemon in pokemons)

# Delete the files with the data from the webpages
for webpage in webpages:
    os.remove(f'data/tmp/{webpage}')