In [2]:
import os
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

In [6]:
PATH_LINKS = 'data/input/links.txt' # Path to the file with the links to the pokemon pages
PATH_TMP_LINKS = 'data/tmp/links_left_to_scrape.txt' # Work queue: links that still have to be scraped

# Check if the file 'links_left_to_scrape.txt' exists
# If not, create it from the file 'links.txt' (single copy, both files opened once).
# An existing queue is left untouched so that an interrupted run resumes where it stopped.
if not os.path.exists(PATH_TMP_LINKS):
    # The temporary directory may be missing on a fresh checkout
    os.makedirs(os.path.dirname(PATH_TMP_LINKS), exist_ok=True)
    with open(PATH_LINKS, 'r') as f, open(PATH_TMP_LINKS, 'w') as f2:
        f2.writelines(f)

# Read the links from the file 'links_left_to_scrape.txt'.
# Blank lines are skipped: an empty string is not a URL that can be requested.
with open(PATH_TMP_LINKS, 'r') as f:
    URLS = [line.strip() for line in f if line.strip()]

In [7]:
def remove_japanese_characters(input_string):
    """Return `input_string` without the characters outside the kept Unicode ranges.

    Only characters in U+0000-U+05C0 or U+2100-U+214F survive; every other
    character is dropped and the remaining ones keep their order.
    """
    # Each run of characters outside the two kept ranges is replaced by nothing
    return re.sub("[^\u0000-\u05C0\u2100-\u214F]+", "", input_string)

def erase_file(filepath, URL):
    """Remove every line equal to `URL` from the text file at `filepath`.

    Lines are compared after stripping surrounding whitespace, which is the same
    normalisation applied when the links are loaded into `URLS`. A line carrying
    trailing spaces or a '\\r' is therefore still recognised and erased.

    Args:
        filepath: Path of the text file to rewrite in place.
        URL: The link to erase.
    """
    target = URL.strip()
    # Read everything first: the file is truncated as soon as it is opened for writing
    with open(filepath, 'r') as f:
        lines = f.readlines()
    with open(filepath, 'w') as f:
        f.writelines(line for line in lines if line.strip() != target)

In [8]:
def get_data_from_webpage(URL, timeout=30):
    """Scrape the names listed on one page and store them in 'data/tmp/<page name>.txt'.

    The page name is the last path segment of `URL` without its extension. After
    the names are written, `URL` is erased from the pending-links file
    (`PATH_TMP_LINKS`). When the page cannot be retrieved, nothing is written and
    the link stays in the pending file so it can be retried on a later run.

    Args:
        URL: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up on this page.
    """
    # Send a GET request to the URL; without a timeout an unresponsive server blocks forever
    try:
        response = requests.get(URL, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from {URL}. Error: {exc}")
        return

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve data from {URL}. Status code: {response.status_code}")
        return

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract data based on the structure of the webpage
    webpage_name = URL.split('/')[-1].split('.')[0]
    print(f'Getting data from {webpage_name}...')

    # The 'transferonly' page is read from a different table and column than the other pages
    if webpage_name == 'transferonly':
        selector = 'table:nth-of-type(1) tr td:nth-of-type(2) a'
    else:
        selector = 'table:nth-of-type(2) tr td:nth-of-type(3) a'

    names = [remove_japanese_characters(e_row.text) for e_row in soup.select(selector)]

    # Open the output file once; names that are empty after cleaning are skipped
    with open(f'data/tmp/{webpage_name}.txt', 'w') as f:
        f.writelines(f'{name}\n' for name in names if name != '')

    erase_file(PATH_TMP_LINKS, URL)

In [9]:
# Scrape every link that is still pending, in the order it was read from the queue file
for pending_url in URLS:
    get_data_from_webpage(pending_url)

KeyboardInterrupt: 

In [None]:
# Delete the file 'links_left_to_scrape.txt'
# Guarded so that re-running this cell after the file is already gone does not raise
if os.path.exists(PATH_TMP_LINKS):
    os.remove(PATH_TMP_LINKS)

In [None]:
# File that stores the Pokemon valid in Regulation E
PATH_REG_E = 'data/output/pokemon_regE.txt'

# Files in 'data/tmp' that are bookkeeping for other steps of this notebook.
# They must be neither merged into the Pokemon list nor deleted by this cell.
NOT_POKEMON_FILES = {os.path.basename(PATH_TMP_LINKS), 'usernames.txt', 'matches.txt'}

# Files with the data from the webpages; os.listdir has no defined order, so sort for a reproducible output
webpages = sorted(
    name for name in os.listdir('data/tmp')
    if name not in NOT_POKEMON_FILES and os.path.isfile(os.path.join('data/tmp', name))
)

# Create a dict with keys as Pokemon names and values as the number of times they appear in all the files.
# dict keys keep insertion order, so the keys are the names de-duplicated in first-seen order.
pokemons = {}
for webpage in webpages:
    with open(f'data/tmp/{webpage}', 'r') as f:
        for line in f:
            name = line.strip('\n')
            pokemons[name] = pokemons.get(name, 0) + 1

# Write each distinct name once (this also creates an empty file when nothing was scraped)
with open(PATH_REG_E, 'w') as f:
    f.writelines(f'{pokemon}\n' for pokemon in pokemons)

# Delete the files with the data from the webpages
for webpage in webpages:
    os.remove(f'data/tmp/{webpage}')

## Retrieving data from Pokémon Showdown

In [None]:
def _append_unseen(filepath, candidates):
    """Append to `filepath`, one per line, each candidate that is not already listed in it."""
    try:
        with open(filepath, 'r') as f:
            seen = set(f.read().splitlines())
    except FileNotFoundError:
        # Nothing recorded yet: the append below creates the file
        seen = set()

    with open(filepath, 'a') as f:
        for candidate in candidates:
            if candidate not in seen:
                f.write(f'{candidate}\n')
                # Also guards against duplicates inside `candidates` itself
                seen.add(candidate)


def get_all_match_codes(URL, timeout=30):
    """Record the match ids and player names found in one replay-search response.

    The response text is scanned with regular expressions. New match ids are
    appended to 'data/tmp/matches.txt' and new player names to
    'data/tmp/usernames.txt'. Entries already present in those files are not
    written again, and a missing file is treated as empty.

    Args:
        URL: Address of the search request.
        timeout: Seconds to wait for the server before giving up on this request.
    """
    # Send a GET request to the URL; without a timeout an unresponsive server blocks forever
    try:
        response = requests.get(URL, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from {URL}. Error: {exc}")
        return

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve data from {URL}. Status code: {response.status_code}")
        return

    # The body is scanned as raw text (it is not parsed as JSON)
    str_response = response.content.decode('utf-8')

    id_matches = re.findall(r'"id":"gen9vgc2023regulatione-(.+?)"', str_response)
    id_player1 = re.findall(r'"p1":"(.+?)"', str_response)
    id_player2 = re.findall(r'"p2":"(.+?)"', str_response)

    _append_unseen('data/tmp/usernames.txt', id_player1 + id_player2)
    _append_unseen('data/tmp/matches.txt', id_matches)

In [None]:
# Query result pages 1 to 100 of the replay search, once per sort order
for page in range(1, 101):
    for sort_order in ('rating', 'date'):
        URL_matches = f'https://replay.pokemonshowdown.com/api/replays/search?username=&format=gen9vgc2023regulatione&page={page}&sort={sort_order}'
        get_all_match_codes(URL_matches)

In [None]:
# Load the player names collected so far
with open('data/tmp/usernames.txt', 'r') as f:
    all_usernames = f.read().splitlines()

# Query pages 1 to 3 of the replay search for every known player
for username in all_usernames:
    print(f'Getting matches from {username}...')
    # Percent-encode the name so characters such as '&', '#' or '+' cannot alter the query string
    encoded_username = quote(username, safe='')
    for i in range(1, 4):
        URL_matches = f'https://replay.pokemonshowdown.com/api/replays/search?username={encoded_username}&format=gen9vgc2023regulatione&page={i}'
        try:
            get_all_match_codes(URL_matches)
        except Exception as exc:
            # Best effort: report the problem and move on to the next player.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the cell.
            print(f'Skipping the remaining pages of {username}: {exc!r}')
            break

## Extracting data from the log files of Pokémon Showdown battles

In [25]:
import re
def _drop_match_id(id_match, filepath='data/tmp/matches.txt'):
    """Rewrite `filepath` without the lines equal to `id_match`."""
    # Read everything first: the file is truncated as soon as it is opened for writing
    with open(filepath, 'r') as f:
        lines = f.readlines()
    with open(filepath, 'w') as f:
        f.writelines(line for line in lines if line.strip('\n') != id_match)


def _find_winner(log):
    """Return 1 or 2 for the player named on the first '|win|' line of `log`, else None."""
    players = re.findall(r'\|player\|p[1-2]\|(.+?)\|', log)
    winners = re.findall(r'\|win\|(.+?)\n', log)
    if len(players) < 2 or not winners:
        return None
    if winners[0] == players[0]:
        return 1
    if winners[0] == players[1]:
        return 2
    # The winner's name matches neither of the first two player entries
    return None


def get_data_from_match(URL, id_match, timeout=30):
    """Download one battle log and append its result to 'data/output/matches.csv'.

    The appended row is '<winner>,<6 names of team 1>,<6 names of team 2>', where
    winner is 1 or 2. The first six '|poke|' entries of the log are taken as team 1
    and the remaining ones as team 2. Each name is cut at the first '-', '|' or ','.

    `id_match` is erased from 'data/tmp/matches.txt' once the log has been handled.
    That covers both a written row and a log that cannot be used (a team without
    exactly six entries, or a winner that cannot be determined). When the log
    cannot be downloaded the id is kept so it can be retried.

    Args:
        URL: Address of the battle log.
        id_match: Id of the match, as stored in 'data/tmp/matches.txt'.
        timeout: Seconds to wait for the server before giving up on this log.
    """
    # Send a GET request to the URL; without a timeout an unresponsive server blocks forever
    try:
        response = requests.get(URL, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from {URL}. Error: {exc}")
        return

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve data from {URL}. Status code: {response.status_code}")
        return

    # The log is plain text and is scanned with regular expressions
    log = response.content.decode('utf-8')

    matches_team = re.findall(r'\|poke\|p[1-2]\|(.+?)[-|,]', log)
    team1, team2 = matches_team[:6], matches_team[6:]
    if len(team1) != 6 or len(team2) != 6:
        _drop_match_id(id_match)
        return

    winner = _find_winner(log)
    if winner is None:
        print(f"Could not determine winner")
        _drop_match_id(id_match)
        return

    # Write the data to the output file
    row = ','.join([str(winner)] + team1 + team2)
    with open('data/output/matches.csv', 'a') as f:
        f.write(f'{row}\n')

    # Erase the match_id from the input file
    _drop_match_id(id_match)


In [29]:
# If the output file doesn't exist, create it with the header row.
# The check is made on the CSV itself, so the header is written exactly once
# and the rows already collected by an earlier run are never overwritten.
if not os.path.exists('data/output/matches.csv'):
    with open('data/output/matches.csv', 'w') as f:
        f.write('winner,pokemon1_p1,pokemon2_p1,pokemon3_p1,pokemon4_p1,pokemon5_p1,pokemon6_p1,pokemon1_p2,pokemon2_p2,pokemon3_p2,pokemon4_p2,pokemon5_p2,pokemon6_p2\n')

In [None]:
# Load the ids of the matches that still have to be processed
with open('data/tmp/matches.txt', 'r') as pending_file:
    all_id_matches = pending_file.read().splitlines()

# Download and process the battle log of each pending match
for match_id in all_id_matches:
    get_data_from_match(
        f'https://replay.pokemonshowdown.com/gen9vgc2023regulatione-{match_id}.log',
        match_id,
    )

# Delete the file with the match ids, then the file with the usernames
for leftover_path in ('data/tmp/matches.txt', 'data/tmp/usernames.txt'):
    os.remove(leftover_path)