# Road to Pokémon Master!

Welcome to this hands-on data science notebook! It contains the following:

1. Exploratory data analysis with [Pokemon with stats](https://www.kaggle.com/abcsds/pokemon#)
2. Legendary Pokémon detection
3. 


In [None]:
import json
import time
from pathlib import Path
import logging

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Ladder page for one format; `top_users` scrapes user links from it.
LADDER_URL = 'http://pokemonshowdown.com/ladder/{ladder}'
# Paginated replay search for a single user; `replay_ids` walks its pages.
USERNAME_URL = ('http://replay.pokemonshowdown.com/search/?output=html&'
                'user={user}&format=&page={page}&output=html')
# Page of a single replay; `battle_logs` reads the embedded log from it.
REPLAY_URL = 'http://replay.pokemonshowdown.com/{replay_id}'
# Seconds to sleep after each request inside the scraping loops.
SLEEP = 0.8
# Output directory for every JSON file. Kept as a plain string because the
# call cells below build file names with `save_dir + '/...'`.
save_dir = './data/battle'

In [None]:
# Ladder name used in the URLs and in every output file name.
# Swap the two lines below to target the gen7 ladder instead.
# ladder = 'gen7battlespotsingles' # gen7
ladder = 'gen8battlestadiumsingles' # gen8

In [None]:
def top_users(save_dir, ladder):
    """Scrape the user ids linked from a ladder page and save them as JSON.

    Writes ``{save_dir}/{ladder}_top_users.json`` with the payload
    ``{'ladder': ladder, 'users': [...]}``.

    Args:
        save_dir: Output directory (str or Path). Created if missing,
            including any missing parent directories.
        ladder: Ladder name substituted into ``LADDER_URL``.

    Raises:
        requests.HTTPError: If the ladder page answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    save_dir = Path(save_dir)
    # parents=True: the configured directory is nested ('./data/battle'), so
    # a plain mkdir() fails when './data' does not exist yet.
    save_dir.mkdir(parents=True, exist_ok=True)
    save_file = save_dir / '{}_top_users.json'.format(ladder)

    url = LADDER_URL.format(ladder=ladder)
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an empty user list parsed from an
    # error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # 'href': True skips anchors without an href; those would yield None
    # and make Path() below raise TypeError.
    anchors = soup.find_all('a', {'class': 'subtle', 'href': True})
    # The user id is the last path component of each link.
    users = [Path(anchor['href']).name for anchor in anchors]

    save_file.write_text(json.dumps({'ladder': ladder, 'users': users}))

In [None]:
# Step 1: save the ladder's user ids to `{save_dir}/{ladder}_top_users.json`.
top_users(save_dir, ladder)

In [None]:
def replay_ids(save_dir, users_file, max_pages=99):
    """Collect the replay links of every user listed in ``users_file``.

    For each user the paginated replay search is walked page by page and
    every link containing the ladder name is kept. The result is written to
    ``{save_dir}/{ladder}_replay_ids.json`` as
    ``{'ladder': ladder, 'replay_ids': {user: [href, ...]}}``.

    Args:
        save_dir: Output directory (str or Path); created if missing.
        users_file: JSON file with ``'ladder'`` and ``'users'`` keys, as
            written by ``top_users``.
        max_pages: Highest search page requested per user (default 99,
            matching the previous hard-coded limit).
    """
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    data = json.loads(Path(users_file).read_text())
    ladder = data['ladder']
    users = data['users']

    save_file = save_dir / '{}_replay_ids.json'.format(ladder)

    all_replay_ids = {}

    for user in tqdm(users):
        logging.info('user = {}'.format(user))
        # Named differently from the function so the local list does not
        # shadow `replay_ids` itself.
        user_replay_ids = []
        seen_ids = set()
        for page in range(1, max_pages + 1):
            url = USERNAME_URL.format(user=user, page=page)
            html = requests.get(url, timeout=30).text
            time.sleep(SLEEP)
            soup = BeautifulSoup(html, 'html.parser')
            links = soup.find_all('a')
            if not links:
                # A page without any link at all: nothing further to read.
                break

            # Anchors without an href return None from .get(); drop them
            # before the substring test, which would raise TypeError on None.
            hrefs = (link.get('href') for link in links)
            ids = [href for href in hrefs if href and ladder in href]
            if not ids:
                continue
            # The same results showing up again means paging went past the
            # last real page.
            if ids[0] in seen_ids or ids[-1] in seen_ids:
                break

            user_replay_ids += ids
            seen_ids.update(ids)
        logging.info(len(user_replay_ids))
        all_replay_ids[user] = user_replay_ids

    save_file.write_text(json.dumps(
        {'ladder': ladder, 'replay_ids': all_replay_ids}))

In [None]:
# Step 2: collect replay links for every user saved by `top_users`.
replay_ids(save_dir, save_dir + f'/{ladder}_top_users.json')

In [None]:
def battle_logs(save_dir, replay_ids_file):
    """Download the battle log of every replay listed in ``replay_ids_file``.

    Each replay page is fetched and the text of its ``<script class="log">``
    element is stored. The result is written to
    ``{save_dir}/{ladder}_battle_logs.json`` as
    ``{'ladder': ladder, 'battle_logs': {user: [log, ...]}}``.

    Replays whose page carries no log (or an empty one) are skipped with a
    warning, so that one bad replay does not abort the whole download and
    discard everything fetched so far.

    Args:
        save_dir: Output directory (str or Path).
        replay_ids_file: JSON file with ``'ladder'`` and ``'replay_ids'``
            keys, as written by ``replay_ids``.
    """
    save_dir = Path(save_dir)
    data = json.loads(Path(replay_ids_file).read_text())

    ladder = data['ladder']
    replay_ids_by_user = data['replay_ids']

    save_file = save_dir / '{}_battle_logs.json'.format(ladder)
    logs_by_user = {}
    # Users are processed in sorted order.
    sorted_replay_ids = sorted(replay_ids_by_user.items(), key=lambda x: x[0])

    for user, replay_id_list in tqdm(sorted_replay_ids):
        logging.info('user = {}'.format(user))
        logs = []
        for replay_id in replay_id_list:
            url = REPLAY_URL.format(replay_id=replay_id)
            html = requests.get(url, timeout=30).text
            time.sleep(SLEEP)
            soup = BeautifulSoup(html, 'html.parser')
            # find() returns None when the page has no log element.
            log_tag = soup.find('script', {'class': 'log'})
            log = log_tag.text if log_tag is not None else ''
            if not log:
                logging.warning(
                    'no battle log found for replay {}; skipped'
                    .format(replay_id))
                continue
            logs.append(log)

        logs_by_user[user] = logs

    save_file.write_text(json.dumps(
        {'ladder': ladder, 'battle_logs': logs_by_user}))


In [None]:
# Step 3: download the battle log behind every replay link collected above.
battle_logs(save_dir, save_dir + f"/{ladder}_replay_ids.json")

In [None]:
import re
# Matches a `|player|<player>|<username>|` entry of a battle log; with
# findall() each match comes back as a (player, username) tuple.
USER_PLAYER = re.compile(r"\|player\|(?P<player>.+?)\|(?P<username>.+?)\|.*?")
# Matches a `|poke|<player>|<poke>|` entry of a battle log; with findall()
# each match comes back as a (player, poke) tuple.
POKE = re.compile(r"\|poke\|(?P<player>.+?)\|(?P<poke>.+?)\|.*?")


def to_id(name):
    """Normalize a name to an id.

    The name is lower-cased and every character other than ``a-z`` and
    ``0-9`` is removed.
    """
    lowered = name.lower()
    # Strip each run of characters outside a-z / 0-9.
    return re.sub(r'[^a-z0-9]+', '', lowered)


def parse_logs(save_dir, battle_logs_file):
    """Extract player names and team rosters from the saved battle logs.

    Reads the JSON at ``battle_logs_file`` (keys ``'ladder'`` and
    ``'battle_logs'``) and writes
    ``{save_dir}/{ladder}_parsed_battle_logs.json`` holding two parallel
    lists with one entry per battle log:

    * ``players``: ``{player: user id}``
    * ``pokes``: ``{player: [poke id, ...]}``

    Users are processed in sorted order.
    """
    print(save_dir, battle_logs_file)
    out_dir = Path(save_dir)
    source = json.loads(Path(battle_logs_file).read_text())

    ladder = source['ladder']
    logs_by_user = source['battle_logs']

    target = out_dir / '{}_parsed_battle_logs.json'.format(ladder)

    players_list, pokes_list = [], []

    for user in sorted(logs_by_user):
        logging.info('user = {}'.format(user))
        for battle_log in logs_by_user[user]:
            # A later entry for the same player overwrites an earlier one.
            players = {slot: to_id(username)
                       for slot, username in USER_PLAYER.findall(battle_log)}

            pokes = {}
            for slot, details in POKE.findall(battle_log):
                # Only the part before the first comma is kept as the name.
                poke_id = to_id(details.split(',')[0])
                pokes.setdefault(slot, []).append(poke_id)

            players_list.append(players)
            pokes_list.append(pokes)

    payload = {
        'ladder': ladder,
        'players': players_list,
        'pokes': pokes_list
    }
    target.write_text(json.dumps(payload))


In [None]:
# Step 4: turn the raw battle logs into per-battle player and team lists.
parse_logs(save_dir, save_dir + f'/{ladder}_battle_logs.json')

In [None]:
import numpy as np
import random


def set_seed(random_seed):
    """Seed Python's ``random`` module and NumPy's global RNG with one value."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(random_seed)


def preprocess(save_dir, parsed_battle_logs_file, random_seed=42, team_size=6):
    """Build a de-duplicated train/valid split of teams from parsed logs.

    Every side ('p1' / 'p2') of every battle that lists exactly
    ``team_size`` pokes becomes one team, stored as a sorted tuple so that
    the order of its members does not matter. Duplicate teams are removed,
    the rest is shuffled, and the first tenth becomes the validation set.
    The result is written to ``{save_dir}/{ladder}_dataset.json`` as
    ``{'ladder': ..., 'train': [...], 'valid': [...]}``.

    Args:
        save_dir: Output directory (str or Path).
        parsed_battle_logs_file: JSON file with ``'ladder'`` and ``'pokes'``
            keys, as written by ``parse_logs``.
        random_seed: Seed passed to ``set_seed`` before shuffling.
        team_size: Number of pokes a side must list to be kept (default 6).
    """
    set_seed(random_seed)
    save_dir = Path(save_dir)
    data = json.loads(Path(parsed_battle_logs_file).read_text())
    ladder = data['ladder']

    save_file = save_dir / '{}_dataset.json'.format(ladder)

    teams = []
    for battle in data['pokes']:
        for side in ('p1', 'p2'):
            # .get(): a battle may list pokes for only one side (or none);
            # indexing directly would raise KeyError.
            team = battle.get(side, [])
            if len(team) == team_size:
                teams.append(tuple(sorted(team)))

    # sorted(): set iteration order of strings varies between interpreter
    # runs (hash randomization), so shuffling an unsorted set would yield a
    # different split on every run despite the fixed seed.
    uniq_teams = sorted(set(teams))

    if teams:
        logging.info('reduce {} -> {} ({:.03f} %)'
                     .format(len(teams), len(uniq_teams),
                             100 * len(uniq_teams) / len(teams)))
    else:
        # Avoids dividing by zero in the ratio above.
        logging.warning('no teams of size {} found in {}'
                        .format(team_size, parsed_battle_logs_file))

    np.random.shuffle(uniq_teams)
    n_valid = len(uniq_teams) // 10

    train = uniq_teams[n_valid:]
    valid = uniq_teams[:n_valid]

    save_file.write_text(json.dumps({
        'ladder': ladder,
        'train': train,
        'valid': valid
    }))

In [None]:
# Step 5: build the train/valid team dataset from the parsed logs.
preprocess(save_dir, save_dir + f'/{ladder}_parsed_battle_logs.json')

In [None]:
# Reload the parsed battle logs for interactive inspection.
# The file is read from the configured output directory: `parse_logs` above
# writes it under `save_dir`, so pointing at a bare 'data' folder fails on a
# fresh Restart & Run All.
random_seed = 42
set_seed(random_seed)
# Path() accepts both str and Path, so re-running this cell is safe.
save_dir = Path(save_dir)
parsed_battle_logs_file = save_dir / f'{ladder}_parsed_battle_logs.json'

data = json.loads(parsed_battle_logs_file.read_text())
ladder = data['ladder']

save_file = save_dir / '{}_dataset.json'.format(ladder)

In [None]:
# Preview only the first few battles; displaying the whole list floods the
# cell output.
data["pokes"][:5]