In [2]:
import json
import time
from io import StringIO

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [18]:
class StatScraper:
    """Scrape per-character stats and icons from a character archive page.

    Intended call order: ``create_driver()`` -> ``enable_unreleased()`` ->
    ``mine_stats()``. ``mine_stats`` shuts the browser session down when it is
    done (or when it fails), so ``create_driver()`` must be called again before
    scraping a second time.
    """

    # Seconds an explicit wait polls for an element before Selenium raises
    # TimeoutException.
    WAIT_SECONDS = 10
    # Seconds allowed for each icon download before requests raises a timeout.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, url):
        """Remember the archive URL; no browser is started here.

        Args:
            url: Address of the character archive page to scrape.
        """
        self.driver = None
        # Shared WebDriverWait, created together with the driver.
        self.wait = None
        self.url = url
        # Labels given, in order of appearance on the grid, to the characters
        # whose page title is just 'Trailblazer'.
        self.trailblazer_order = [
            'Destruction - Caelus',
            'Destruction - Stelle',
            'Preservation - Caelus',
            'Preservation - Stelle'
        ]

    def create_driver(self):
        """Launch Chrome, open ``self.url``, maximize the window and build the shared wait."""
        self.driver = webdriver.Chrome()
        self.driver.get(self.url)
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, self.WAIT_SECONDS)

    def _require_driver(self):
        """Fail with a clear message when no browser session exists yet."""
        if self.driver is None:
            raise RuntimeError('No browser session: call create_driver() first.')
        if self.wait is None:
            # Driver was assigned by hand; build the wait it needs.
            self.wait = WebDriverWait(self.driver, self.WAIT_SECONDS)

    def enable_unreleased(self):
        """Switch the page's "unreleased content" select to ``on`` and close the settings menu.

        Raises:
            RuntimeError: if ``create_driver()`` has not been called.
        """
        self._require_driver()

        # constants
        UNRELEASED_SELECT = '//select[@class="font-medium rounded-md text-blazer-950 text-right pb-0.5"]'
        CLOSE_SETTINGS_MENU_ICON = '//div[@class="absolute right-3 cursor-pointer p-4"]'

        # enable unreleased content
        unreleased_select = self.wait.until(
            EC.presence_of_element_located((By.XPATH, UNRELEASED_SELECT))
        )
        unreleased_select.click()
        enable_option = unreleased_select.find_element(By.XPATH, './/option[@value="on"]')
        enable_option.click()

        # close the settings menu
        # The title lookup is kept as a check that the menu is present. The
        # close icon's XPath starts with '//' and is therefore document-wide,
        # so it is looked up on the driver rather than on the title element.
        self.driver.find_element(By.XPATH, '//div[@name="title"]')
        x_button = self.driver.find_element(By.XPATH, CLOSE_SETTINGS_MENU_ICON)
        x_button.click()

    def _trailblazer_name(self, tb_idx):
        """Return the display name for the ``tb_idx``-th 'Trailblazer' entry (0-based).

        Falls back to a generated 'Variant N' label when the page lists more
        Trailblazer entries than ``self.trailblazer_order`` knows about, so the
        scrape does not die with an IndexError and names stay unique.
        """
        if tb_idx < len(self.trailblazer_order):
            variant = self.trailblazer_order[tb_idx]
        else:
            variant = 'Variant ' + str(tb_idx + 1)
        return 'Trailblazer - ' + variant

    @staticmethod
    def _normalize_path(path_alt):
        """Turn the path icon's alt text into a path name.

        Any alt text containing 'Hunt' becomes 'The Hunt'; otherwise the last
        whitespace-separated word is used.
        """
        if 'Hunt' in path_alt:
            return 'The Hunt'
        return path_alt.split()[-1]

    @staticmethod
    def _speed_from_table_html(table_html):
        """Extract the integer in the 'SPD' row / '80' column of the stat table HTML."""
        # read_html wants a file-like object; passing literal HTML is deprecated.
        stat_df = pd.read_html(StringIO(table_html))[0]
        # Forward-fill across columns, label any still-empty cell 'Aggro'
        # (presumably the aggro row has no label cell; verify against the page),
        # then keep only the stat-name column and the '80' column.
        stat_df = stat_df.ffill(axis=1).fillna('Aggro')[['LVL', '80']].T
        # After transposing, the first row holds the stat names: use it as header.
        stat_df.columns = stat_df.iloc[0, :]
        stat_df = stat_df.iloc[1:, :]
        # Positional access: the remaining row is labelled '80', not 0.
        return int(stat_df['SPD'].iloc[0])

    def _scrape_characters(self):
        """Visit every character card on the grid and collect one dict per character."""
        CHARACTER_GRID = '//div[@class="flex flex-row flex-wrap justify-center gap-6 mt-2"]'
        SLIDER_BUTTON = '//button[text()="Slider"]'
        STAT_TABLE = '//table[@class="table-outline text-sm font-normal text-white"]'
        CHAR_NAME = '//div[@class="text-2xl font-bold text-white xl:text-3xl"]'
        RARITY_DIV = '//div[@class="flex flex-row gap-1"]'
        ELEMENT_DIV = '//img[@class="h-12 w-10 object-contain"]'
        PATH_DIV = '//img[@class="h-8 w-8 object-contain drop-shadow-icon filter"]'

        grid_locator = (By.XPATH, CHARACTER_GRID)

        # Collect every icon URL up front, while all cards are on screen.
        # The third <img> inside each card link is used as the character icon.
        char_grid = self.wait.until(EC.presence_of_element_located(grid_locator))
        char_img_links = [
            card.find_elements(By.TAG_NAME, 'img')[2].get_attribute('src')
            for card in char_grid.find_elements(By.TAG_NAME, 'a')
        ]

        char_jsons = []
        tb_idx = 0
        # main scraping loop
        for i, char_img_link in enumerate(char_img_links):

            # refresh reference: the cards are looked up again on every pass
            # because the previous iteration navigated away and back.
            char_grid = self.wait.until(EC.presence_of_element_located(grid_locator))
            char_grid.find_elements(By.TAG_NAME, 'a')[i].click()

            # get the character name
            char_name_div = self.wait.until(EC.presence_of_element_located((By.XPATH, CHAR_NAME)))
            char_name = char_name_div.get_attribute('innerText')
            if char_name == 'Trailblazer':
                char_name = self._trailblazer_name(tb_idx)
                tb_idx += 1

            # find the slider button and turn to table
            slider_button = self.wait.until(EC.presence_of_element_located((By.XPATH, SLIDER_BUTTON)))
            slider_button.click()

            # get the table and read the speed out of it
            stat_table = self.wait.until(EC.presence_of_element_located((By.XPATH, STAT_TABLE)))
            char_spd = self._speed_from_table_html(stat_table.get_attribute('outerHTML'))

            # get the character rarity: one <svg> per star
            rarity_div = self.wait.until(EC.presence_of_element_located((By.XPATH, RARITY_DIV)))
            rarity = len(rarity_div.find_elements(By.TAG_NAME, 'svg'))

            # get the character element: last word of the icon's alt text
            element_div = self.wait.until(EC.presence_of_element_located((By.XPATH, ELEMENT_DIV)))
            element = element_div.get_attribute('alt').split()[-1]

            # get the character path
            path_div = self.wait.until(EC.presence_of_element_located((By.XPATH, PATH_DIV)))
            path = self._normalize_path(path_div.get_attribute('alt'))

            char_jsons.append({
                'Character Name': char_name,
                'Speed': char_spd,
                'Rarity': rarity,
                'Element': element,
                'Path': path,
                'Image Path': 'char_icons/' + char_name + '.png',
                'Image Link': char_img_link
            })
            self.driver.back()

        return char_jsons

    def _download_icons(self, char_jsons):
        """Download each record's 'Image Link' to '../../public/' + its 'Image Path'."""
        for char_json in char_jsons:
            # A timeout keeps one stalled download from hanging the run.
            response = requests.get(char_json['Image Link'], timeout=self.DOWNLOAD_TIMEOUT)
            # Do not save an HTTP error page under a .png name.
            response.raise_for_status()
            with open('../../public/' + char_json['Image Path'], 'wb') as f:
                f.write(response.content)

    def mine_stats(self, alphabetical):
        """Scrape every character on the grid, download the icons and return the records.

        Args:
            alphabetical: When truthy, the returned list is sorted by
                'Character Name'; otherwise it keeps the order of the grid.

        Returns:
            A list of dicts with the keys 'Character Name', 'Speed', 'Rarity',
            'Element', 'Path', 'Image Path' and 'Image Link'.

        Raises:
            RuntimeError: if ``create_driver()`` has not been called.
            requests.HTTPError: if an icon download returns an error status.
        """
        self._require_driver()
        try:
            char_jsons = self._scrape_characters()
        finally:
            # quit() ends the whole WebDriver session (close() only closes the
            # current window), and doing it here also covers failed scrapes.
            self.driver.quit()

        # download the images
        self._download_icons(char_jsons)

        # alphabetize the records
        if alphabetical:
            char_jsons.sort(key=lambda x: x['Character Name'])
        return char_jsons

In [19]:
# Archive page that lists every character card.
YATTA_URL = 'https://hsr.yatta.top/en/archive/avatar'

# Open the page in a browser and switch unreleased content on.
scraper = StatScraper(YATTA_URL)
scraper.create_driver()
scraper.enable_unreleased()

# Keep the order of the grid (no alphabetical sort) for this output file.
char_json = scraper.mine_stats(alphabetical=False)

# Persist the scraped records as JSON.
with open('../data/characters_releasedate.json', 'w') as outfile:
    outfile.write(json.dumps(char_json))