In [36]:
import json
from io import StringIO
from time import sleep

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [2]:
# XPath selectors for the scraped site. They match on exact Tailwind-style
# class strings, so any restyling of the page will silently break them.

# Not referenced by the code visible in this notebook section.
# NOTE(review): presumably a <select> toggling unreleased content - confirm or remove.
UNRELEASED_SELECT = '//select[@class="font-medium rounded-md text-blazer-950 text-right pb-0.5"]'
# Clickable element that StatMiner.close_menu() uses to dismiss the settings menu.
CLOSE_SETTINGS_MENU_ICON = '//div[@class="absolute right-3 cursor-pointer p-4"]'
# Not referenced by the code visible in this notebook section; links to the avatar archive page.
CHAR_SELECT_ICON = '//a[@href="/en/archive/avatar"]'
# Container div whose <a> descendants StatMiner.get_chars() clicks one by one.
CHAR_GRID = '//div[@class="flex flex-row flex-wrap justify-center gap-6 mt-2"]'
# Relative XPath (note the leading '.'), evaluated under the element named 'Avatar Name'.
CHAR_NAME = './/div[@class="text-2xl font-bold text-white xl:text-3xl"]'

class StatMiner:
    """Selenium-driven scraper that collects one stat table per character.

    Typical use: ``create_driver()`` -> ``close_menu()`` -> ``get_chars()``
    -> ``close()``. Relies on the module-level XPath constants
    ``CLOSE_SETTINGS_MENU_ICON``, ``CHAR_GRID`` and ``CHAR_NAME``.

    Args:
        target_url: Page listing every character (opened by ``create_driver``).
        wait_timeout: Seconds each explicit wait may block before Selenium
            raises ``TimeoutException``. Defaults to 10.
    """

    def __init__(self, target_url, wait_timeout=10):
        self.target_url = target_url
        self.wait_timeout = wait_timeout
        self.driver = None

    def create_driver(self):
        """Start a Chrome session, load ``target_url`` and maximize the window."""
        self.driver = webdriver.Chrome()
        self.driver.get(self.target_url)
        self.driver.maximize_window()

    def close(self):
        """Quit the browser if one is running; safe to call repeatedly."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def _wait(self):
        """Return an explicit wait bound to the live driver.

        Raises:
            RuntimeError: If ``create_driver()`` has not been called yet.
        """
        if self.driver is None:
            raise RuntimeError('create_driver() must be called before scraping')
        return WebDriverWait(self.driver, self.wait_timeout)

    def close_menu(self):
        """Dismiss the settings menu that covers the page after loading."""
        wait = self._wait()
        # The menu is rendered after page load, so wait for it instead of
        # looking it up immediately.
        wait.until(
            EC.presence_of_element_located((By.XPATH, '//div[@name="title"]'))
        )
        # CLOSE_SETTINGS_MENU_ICON is an absolute XPath, so it is resolved
        # against the whole document rather than the title element.
        x_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, CLOSE_SETTINGS_MENU_ICON))
        )
        x_button.click()

    def _character_links(self, minimum):
        """Return the grid's links once at least ``minimum`` of them exist."""
        def enough_links(driver):
            grid = driver.find_element(By.XPATH, CHAR_GRID)
            links = grid.find_elements(By.TAG_NAME, 'a')
            return links if len(links) >= minimum else False

        return self._wait().until(enough_links)

    def _scrape_current_character(self):
        """Read the name and stat table from the currently open character page."""
        wait = self._wait()

        char_name_div = wait.until(
            EC.presence_of_element_located((By.NAME, 'Avatar Name'))
        )
        char_name = char_name_div.find_element(By.XPATH, CHAR_NAME).get_attribute('innerText')

        # Clicking the "Slider" button swaps the slider for a table.
        slider_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//button[text()="Slider"]'))
        )
        slider_button.click()

        stat_table = wait.until(
            EC.presence_of_element_located((By.NAME, 'STAT TABLE'))
        )
        stat_table_html = stat_table.get_attribute('outerHTML')
        # pandas deprecated passing literal HTML strings to read_html;
        # wrapping the markup in StringIO is the supported form.
        char_df = pd.read_html(StringIO(stat_table_html))[0]
        return char_df, char_name

    def get_chars(self):
        """Visit every character linked from the grid and scrape its stats.

        Returns:
            A ``(df_list, name_list)`` tuple of equal-length lists, in grid
            order: the stat table of each character as a DataFrame, and the
            character's displayed name.

        Raises:
            RuntimeError: If ``create_driver()`` has not been called yet.
        """
        wait = self._wait()
        # find the div which contains all char links
        char_grid = wait.until(
            EC.presence_of_element_located((By.XPATH, CHAR_GRID))
        )
        total = len(char_grid.find_elements(By.TAG_NAME, 'a'))

        df_list = []
        name_list = []
        for i in range(total):
            # Navigating away invalidates earlier element handles, so the
            # links are looked up again on every pass; waiting for i + 1 of
            # them avoids indexing into a grid that is still re-rendering.
            self._character_links(i + 1)[i].click()

            char_df, char_name = self._scrape_current_character()
            df_list.append(char_df)
            name_list.append(char_name)

            self.driver.back()
        return df_list, name_list

In [3]:
# Scrape every character page. This drives a real Chrome window and takes
# a while, so run it once and reuse df_list / name_list in later cells.
YATTA_URL = 'https://hsr.yatta.top/en/archive/avatar'
scraper = StatMiner(YATTA_URL)
try:
    scraper.create_driver()
    scraper.close_menu()
    df_list, name_list = scraper.get_chars()
finally:
    # Always shut the browser down, even when scraping fails part-way;
    # otherwise every run leaks a Chrome/chromedriver process.
    if scraper.driver is not None:
        scraper.driver.quit()

Misha
Sparkle
Black Swan
Dr. Ratio
Xueyi
Ruan Mei
Hanya
Argenti
Huohuo
Topaz & Numby
Guinaifen
Jingliu
Lynx
Fu Xuan
Dan Heng • Imbibitor Lunae
Kafka
Luka
Blade
Luocha
Yukong
Silver Wolf
Jing Yuan
March 7th
Dan Heng
Himeko
Welt
Arlan
Asta
Herta
Bronya
Seele
Serval
Gepard
Natasha
Pela
Clara
Sampo
Hook
Qingque
Tingyun
Sushang
Yanqing
Bailu
Trailblazer
Trailblazer
Trailblazer
Trailblazer


In [76]:
# Labels for the four entries that the site lists under the same name.
# Not referenced in the code visible in this section.
# NOTE(review): presumably meant to rename those entries in scrape order - confirm.
TRAILBLAZER_ORDER = [
    'Caelus - Destruction', 'Stelle - Destruction',
    'Caelus - Preservation', 'Stelle - Preservation',
]

# Shallow copies, so this cell can be re-run without touching the scrape results.
dfs, names = df_list.copy(), name_list.copy()

def convert_to_json(df):
    """Turn one scraped stat table into a JSON-serializable record.

    The input is expected to have a 'Name' column, a second column holding
    the stat labels (which must include 'CRIT Rate' and 'CRIT DMG'), and one
    further column per level. The input frame is not modified.

    Args:
        df: Stat table of a single character, one row per stat.

    Returns:
        ``{'Character Name': <first value of 'Name'>,
        'Stats': {stat: {level: float}}}``. Percentages are stored as
        fractions (``'5.0%'`` -> ``0.05``).

    Raises:
        KeyError: If 'Name', 'CRIT Rate' or 'CRIT DMG' is missing.
        ValueError: If a stat value cannot be parsed as a number.
    """
    # Positional access: the first row, whatever the index labels are.
    char_name = df['Name'].iloc[0]

    # Transpose so levels become rows and stats become columns, promote the
    # stat-label row to the header, then drop the two non-numeric rows.
    stats = df.T
    stats.columns = stats.iloc[1, :]
    # Carry each stat's last known value forward over missing levels
    # (.ffill() replaces the deprecated fillna(method='ffill')).
    stats = stats.iloc[2:, :].ffill()

    # Parse percentages as numbers and scale them, which stays correct for
    # any number of decimal places ('5%', '5.0%', '5.25%').
    for pct_col in ('CRIT Rate', 'CRIT DMG'):
        stats[pct_col] = (
            stats[pct_col].astype(str).str.rstrip('%').astype(float) / 100
        )

    stats = stats.astype(float)
    return {
        'Character Name': char_name,
        'Stats': json.loads(stats.to_json()),
    }

# Build one JSON-ready record per scraped table, keeping the scrape order.
character_stats = []
for df in dfs:
    character_stats.append(convert_to_json(df))

# Persist the records as pretty-printed JSON for the rest of the project.
with open('../general_logic/data/hsr_char_stats.json', 'w') as f:
    f.write(json.dumps(character_stats, indent=4))

In [77]:
# Display the converted records (one dict per character) as a visual sanity check.
character_stats

[{'Character Name': 'Misha',
  'Stats': {'ATK': {'1': 81.0,
    '2': 85.0,
    '3': 89.0,
    '4': 93.0,
    '5': 97.0,
    '6': 101.0,
    '7': 106.0,
    '8': 110.0,
    '9': 114.0,
    '10': 118.0,
    '11': 122.0,
    '12': 126.0,
    '13': 130.0,
    '14': 134.0,
    '15': 138.0,
    '16': 142.0,
    '17': 146.0,
    '18': 150.0,
    '19': 155.0,
    '20': 159.0,
    '20+': 191.0,
    '21': 195.0,
    '22': 199.0,
    '23': 204.0,
    '24': 208.0,
    '25': 212.0,
    '26': 216.0,
    '27': 220.0,
    '28': 224.0,
    '29': 228.0,
    '30': 232.0,
    '30+': 265.0,
    '31': 269.0,
    '32': 273.0,
    '33': 277.0,
    '34': 281.0,
    '35': 285.0,
    '36': 289.0,
    '37': 293.0,
    '38': 297.0,
    '39': 301.0,
    '40': 306.0,
    '40+': 338.0,
    '41': 342.0,
    '42': 346.0,
    '43': 350.0,
    '44': 354.0,
    '45': 359.0,
    '46': 363.0,
    '47': 367.0,
    '48': 371.0,
    '49': 375.0,
    '50': 379.0,
    '50+': 412.0,
    '51': 416.0,
    '52': 420.0,
    '53': 424