In [2]:
import json
from io import StringIO
from time import sleep

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [1]:
# XPath locators used by StatMiner. They match on exact CSS class strings, so
# any change to the site's styling classes will stop them from matching.

# Element clicked by StatMiner.close_menu() to dismiss the settings menu.
CLOSE_SETTINGS_MENU_ICON = '//div[@class="absolute right-3 cursor-pointer p-4"]'
# Link to the avatar archive page. Not referenced by any code visible in this
# notebook section -- NOTE(review): confirm whether it is still needed.
LC_SELECT_ICON = '//a[@href="/en/archive/avatar"]'
# Container whose <a> descendants StatMiner.get_lcs() clicks one by one.
# ("LC" presumably stands for "light cone" -- TODO confirm.)
LC_GRID = '//div[@class="flex flex-row flex-wrap justify-center gap-6 mt-2"]'
# Heading on a detail page whose innerText get_lcs() records as the entry name.
LC_NAME = './/h1[@class="font-bold my-2 text-3xl text-white"]'

class StatMiner:
    """Selenium-driven scraper that collects one stat table per linked entry.

    Typical use: ``create_driver()`` -> ``close_menu()`` -> ``get_lcs()`` ->
    ``close_driver()``.

    Attributes:
        target_url: Page opened by ``create_driver``; expected to contain the
            element matched by ``LC_GRID``.
        timeout: Seconds each explicit wait may block before Selenium raises
            ``TimeoutException``.
        driver: The Chrome WebDriver, or ``None`` while no browser is open.
    """

    def __init__(self, target_url, timeout=10):
        """Store the settings; no browser is started until ``create_driver``.

        Args:
            target_url: URL of the grid page to scrape.
            timeout: Maximum seconds to wait for each page element.
        """
        self.target_url = target_url
        self.timeout = timeout
        self.driver = None

    def create_driver(self):
        """Start Chrome, open ``target_url`` and maximize the window."""
        self.driver = webdriver.Chrome()
        self.driver.get(self.target_url)
        self.driver.maximize_window()

    def close_driver(self):
        """Quit the browser if one is open; safe to call more than once."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        finally:
            # Drop the handle even if quit() fails so it is never reused.
            self.driver = None

    def _wait(self):
        """Return an explicit wait bound to the current driver and timeout."""
        return WebDriverWait(self.driver, self.timeout)

    def _find_lc_links(self):
        """Wait for the grid container and return every ``<a>`` inside it."""
        lc_grid = self._wait().until(
            EC.presence_of_element_located((By.XPATH, LC_GRID))
        )
        return lc_grid.find_elements(By.TAG_NAME, 'a')

    def close_menu(self):
        """Dismiss the settings menu by clicking its close icon.

        Raises:
            selenium.common.exceptions.TimeoutException: if the menu title or
                a clickable close icon does not show up within ``timeout``.
        """
        wait = self._wait()
        # Wait for the menu instead of looking it up immediately: the page may
        # still be rendering right after driver.get() returns.
        wait.until(
            EC.presence_of_element_located((By.XPATH, '//div[@name="title"]'))
        )
        # CLOSE_SETTINGS_MENU_ICON is an absolute ('//...') XPath, so it is
        # resolved against the whole document rather than the title element.
        x_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, CLOSE_SETTINGS_MENU_ICON))
        )
        x_button.click()

    def get_lcs(self):
        """Visit every link in the grid and scrape its name and stat table.

        For each link: open the detail page, read the heading text, click the
        "Slider" button so the stats are rendered as a table, parse that table
        with pandas, then navigate back to the grid.

        Returns:
            tuple: ``(df_list, name_list)`` -- two lists of equal length where
            ``df_list[k]`` is the DataFrame parsed for the entry whose heading
            text is ``name_list[k]``.

        Raises:
            selenium.common.exceptions.TimeoutException: if an expected
                element does not appear within ``timeout`` seconds.
        """
        # The first lookup is only used to learn how many entries exist.
        lc_count = len(self._find_lc_links())

        df_list = []
        name_list = []
        for i in range(lc_count):
            # Re-locate the links on every pass: element references obtained
            # before navigating away and back are stale.
            self._find_lc_links()[i].click()

            # find lc name
            lc_name_element = self._wait().until(
                EC.presence_of_element_located((By.XPATH, LC_NAME))
            )
            name_list.append(lc_name_element.get_attribute('innerText'))

            # The button must be interactable, not merely present, to click it.
            slider_button = self._wait().until(
                EC.element_to_be_clickable((By.XPATH, '//button[text()="Slider"]'))
            )
            slider_button.click()

            # get a dataframe from the table
            stat_table = self._wait().until(
                EC.presence_of_element_located((By.NAME, 'STAT TABLE'))
            )
            stat_table_html = stat_table.get_attribute('outerHTML')
            # Passing literal HTML to read_html is deprecated (pandas >= 2.1);
            # a file-like wrapper works on old and new versions alike.
            lc_df = pd.read_html(StringIO(stat_table_html))[0]
            df_list.append(lc_df)

            self.driver.back()
        return df_list, name_list

In [3]:
# Grid page that links to every entry to be scraped.
YATTA_URL = 'https://hsr.yatta.top/en/archive/equipment'
scraper = StatMiner(YATTA_URL)
try:
    scraper.create_driver()
    scraper.close_menu()
    df_list, name_list = scraper.get_lcs()
finally:
    # Shut the browser down whether or not the scrape succeeded; otherwise
    # every (re-)run of this cell leaves a Chrome session behind.
    if scraper.driver is not None:
        scraper.driver.quit()
        scraper.driver = None

In [16]:
# Shallow copies of the scrape results: the lists are new objects, while the
# DataFrames and name strings inside them are shared with the originals.
dfs = list(df_list)
names = list(name_list)

def convert_to_json(df, name):
    """Reshape one scraped stat table into a JSON-serializable record.

    The table is transposed, the values of its first column become the keys of
    the outer mapping, gaps are forward-filled along the transposed rows, and
    every value is cast to float.

    Args:
        df: DataFrame whose first column holds the labels and whose remaining
            columns hold numeric values (missing values allowed). The frame
            itself is not modified.
        name: Value stored under the ``'Name'`` key of the result.

    Returns:
        dict: ``{'Name': name, 'Stats': {label: {column: value, ...}, ...}}``
        where ``label`` comes from the first column of ``df`` and ``column``
        is one of its remaining column labels (as a string).

    Raises:
        ValueError: if a value cannot be converted to float.
    """
    transposed = df.T
    # After transposing, the first row carries the labels from df's first column.
    transposed.columns = transposed.iloc[0, :]
    # .ffill() replaces fillna(method='ffill'), which pandas >= 2.1 deprecates.
    stats = transposed.iloc[1:, :].ffill().astype(float)
    return {
        'Name': name,
        'Stats': json.loads(stats.to_json()),
    }

# Build one record per scraped table, pairing each table with the name stored
# at the same position.
jsons = [convert_to_json(dfs[idx], names[idx]) for idx in range(len(dfs))]

# Write all records to disk as a single pretty-printed JSON array.
with open('../general_logic/data/hsr_lc_stats.json', 'w') as out_file:
    json.dump(jsons, out_file, indent=4)