In [2]:
import json
import os
from time import sleep

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [18]:
# XPath locators used by RelicBasicDataScraper. Each one matches a <div> by its
# exact, complete `class` attribute string, so a change to the page's CSS
# classes will stop the locator from matching.

# Container on the listing page that holds the relic-set cards.
RELIC_GRID = '//div[@class="grid grid-cols-1 md:grid-cols-2 h-100 p-4 text-slate-950"]'
# Grid awaited after opening a set whose type is 'Relic'.
RELIC_PIECES = '//div[@class="grid grid-cols-3 m-4 whitespace-nowrap"]'
# Grid awaited after opening a set whose type is 'Planar Ornament'.
ORNAMENTS_GRID = '//div[@class="grid grid-cols-2 m-4 whitespace-nowrap"]'
# Clickable element for a single piece; its first <div> text and its <img> src are read.
# NOTE(review): this expression starts with '//', so XPath evaluates it against the
# whole document even when it is passed to an element's find_elements().
RELIC_PIECE_DIV = '//div[@class="relative p-4 rounded-full aspect-square border-2 border-slate-400 hover:scale-110 hover:border-slate-100 shadow-[0px_0px_5px_0px_white] transition-all duration-[0.2s] ease-[ease-in-out];"]'
# Element whose innerText is read as the piece name after a piece has been clicked.
RELIC_PIECE_NAME = '//div[@class="text-lg pb-1 font-bold flex"]'

class RelicBasicDataScraper:
    """Scrape relic-set metadata and images from a relic listing page with Selenium.

    Typical use: construct with the listing URL, call ``create_driver()`` to open
    the browser, then call ``mine_relics()`` to collect the data. ``mine_relics()``
    starts the browser itself if ``create_driver()`` has not been called.
    """

    # Relative XPath of one relic-set card inside the RELIC_GRID container.
    RELIC_CARD_XPATH = './/div[@class="grid grid-cols-6 p-2 text-slate-100 py-4"]'
    # Extension given to every saved image file.
    IMAGE_EXTENSION = '.webp'
    # Seconds to wait for an image download before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Store the listing-page URL; no browser is started yet."""
        self.url = url
        self.driver = None
        # Set by create_driver(); initialised here so the attribute always exists.
        self.wait = None

    def create_driver(self):
        """Start Chrome, load ``self.url``, maximise the window and build a 10 s explicit wait."""
        self.driver = webdriver.Chrome()
        self.driver.get(self.url)
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 10)

    @classmethod
    def _piece_image_path(cls, image_path, piece_type):
        """Return the file path for a piece image: the set's path with ``piece_type`` before the extension."""
        ext = cls.IMAGE_EXTENSION
        stem = image_path[:-len(ext)] if image_path.endswith(ext) else image_path
        return stem + piece_type + ext

    def _download_image(self, link, path):
        """Download ``link`` and write the bytes to ``path``, creating the target directory if needed."""
        response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of saving an HTTP error page as an image file.
        response.raise_for_status()
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            f.write(response.content)

    def _relic_cards(self):
        """Locate the relic grid on the current page and return its card elements."""
        relic_grid = self.wait.until(EC.presence_of_element_located((By.XPATH, RELIC_GRID)))
        return relic_grid.find_elements(By.XPATH, self.RELIC_CARD_XPATH)

    def _mine_pieces(self, grid_xpath, image_path):
        """Click through every piece on the open detail page and return ``{piece_type: piece_info}``.

        ``grid_xpath`` is the locator awaited before pieces are read; ``image_path``
        is the set's own image path, from which each piece's file name is derived.
        """
        pieces = {}
        pieces_grid = self.wait.until(EC.presence_of_element_located((By.XPATH, grid_xpath)))
        # NOTE(review): RELIC_PIECE_DIV starts with '//', so this lookup is evaluated
        # against the whole document rather than only inside pieces_grid. Kept as-is
        # so the same elements are matched as before.
        for piece_div in pieces_grid.find_elements(By.XPATH, RELIC_PIECE_DIV):
            piece_div.click()
            # NOTE(review): a presence wait may return immediately with the previously
            # shown name if the page reuses the element — confirm against the live site.
            piece_name = self.wait.until(
                EC.presence_of_element_located((By.XPATH, RELIC_PIECE_NAME))
            ).get_attribute('innerText')
            piece_type = piece_div.find_element(By.TAG_NAME, 'div').get_attribute('innerText').replace(' ', '')
            piece_img_link = piece_div.find_element(By.TAG_NAME, 'img').get_attribute('src')
            piece_img_path = self._piece_image_path(image_path, piece_type)
            self._download_image(piece_img_link, piece_img_path)
            pieces[piece_type] = {
                'Name': piece_name,
                'RelicType': piece_type,
                'ImageLink': piece_img_link,
                'ImagePath': piece_img_path,
            }
        return pieces

    def mine_relics(self):
        """Scrape every relic set on the listing page.

        For each set this saves the set image and every piece image under
        ``../assets/relic_images/`` and records name, type, image link/path and pieces.

        Returns:
            dict mapping the set id (last segment of the card's link) to a dict with
            keys ``Name``, ``RelicType``, ``ImageLink``, ``ImagePath`` and ``Pieces``.

        Raises:
            selenium TimeoutException if an awaited element never appears, and
            ``requests`` exceptions if an image download fails or times out.
            The browser is shut down in either case.
        """
        if self.driver is None:
            self.create_driver()

        relics = {}
        try:
            relic_count = len(self._relic_cards())
            for index in range(relic_count):
                # Re-locate the cards on every pass: opening a detail page and going
                # back invalidates previously found element references.
                relic_div = self._relic_cards()[index]
                link = relic_div.find_element(By.TAG_NAME, 'a')

                # The id is the last path segment of the card's link.
                relic_id = link.get_attribute('href').split('/')[-1]
                name = (
                    relic_div.find_element(By.TAG_NAME, 'div')
                    .find_element(By.TAG_NAME, 'a')
                    .get_attribute('innerText')
                )
                image_link = link.find_element(By.TAG_NAME, 'img').get_attribute('src')
                image_path = (
                    f"../assets/relic_images/{name.replace(' ', '').replace(':', '')}"
                    f"{self.IMAGE_EXTENSION}"
                )
                self._download_image(image_link, image_path)

                # Ids beginning with '3' are treated as planar ornaments.
                is_ornament = relic_id.startswith('3')
                relic_type = 'Planar Ornament' if is_ornament else 'Relic'

                relic_info = {
                    'Name': name,
                    'RelicType': relic_type,
                    'ImageLink': image_link,
                    'ImagePath': image_path,
                }

                # Open the detail page, collect the pieces, then return to the listing.
                link.click()
                grid_xpath = ORNAMENTS_GRID if is_ornament else RELIC_PIECES
                relic_info['Pieces'] = self._mine_pieces(grid_xpath, image_path)
                self.driver.back()

                relics[relic_id] = relic_info
        finally:
            # quit() ends the whole browser session even when scraping failed midway.
            self.driver.quit()
            self.driver = None
            self.wait = None
        return relics

In [19]:
# Listing page handed to the scraper as its start URL.
HAKUSHIN_URL = 'https://hsr3.hakush.in/relicset'

# Open Chrome on the listing page, then scrape every relic set. mine_relics()
# also writes image files under ../assets/relic_images/ and closes the browser
# when it finishes.
scraper = RelicBasicDataScraper(HAKUSHIN_URL)
scraper.create_driver()
relic_dict = scraper.mine_relics()

In [20]:
# Persist the scraped relic dictionary as pretty-printed JSON.
with open('../data/hsr_relic_dictionary.json', mode='w') as json_file:
    json.dump(relic_dict, fp=json_file, indent=4)