In [None]:
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from pathlib import Path

class HTTPLink:
    """Small URL wrapper that supports pathlib-style joining with ``/``.

    The wrapped string is available as ``.link`` and is what ``str()`` and
    ``repr()`` return. A single trailing slash is dropped on construction so
    that joining a segment inserts exactly one separator.
    """

    def __init__(self, txt):
        """Store ``txt`` as the link, dropping one trailing ``/`` if present."""
        self.link = txt
        self.filter_end()

    def filter_end(self):
        """Remove a single trailing ``/`` from ``self.link`` (no-op otherwise)."""
        # endswith() rather than self.link[-1]: indexing an empty string
        # raises IndexError, while endswith() simply returns False.
        if self.link.endswith('/'):
            self.link = self.link[:-1]

    def __truediv__(self, txt):
        """Return a new HTTPLink for ``self.link + "/" + txt``.

        Raises:
            NotImplementedError: if ``txt`` is not a ``str``.
        """
        if isinstance(txt, str):
            return HTTPLink(self.link + "/" + txt)
        else:
            raise NotImplementedError(f"Can't use / on type {type(txt)}")

    def __str__(self):
        return self.link

    def __repr__(self):
        return self.link
        
# Base URLs: wiki article pages live under /wiki, image paths are joined
# onto the site root.
PALWORLD_WIKI = HTTPLink("https://palworld.wiki.gg/wiki")
PALWORLD_IMG_ROOT = HTTPLink("https://palworld.wiki.gg")
PAL_LIST_URL = PALWORLD_WIKI / "Pals"

# Fetch the pal list page. The timeout keeps a stalled connection from hanging
# the notebook forever, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the pal list.
page = requests.get(str(PAL_LIST_URL), timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

bodyContent = soup.find(id="bodyContent")
results = bodyContent.find_all("article")
# .get() instead of ['data-title']: an <article> without that attribute is
# skipped rather than raising KeyError.
known_pals = [x for x in results if x.get('data-title') == 'Known Pals'][0]
# First table of the "Known Pals" article; its <tr> elements are parsed below
# (the first one is treated as the header row).
tables = known_pals.find_all("table")[0]
rows = tables.find_all("tr")


In [None]:
def parse_pal_row(row):
    """Extract a pal's name and wiki page link from its table cell.

    Looks at the last ``<a>`` inside the first ``<span>`` of ``row``: the
    anchor's ``title`` is the pal name, and the final path segment of its
    ``href`` is joined onto ``PALWORLD_WIKI`` to form the page link.

    Returns:
        A ``(pal_name, pal_link)`` tuple.
    """
    first_span = row.find_all("span")[0]
    anchor = first_span.find_all("a")[-1]
    name = anchor['title']
    page_slug = anchor['href'].split('/')[-1]
    return name, PALWORLD_WIKI / page_slug

def parse_elements_row(elements):
    """Return the element names found in an elements table cell.

    For every ``<span>`` in ``elements``, the ``title`` of the last ``<a>``
    inside that span is collected, in document order.
    """
    names = []
    for span in elements.find_all("span"):
        last_anchor = span.find_all("a")[-1]
        names.append(last_anchor['title'])
    return names

def parse_work_row(work_suitability):
    """Parse a work-suitability table cell into ``(level, title)`` pairs.

    The first entry sits directly inside the cell (it goes straight to the
    ``<span>``); every further entry is wrapped in its own ``<p>`` element.

    Args:
        work_suitability: the cell element; it must provide ``.text``,
            ``.find_all()`` and spans that support ``.find_parent()``.

    Returns:
        A list of ``(lvl, title)`` tuples: the first entry (when the cell has
        a ``<span>`` outside of any ``<p>``), followed by one tuple per
        ``<p>`` in document order. ``lvl`` is the second space-separated token
        of the entry's stripped text; ``title`` is the ``title`` attribute of
        the last ``<a>`` in the entry's last ``<span>``.
    """
    def parse_level(node):
        # Second space-separated token of the stripped text. For the whole
        # cell this is still the first entry's level, because that entry's
        # text comes first.
        return node.text.strip().split(" ")[1].strip()

    def parse_title(span):
        return span.find_all("a")[-1]['title']

    def parse_p(p):
        return parse_level(p), parse_title(p.find_all("span")[-1])

    works = []

    # find_all() searches recursively, so on the cell itself it also returns
    # the spans nested inside the later <p> rows. Taking [-1] of that list
    # would give the first entry the title of the *last* row, so only spans
    # that are not inside any <p> are considered for the first entry.
    leading_spans = [
        span for span in work_suitability.find_all("span")
        if span.find_parent("p") is None
    ]
    if leading_spans:
        works.append((parse_level(work_suitability), parse_title(leading_spans[-1])))

    for p in work_suitability.find_all("p"):
        works.append(parse_p(p))

    return works

def parse_row(row):
    """Parse one pal table row into a flat tuple of its fields.

    The row must contain exactly six ``<td>`` cells, in this order: pal,
    id, pal video id, elements, work suitability, alpha title.

    Returns:
        ``(id, pal_name, pal_link, pal_video_id, elements, works, alpha_title)``
        where the id, video id and alpha title are the stripped cell strings,
        and the remaining fields come from ``parse_pal_row``,
        ``parse_elements_row`` and ``parse_work_row``.
    """
    (pal_cell, id_cell, video_id_cell,
     elements_cell, work_cell, alpha_cell) = row.find_all("td")

    pal_name, pal_link = parse_pal_row(pal_cell)
    pal_id = id_cell.string.strip()
    video_id = video_id_cell.string.strip()
    element_names = parse_elements_row(elements_cell)
    work_levels = parse_work_row(work_cell)
    alpha = alpha_cell.string.strip()

    return pal_id, pal_name, pal_link, video_id, element_names, work_levels, alpha

# Parse every table row except the first one (the header row).
pals = []
for data_row in rows[1:]:
    pals.append(parse_row(data_row))

In [None]:
# Map each pal name to the list of image links found on its wiki page.
pal_img_links = {}
for pal in tqdm(pals, desc="Extracting pal information"):
    pal_name = pal[1]
    pal_link = pal[2]
    # The timeout keeps one stalled page from hanging the whole crawl.
    page = requests.get(str(pal_link), timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    # Images from the tab panels with ids pi-tab-1 and pi-tab-2.
    tab_img_links = []
    for i in range(1, 3):
        tab = soup.find(id=f"pi-tab-{i}")
        if tab is None: break
        for img in tab.find_all("img"):
            # [1:] drops the first character of src before joining it onto
            # the site root (presumably a leading "/" — verify against the page).
            tab_img_links.append(PALWORLD_IMG_ROOT / img['src'][1:])

    # Fallback for pages where the tabs yielded no images: use the thumbnail
    # image(s) in the page body instead.
    if not tab_img_links:
        content = soup.find(id="bodyContent")
        # find_all(), not find(): find() returns a single tag, and iterating
        # over a tag walks its children. An <img> has none, so the loop body
        # never ran and these pals ended up with no image links at all.
        thumbnails = (
            content.find_all("img", class_="pi-image-thumbnail")
            if content is not None else []
        )
        for img in thumbnails:
            tab_img_links.append(PALWORLD_IMG_ROOT / img['src'][1:])

    # Images from gallery boxes in the main content area.
    content_text = soup.find(id="mw-content-text")
    lis = (
        content_text.find_all("li", class_="gallerybox")
        if content_text is not None else []
    )
    gallery_links = []
    for li in lis:
        for img in li.find_all("img"):
            gallery_links.append(PALWORLD_IMG_ROOT / img['src'][1:])

    pal_img_links[pal_name] = tab_img_links + gallery_links
    

In [None]:
import os
import json

# Download every collected image into pal_images/<pal name>/<index>.png and
# write a metadata.jsonl next to the images with one caption record per file.
DOWNLOAD_IMG_ROOT = Path("pal_images")
os.makedirs(DOWNLOAD_IMG_ROOT, exist_ok=True)
for pal, links in tqdm(pal_img_links.items(), desc="Downloading pal images"):
    metadata = []
    pal_root = DOWNLOAD_IMG_ROOT / pal
    for i, link in enumerate(links):
        # The timeout keeps one stalled download from hanging the whole run.
        response = requests.get(str(link), timeout=30)
        if not response.ok:
            # Do not write an HTTP error page to disk as if it were an image.
            tqdm.write(f"Skipping {link}: HTTP {response.status_code}")
            continue

        # Create the pal folder lazily, only once there is an image to put in
        # it, so pals without any image never leave a folder behind.
        os.makedirs(pal_root, exist_ok=True)
        fname = f'{i}.png'
        with open(pal_root / fname, 'wb') as handler:
            handler.write(response.content)

        metadata.append({"file_name": fname, "text": f"A photo of a {pal}"})

    # Nothing was saved for this pal: skip the metadata file too, so no
    # image-less folder is created.
    if not metadata:
        continue

    with open(pal_root / "metadata.jsonl", 'w') as f:
        for data in metadata:
            f.write(json.dumps(data) + "\n")
    
    
            
# Note: the scraper finds no image links for some pals, even though the wiki clearly has images for them.
# Do not save empty folders: they break the training.