In [7]:
import requests, pandas as pd, time
from lxml import html
from urllib.parse import urljoin
from tqdm import tqdm

BASE   = "https://knowyourmeme.com"
LIST   = f"{BASE}/categories/meme"
HDRS   = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."}
SEL    = ("body > main > article > article > section"
          " > div.contents-container > div.contents"
          " > section > div:nth-child(1) > a:nth-child(odd)")

def detail_urls(list_html: str):
    tree  = html.fromstring(list_html)
    return [urljoin(BASE, a.get("href")) for a in tree.cssselect(SEL)]

def parse_detail(url: str):
    t = html.fromstring(requests.get(url, headers=HDRS, timeout=20).text)
    return {
        "title": t.xpath('normalize-space(//h1)'),
        "about": t.xpath('normalize-space(//*[@id="entry_section_about"]/p[1])'),
        "added": (t.xpath('string(//section[@id="entry_about"]//span[@class="time"]/@datetime)') or "")[:10],
        "url":   url,
    }

def crawl(pages=1, pause=1.5):
    rows = []
    for p in range(1, pages+1):
        url  = LIST if p == 1 else f"{LIST}/page/{p}"
        res  = requests.get(url, headers=HDRS, timeout=20)
        for durl in tqdm(detail_urls(res.text), desc=f"Page {p}"):
            try:
                rows.append(parse_detail(durl))
            except Exception as e:
                print("skip:", durl, e)
            time.sleep(pause)
        time.sleep(pause)
    return pd.DataFrame(rows)

if __name__ == "__main__":
    df = crawl(pages=1)          # 필요하면 pages 값을 늘리세요
    df.to_csv("kym_meme.csv", index=False, encoding="utf-8-sig")
    print("saved", len(df), "rows")


Page 1: 0it [00:00, ?it/s]


saved 0 rows


In [15]:
import requests
from bs4 import BeautifulSoup
BASE = 'https://knowyourmeme.com'
LIST   = f"{BASE}/categories/meme"

page = requests.get(BASE)
soup = BeautifulSoup(page.text, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en" xmlns="https://www.w3.org/1999/xhtml" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<title>
Internet Meme Database | Know Your Meme
</title>
<link href="https://knowyourmeme.com/" rel="canonical"/>
<link href="https://a.kym-cdn.com" rel="preconnect">
<link href="https://i.kym-cdn.com" rel="preconnect"/>
<link href="https://ads.blogherads.com" rel="preconnect"/>
<link href="https://a.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://i.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://ads.blogherads.com" rel="dns-prefetch"/>
<meta content="Know Your Meme" property="og:title"/>
<meta content="Know Your Meme" property="og:site_name"/>
<meta content="https://a.kym-cdn.com/assets/kym-logo-large-2be3f3818691470a0369e154647ca0f0.png" property="og:image"/>
<meta content="article" property="og:type"/>
<meta content="104675392961482" property="fb:app_id"/>
<meta content="88519108736" property="fb:pages"/>
<meta content="https://www.facebook.com/kno

In [38]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper            # Cloudflare 우회용 (pip install cloudscraper)
from bs4 import BeautifulSoup

BASE  = "https://knowyourmeme.com"
LIST  = f"{BASE}/categories/meme"
DELAY = 1.0                    # 페이지당 예의상 딜레이(초)

# ────────────────────────────────────────────────────────
# 1) Cloudflare 우회 세션 만들기
# ────────────────────────────────────────────────────────
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "desktop": True},
)

# 밈 URL 패턴: /memes/… 또는 /memes/subcultures/… 도 포함하려면 r"^/memes/"
HREF_RE = re.compile(r"^/memes/[^/?#]+")

def extract_links(html_text: str) -> list[str]:
    """HTML 한 페이지에서 밈 상세 href를 절대경로로 변환해 반환"""
    soup = BeautifulSoup(html_text, "html.parser")
    links = [
        urljoin(BASE, a["href"])
        for a in soup.find_all("a", href=HREF_RE)
    ]
    return links

def crawl_all() -> list[str]:
    """페이지네이션을 따라가며 모든 밈 URL 수집"""
    collected: set[str] = set()
    page = 1
    while True:
        url = LIST if page == 1 else f"{LIST}/page/{page}"
        print(f"📥  {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print("🚫  HTTP error", res.status_code)
            break

        links = extract_links(res.text)
        if not links:                      # 더 이상 밈이 없으면 끝
            break

        collected.update(links)
        page += 1
        time.sleep(DELAY)

    return sorted(collected)

if __name__ == "__main__":
    meme_urls = crawl_all()
    print(f"✅  collected {len(meme_urls):,} meme links")

    # CSV 저장
    with open("kym_memes.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["url"])
        writer.writerows([[u] for u in meme_urls])

    print("💾  saved → kym_memes.csv")


📥  https://knowyourmeme.com/categories/meme
📥  https://knowyourmeme.com/categories/meme/page/2
📥  https://knowyourmeme.com/categories/meme/page/3
📥  https://knowyourmeme.com/categories/meme/page/4
📥  https://knowyourmeme.com/categories/meme/page/5
📥  https://knowyourmeme.com/categories/meme/page/6
📥  https://knowyourmeme.com/categories/meme/page/7
📥  https://knowyourmeme.com/categories/meme/page/8
📥  https://knowyourmeme.com/categories/meme/page/9
📥  https://knowyourmeme.com/categories/meme/page/10
📥  https://knowyourmeme.com/categories/meme/page/11
📥  https://knowyourmeme.com/categories/meme/page/12
📥  https://knowyourmeme.com/categories/meme/page/13
📥  https://knowyourmeme.com/categories/meme/page/14
📥  https://knowyourmeme.com/categories/meme/page/15
📥  https://knowyourmeme.com/categories/meme/page/16
📥  https://knowyourmeme.com/categories/meme/page/17
📥  https://knowyourmeme.com/categories/meme/page/18
📥  https://knowyourmeme.com/categories/meme/page/19
📥  https://knowyourmeme.com/

In [31]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper            # Cloudflare 우회용 (pip install cloudscraper)
from bs4 import BeautifulSoup

BASE  = "https://knowyourmeme.com"
PATH = "/categories/meme"  
QS   = "?sort=views&status=confirmed"
LIST  = f"{BASE}/categories/meme"
DELAY = 1.0                    # 페이지당 예의상 딜레이(초)

# ────────────────────────────────────────────────────────
# 1) Cloudflare 우회 세션 만들기
# ────────────────────────────────────────────────────────
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "desktop": True},
)

# 밈 URL 패턴: /memes/… 또는 /memes/subcultures/… 도 포함하려면 r"^/memes/"
HREF_RE = re.compile(r"^/memes/[^/?#]+")

def extract_links(html_text: str) -> list[str]:
    """HTML 한 페이지에서 밈 상세 href를 절대경로로 변환해 반환"""
    soup = BeautifulSoup(html_text, "html.parser")
    links = [
        urljoin(BASE, a["href"])
        for a in soup.find_all("a", href=HREF_RE)
    ]
    return links

def crawl_all() -> list[str]:
    """페이지네이션을 따라가며 모든 밈 URL 수집"""
    collected: set[str] = set()
    page = 1
    while True:
        if page == 1:
            url = f"{BASE}{PATH}{QS}"
        else:
            url = f"{BASE}{PATH}/page/{page}{QS}"
        print(f"📥  {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print("🚫  HTTP error", res.status_code)
            break

        links = extract_links(res.text)
        if not links:                      # 더 이상 밈이 없으면 끝
            break

        collected.update(links)
        page += 1
        time.sleep(DELAY)

    return sorted(collected)

if __name__ == "__main__":
    meme_urls = crawl_all()
    print(f"✅  collected {len(meme_urls):,} meme links")

    # CSV 저장
    with open("kym_memes_views.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["url"])
        writer.writerows([[u] for u in meme_urls])

    print("💾  saved → kym_memes.csv")


📥  https://knowyourmeme.com/categories/meme?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/2?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/3?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/4?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/5?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/6?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/7?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/8?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/9?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/10?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/11?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/12?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/13?sort

In [35]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper            # Cloudflare 우회용 (pip install cloudscraper)
from bs4 import BeautifulSoup

BASE  = "https://knowyourmeme.com"
PATH = "/categories/meme"  
QS   = "?sort=images&status=confirmed"
LIST  = f"{BASE}/categories/meme"
DELAY = 1.0                    # 페이지당 예의상 딜레이(초)

# ────────────────────────────────────────────────────────
# 1) Cloudflare 우회 세션 만들기
# ────────────────────────────────────────────────────────
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "desktop": True},
)

# 밈 URL 패턴: /memes/… 또는 /memes/subcultures/… 도 포함하려면 r"^/memes/"
HREF_RE = re.compile(r"^/memes/[^/?#]+")

def extract_links(html_text: str) -> list[str]:
    """HTML 한 페이지에서 밈 상세 href를 절대경로로 변환해 반환"""
    soup = BeautifulSoup(html_text, "html.parser")
    links = [
        urljoin(BASE, a["href"])
        for a in soup.find_all("a", href=HREF_RE)
    ]
    return links

def crawl_all() -> list[str]:
    """페이지네이션을 따라가며 모든 밈 URL 수집"""
    collected: set[str] = set()
    page = 1
    while True:
        if page == 1:
            url = f"{BASE}{PATH}{QS}"
        else:
            url = f"{BASE}{PATH}/page/{page}{QS}"
        print(f"📥  {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print("🚫  HTTP error", res.status_code)
            break

        links = extract_links(res.text)
        if not links:                      # 더 이상 밈이 없으면 끝
            break

        collected.update(links)
        page += 1
        time.sleep(DELAY)

    return sorted(collected)

if __name__ == "__main__":
    meme_urls = crawl_all()
    print(f"✅  collected {len(meme_urls):,} meme links")

    # CSV 저장
    with open("kym_memes_images.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["url"])
        writer.writerows([[u] for u in meme_urls])

    print("💾  saved → kym_memes.csv")


📥  https://knowyourmeme.com/categories/meme?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/2?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/3?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/4?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/5?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/6?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/7?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/8?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/9?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/10?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/11?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/12?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/

In [40]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper
from bs4 import BeautifulSoup

##############################################################################
# 설정
##############################################################################
BASE   = "https://knowyourmeme.com"
PATH   = "/categories/meme"
CATS   = ["views", "oldest", "chronological", "reverse-chronological",
          "comments", "images", "videos"]
DELAY  = 1.0                 # 요청 간 딜레이 (초)
OUTCSV = "kym_memes_all.csv"

# /memes/… (subculture 포함) 링크만 필터
HREF_RE = re.compile(r"^/memes(?:/subcultures)?/[^/?#]+")

##############################################################################
# 공통 세션 (Cloudflare 우회)
##############################################################################
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "desktop": True},
)

##############################################################################
# 헬퍼 함수
##############################################################################
def extract_links(html_text: str) -> list[str]:
    soup = BeautifulSoup(html_text, "html.parser")
    return [urljoin(BASE, a["href"])
            for a in soup.find_all("a", href=HREF_RE)]

def crawl_one_sort(sort_key: str) -> set[str]:
    """주어진 sort 파라미터에서 모든 페이지를 돌며 링크 집합 반환"""
    collected = set()
    page = 1
    while True:
        qs  = f"?sort={sort_key}&status=confirmed"
        url = (f"{BASE}{PATH}{qs}" if page == 1
               else f"{BASE}{PATH}/page/{page}{qs}")
        print(f"📥  [{sort_key}] {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print(" 🚫  HTTP", res.status_code, "→ 중단")
            break

        links = extract_links(res.text)
        if not links:
            break

        collected.update(links)
        page += 1
        time.sleep(DELAY)

    print(f" ✅  {sort_key}: {len(collected):,} links")
    return collected

##############################################################################
# 메인 크롤링 루프
##############################################################################
if __name__ == "__main__":
    all_links: set[str] = set()
    for cat in CATS:
        all_links |= crawl_one_sort(cat)      # 집합 합집합(중복 제거)

    print(f"\n🎉  TOTAL unique links: {len(all_links):,}")

    # CSV 저장
    with open(OUTCSV, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["url"])
        writer.writerows([[u] for u in sorted(all_links)])

    print(f"💾  saved → {OUTCSV}")


📥  [views] https://knowyourmeme.com/categories/meme?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/2?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/3?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/4?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/5?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/6?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/7?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/8?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/9?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/10?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/11?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/m

In [39]:
pages = ["views","oldest","chronological","reverse-chronological","comments","images","videos"]

In [3]:
# kym_scraper.py  ── 핵심 부분만 발췌
import re, json, csv, time, datetime as dt
from pathlib import Path
from bs4 import BeautifulSoup
import cloudscraper, pandas as pd


################################################################################
# 1)  엔트리 HTML → dict  (필드가 없으면 '' 반환)
################################################################################
SECTION_NAMES = {
    'about'            : 'about',
    'origin'           : 'origin',
    'precursors'       : 'precursors',
    'spread'           : 'spread',
    'search interest'  : 'search_interest',
    'notable examples' : 'notable_examples'
}

def text_normalize(s: str) -> str:
    return re.sub(r'\s+', ' ', s).strip()

def extract_section(soup: BeautifulSoup, header: str) -> str:
    """h2/h3 제목이 `header` 와 일치하는 블록을 찾아 내용을 이어붙임"""
    h = soup.find(lambda t: t.name in ('h2', 'h3') and
                            text_normalize(t.get_text()).lower() == header)
    if not h:
        return ''
    chunks = []
    for sib in h.find_next_siblings():
        if sib.name in ('h2', 'h3'):
            break
        chunks.append(text_normalize(sib.get_text(' ', strip=True)))
    return ' '.join(chunks).strip()

def parse_entry(html: str, url: str) -> dict:
    soup = BeautifulSoup(html, 'lxml')

    # ── ①  타이틀 / 대표 이미지 --------------------------------------------------
    title  = soup.find('meta', property='og:title')
    header = soup.find('meta', property='og:image')
    result = {
        'url'          : url,
        'title'        : title['content']  if title  else '',
        'header_image' : header['content'] if header else ''
    }

    # ── ②  좌측(혹은 우측) 메타 정보 ---------------------------------------------
    stats = soup.select_one('aside.stats dl')
    for field in ('Type', 'Status', 'Year', 'Origin', 'Added', 'Views'):
        result[field.lower()] = ''
    if stats:
        dts = [d.get_text(strip=True) for d in stats.find_all('dt')]
        dds = [d.get_text(strip=True) for d in stats.find_all('dd')]
        meta = dict(zip(dts, dds))
        result.update({
            'type'   : meta.get('Type',   ''),
            'status' : meta.get('Status', ''),
            'year'   : meta.get('Year',   ''),
            'origin' : meta.get('Origin', ''),
            'added'  : meta.get('Added',  ''),
            'views'  : meta.get('Views',  '')
        })

    # ── ③  태그(keyword) ---------------------------------------------------------
    tags = ''
    for scr in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(scr.string)
            if isinstance(data, dict) and data.get('keywords'):
                tags = data['keywords']
                break
        except Exception:
            pass
    result['tags'] = tags

    # ── ④  본문 섹션들 -----------------------------------------------------------
    for h_txt, col in SECTION_NAMES.items():
        result[col] = extract_section(soup, h_txt)

    return result


################################################################################
# 2)  URL 리스트 → CSV
################################################################################
def scrape_urls(urls, delay: float = 1.0) -> list[dict]:
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "desktop": True},
        delay=delay
    )
    rows = []
    for i, url in enumerate(urls, 1):
        try:
            print(f'[{i:>5}/{len(urls)}]  GET {url}')
            html = scraper.get(url, timeout=30).text
            rows.append(parse_entry(html, url))
        except Exception as e:
            print('  ↳ ERROR:', e)
    return rows


################################################################################
# 3)  main
################################################################################
def main(in_csv: str = 'test.csv',
         out_csv: str = 'kym_entries.csv',
         delay : float = 1.0):

    urls = pd.read_csv(in_csv)['url'].dropna().unique().tolist()
    rows = scrape_urls(urls, delay=delay)

    # 컬럼 순서 고정
    cols = ['url','title','type','status','year','origin','added','views',
            'tags','about','origin_text','precursors','spread',
            'search_interest','notable_examples','header_image']
    pd.DataFrame(rows).reindex(columns=cols).to_csv(out_csv, index=False, encoding = 'utf8')
    print(f'✓ saved → {out_csv}')


if __name__ == '__main__':
    main('test.csv', 'kym_entries.csv', delay=1.0)



[    1/10]  GET https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face
[    2/10]  GET https://knowyourmeme.com/memes/%D0%B4%D1%80%D1%83%D0%B3-apyr
[    3/10]  GET https://knowyourmeme.com/memes/%D0%BD%D0%B5%D1%82-no-poster
[    4/10]  GET https://knowyourmeme.com/memes/%E0%B2%A0_%E0%B2%A0-look-of-disapproval
[    5/10]  GET https://knowyourmeme.com/memes/05x-a-presses-but-first-we-need-to-talk-about-parallel-universes
[    6/10]  GET https://knowyourmeme.com/memes/09-f9-11-02-9d-74-e3-5b-d8-41-56-c5-63-56-88-c0
[    7/10]  GET https://knowyourmeme.com/memes/1-2-buckle-my-shoe-3-4-buckle-some-more-5-6-nike-kicks
[    8/10]  GET https://knowyourmeme.com/memes/1-2-oatmeal
[    9/10]  GET https://knowyourmeme.com/memes/1-am-by-civ-pop-out-at-one-in-the-morning-ai-bear-song
[   10/10]  GET https://knowyourmeme.com/memes/1-billion-lions-vs-1-of-every-pokemon
✓ saved → kym_entries.csv


In [2]:
!pip install cloudscraper

Collecting cloudscraper
  Using cached cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting pyparsing>=2.4.7 (from cloudscraper)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Collecting requests>=2.9.2 (from cloudscraper)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting requests-toolbelt>=0.9.1 (from cloudscraper)
  Using cached requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting idna<4,>=2.5 (from requests>=2.9.2->cloudscraper)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.9.2->cloudscraper)
  Using cached urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.9.2->cloudscraper)
  Using cached certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Using cached cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
Using cached pyparsing-3.2.3-py3-none-any.whl (111 kB)
Using cached requests-2.32.3-py3-none-an

In [44]:
page = requests.get("https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face")
soup = BeautifulSoup(page.text, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en" xmlns="https://www.w3.org/1999/xhtml" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<title>
( ͡° ͜ʖ ͡°) / Lenny Face | Know Your Meme
</title>
<link href="https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face" rel="canonical"/>
<link href="https://a.kym-cdn.com" rel="preconnect">
<link href="https://i.kym-cdn.com" rel="preconnect"/>
<link href="https://ads.blogherads.com" rel="preconnect"/>
<link href="https://a.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://i.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://ads.blogherads.com" rel="dns-prefetch"/>
<link as="image" href="https://i.kym-cdn.com/entries/icons/mobile/000/011/764/LennyFace.jpg" rel="preload"/>
<link as="image" href="https://i.kym-cdn.com/featured_items/icons/wide/000/028/421/cover16.jpg" rel="preload"/>
<link as="image" href="https://i.kym-cdn.com/featured_items/icons/wide/000/028/420/maxresdefault.jpg" rel="preload"/>
<link as="image" href="https://i.kym-c

In [10]:
# scraper_kym.py
import re, json, time, csv, sys, unicodedata, pathlib
from typing import Dict, List
from bs4 import BeautifulSoup
import cloudscraper             # cf-bypass 세션
scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "desktop": True},
            delay=1.5)           # KYM rate-limit 완화용

# ---------- 헬퍼 ----------
def text(el) -> str:
    """모든 공백을 1칸으로 정리해 텍스트만 추출"""
    if not el: return ""
    return re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip()

def norm(s: str) -> str:         # csv 깨짐 방지용 NFC
    return unicodedata.normalize("NFC", s) if s else ""

def section_text(soup: BeautifulSoup, name: str) -> str:
    """h2(또는 h3) 제목이 `name` 인 섹션의 본문 단락을 이어 붙임"""
    h = soup.find(lambda tag: tag.name in ("h2", "h3") and 
                             re.fullmatch(name, tag.text.strip(), flags=re.I))
    if not h:
        return ""
    parts = []
    for sib in h.find_all_next():
        if sib.name in ("h2", "h3"):
            break
        if sib.name in ("p", "ul", "ol", "blockquote"):
            parts.append(text(sib))
    return " ".join(parts).strip()

# ---------- 메인 ----------
def scrape_entry(url: str) -> Dict[str, str]:
    """KYM 엔트리 1 개를 크롤링해서 Dict 반환"""
    # ① html 가져오기 ─ 실패하면 AMP 백업
    r = scraper.get(url, timeout=20)
    if not r.ok:
        amp_url = url.replace("knowyourmeme.com/", "amp.knowyourmeme.com/")
        r = scraper.get(amp_url, timeout=20)
    soup = BeautifulSoup(r.text, "lxml")

    out = dict(url=url)          # 결과 dict

    # ② ─ title ---------------------------------------------------------
    # (1) JSON-LD 선호 - 실패시 <title>
    try:
        ld = json.loads(soup.select_one(
             'script[type="application/ld+json"]').string)
        out["title"] = ld.get("headline") or ld.get("name") or ""
        out["tags"]  = ", ".join((ld.get("keywords") or "").split(", "))
    except Exception:
        t = soup.title.text if soup.title else ""
        out["title"] = t.replace(" | Know Your Meme", "")

    # ③ ─ HEADER stats(데스크톱 버전) ------------------------------------
    stats = {}
    for dl in soup.select("aside.stats dl"):
        dts = dl.find_all("dt")
        dds = dl.find_all("dd")
        for dt_el, dd_el in zip(dts, dds):
            key = dt_el.text.strip(": ").lower()
            stats[key] = text(dd_el)

    # ④ ─ AMP 백업 루트(텍스트 스캔) ------------------------------------
    amp_txt = soup.get_text(" ", strip=True)
    if not stats.get("year"):
        m = re.search(r"\bYear\s+(\d{4})\b", amp_txt, re.I)
        if m: stats["year"] = m.group(1)
    if not stats.get("origin"):
        m = re.search(r"\bOrigin\s+([A-Za-z &]+?)\s{2,}", amp_txt)
        if m: stats["origin"] = m.group(1)
    if not stats.get("type"):
        m = re.search(r"\bType\s+([A-Za-z -]+?)\s{2,}", amp_txt)
        if m: stats["type"] = m.group(1)

    # views / status (AMP 헤더 패턴)
    m = re.search(r"(Confirmed|Submission|Researching|Deadpool)\s+([\d,]+)", amp_txt)
    if m:
        stats["status"], stats["views"] = m.groups()

    # added 날짜
    m = re.search(r"Added\s+(\d{1,2}\s+\w+\s+\d{4})", amp_txt)
    if m:
        stats["added"] = m.group(1)

    # ⑤ ─ 헤더 스탬프(재디자인) ─ status 대안 -----------------------------
    if not stats.get("status"):
        stamp = soup.select_one("header .stamp")
        if stamp:
            stats["status"] = text(stamp)

    # ⑥ ─ 섹션 본문 ------------------------------------------------------
    for sec in ("About", "Origin", "Precursors",
                "Spread", "Search Interest", "Notable Examples"):
        out[sec.lower().replace(" ", "_")] = section_text(soup, sec)

    # ⑦ ─ header image (og:image 메타)
    og = soup.find("meta", property="og:image")
    out["header_image"] = og["content"] if og else ""

    # ⑧ ─ stats dict → out
    out.update({k: norm(v) for k, v in stats.items()})

    return out


# ---------- CLI ----------
def main(in_csv: str, out_csv: str):
    urls = list(dict.fromkeys(
        [u.strip() for u in pathlib.Path(in_csv).read_text().splitlines() if u.strip()]
    ))
    fieldnames = ["url", "title", "type", "status", "year", "origin",
                  "added", "views", "tags",
                  "about", "origin_text", "precursors", "spread",
                  "search_interest", "notable_examples", "header_image"]

    with open(out_csv, "w", newline="", encoding="utf-8") as f_out:
        wr = csv.DictWriter(f_out, fieldnames=fieldnames)
        wr.writeheader()
        for u in urls:
            try:
                row = scrape_entry(u)
                wr.writerow({k: row.get(k, "") for k in fieldnames})
                print("✓", u)
            except Exception as e:
                print("✗", u, e, file=sys.stderr)

if __name__ == "__main__":
   
    main("kym_memes_all.csv", "kym_test_v1.csv")


✗ ﻿url Failed to parse: ﻿url


✓ https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face
✓ https://knowyourmeme.com/memes/%D0%B4%D1%80%D1%83%D0%B3-apyr
✓ https://knowyourmeme.com/memes/%D0%BD%D0%B5%D1%82-no-poster
✓ https://knowyourmeme.com/memes/%E0%B2%A0_%E0%B2%A0-look-of-disapproval
✓ https://knowyourmeme.com/memes/05x-a-presses-but-first-we-need-to-talk-about-parallel-universes
✓ https://knowyourmeme.com/memes/09-f9-11-02-9d-74-e3-5b-d8-41-56-c5-63-56-88-c0
✓ https://knowyourmeme.com/memes/1-2-buckle-my-shoe-3-4-buckle-some-more-5-6-nike-kicks
✓ https://knowyourmeme.com/memes/1-2-oatmeal
✓ https://knowyourmeme.com/memes/1-am-by-civ-pop-out-at-one-in-the-morning-ai-bear-song
✓ https://knowyourmeme.com/memes/1-billion-lions-vs-1-of-every-pokemon
✓ https://knowyourmeme.com/memes/1-guy-1-jar
✓ https://knowyourmeme.com/memes/1-guy-1-jar-death-rumor
✓ https://knowyourmeme.com/memes/10-guy
✓ https://knowyourmeme.com/memes/10-hours-of-walking-in-nyc
✓ https://knowyourmeme.com/memes/10x-engineer
✓ https://kn

KeyboardInterrupt: 

In [None]:
# kym_scraper.py
# KnowYourMeme entry bulk‑scraper (desktop & new redesign compatible)
# ---------------------------------------------------------------------------
#   * Reads a CSV that has a "url" column → scrapes every KYM entry page
#   * Persists the fields below into a second CSV in a deterministic order
#   * Robust to Cloudflare and to the new 2024 redesign (AMP/desktop)
#   * Requires: beautifulsoup4, cloudscraper, pandas, lxml            
# ---------------------------------------------------------------------------

from __future__ import annotations
import re, json, argparse, time
from pathlib import Path
from typing import List, Dict

import pandas as pd
from bs4 import BeautifulSoup
import cloudscraper

################################################################################
# 1)  Text helpers & constants
################################################################################

SECTION_NAMES = {
    "about"            : "about",
    "origin"           : "origin_text",
    "precursors"       : "precursors",
    "spread"           : "spread",
    "search interest"  : "search_interest",
    "notable examples" : "notable_examples",
}

META_FIELDS = ("Type", "Status", "Year", "Origin", "Added", "Views")

COL_ORDER = [
    "url", "title", "type", "status", "year", "origin", "added", "views",
    "tags", "about", "origin_text", "precursors", "spread",
    "search_interest", "notable_examples", "header_image",
]

def text_normalize(s: str) -> str:
    """Collapse whitespace → single spaces & strip"""
    return re.sub(r"\s+", " ", s or "").strip()

################################################################################
# 2)  Single‑entry HTML → dict
################################################################################

def _gather_stats(soup: BeautifulSoup) -> Dict[str, str]:
    """Return dict with keys matching META_FIELDS (lower‑case)."""
    stats_block = soup.select_one("aside.stats dl")  # old & new design
    meta: Dict[str, str] = {}
    if stats_block:
        dts = [d.get_text(strip=True) for d in stats_block.find_all("dt")]
        dds = [d.get_text(strip=True) for d in stats_block.find_all("dd")]
        meta = dict(zip(dts, dds))

    # -------- Fallbacks for missing Year / etc. ----------------------------
    if not meta.get("Year"):
        # 1) <meta property="article:published_time">
        m = soup.find("meta", {"property": [
            "article:published_time", "og:published_time"]})
        if m and re.match(r"\d{4}", m.get("content", "")):
            meta["Year"] = m["content"][:4]
        # 2) JSON‑LD datePublished
        if not meta.get("Year"):
            for scr in soup.find_all("script", type="application/ld+json"):
                try:
                    data = json.loads(scr.string)
                    if isinstance(data, dict) and data.get("datePublished"):
                        meta["Year"] = data["datePublished"][:4]
                        break
                except Exception:
                    continue

    # Strip commas from Views to keep it numeric‑friendly
    if "Views" in meta:
        meta["Views"] = text_normalize(meta["Views"]).replace(",", "")

    # lowercase keys → value or ""
    return {k.lower(): meta.get(k, "") for k in META_FIELDS}


def _extract_tags_from_jsonld(soup: BeautifulSoup) -> str:
    for scr in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(scr.string)
            if isinstance(data, dict) and data.get("keywords"):
                return text_normalize(data["keywords"])
        except Exception:
            pass
    return ""


def _extract_section(soup: BeautifulSoup, header_txt: str) -> str:
    """Locate the first h2/h3 whose normalized text == header_txt (lowercase)"""
    hdr = soup.find(lambda t: t and t.name in ("h2", "h3") and
                              text_normalize(t.get_text()).lower() == header_txt)
    if not hdr:
        return ""

    chunks: List[str] = []
    for sib in hdr.find_next_siblings():
        if sib.name in ("h2", "h3"):
            break
        chunks.append(text_normalize(sib.get_text(" ", strip=True)))
    return " ".join(chunks).strip()


def parse_entry(html: str, url: str) -> Dict[str, str]:
    soup = BeautifulSoup(html, "lxml")

    # -- title & header image -------------------------------------------------
    title_tag  = soup.find("meta", property="og:title")
    image_tag  = soup.find("meta", property="og:image")

    row: Dict[str, str] = {
        "url"          : url,
        "title"        : title_tag.get("content", "") if title_tag else "",
        "header_image" : image_tag.get("content", "") if image_tag else "",
    }

    # -- sidebar stats -------------------------------------------------------
    row.update(_gather_stats(soup))

    # -- tags (keywords) ------------------------------------------------------
    row["tags"] = _extract_tags_from_jsonld(soup)

    # -- content sections -----------------------------------------------------
    for hdr, col_name in SECTION_NAMES.items():
        row[col_name] = _extract_section(soup, hdr)

    return row

################################################################################
# 3)  Bulk scraping utilities
################################################################################

def scrape_urls(urls: List[str], delay: float = 1.0) -> List[Dict[str, str]]:
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "desktop": True},
        delay=delay,
    )
    rows: List[Dict[str, str]] = []
    for i, url in enumerate(urls, 1):
        print(f"[{i:>5}/{len(urls)}]  GET {url}")
        try:
            resp = scraper.get(url, timeout=30)
            resp.raise_for_status()
            rows.append(parse_entry(resp.text, url))
        except Exception as exc:
            print("   ↳ ERROR:", exc)
        time.sleep(delay)
    return rows

################################################################################
# 4)  CLI wrapper
################################################################################

def main(in_csv: str, out_csv: str, delay: float = 1.0) -> None:
    urls = pd.read_csv(in_csv)["url"].dropna().unique().tolist()
    rows = scrape_urls(urls, delay)
    pd.DataFrame(rows).reindex(columns=COL_ORDER).to_csv(out_csv, index=False, encoding="utf‑8")
    print(f"✓ Saved → {out_csv}  (rows: {len(rows)})")


if __name__ == "__main__":
    main("kym_memes_all.csv", "kym_test_v2.csv")


[    1/10]  GET https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face
[    2/10]  GET https://knowyourmeme.com/memes/%D0%B4%D1%80%D1%83%D0%B3-apyr
[    3/10]  GET https://knowyourmeme.com/memes/%D0%BD%D0%B5%D1%82-no-poster
[    4/10]  GET https://knowyourmeme.com/memes/%E0%B2%A0_%E0%B2%A0-look-of-disapproval
[    5/10]  GET https://knowyourmeme.com/memes/05x-a-presses-but-first-we-need-to-talk-about-parallel-universes
[    6/10]  GET https://knowyourmeme.com/memes/09-f9-11-02-9d-74-e3-5b-d8-41-56-c5-63-56-88-c0
[    7/10]  GET https://knowyourmeme.com/memes/1-2-buckle-my-shoe-3-4-buckle-some-more-5-6-nike-kicks
[    8/10]  GET https://knowyourmeme.com/memes/1-2-oatmeal
[    9/10]  GET https://knowyourmeme.com/memes/1-am-by-civ-pop-out-at-one-in-the-morning-ai-bear-song
[   10/10]  GET https://knowyourmeme.com/memes/1-billion-lions-vs-1-of-every-pokemon
✓ Saved → kym_test3.csv  (rows: 10)


In [12]:
import requests
from bs4 import BeautifulSoup
BASE = 'https://knowyourmeme.com'
LIST   = f"{BASE}/categories/meme"

page = requests.get("https://knowyourmeme.com/memes/jd-vance-killed-pope-francis-theory")
soup = BeautifulSoup(page.text, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en" xmlns="https://www.w3.org/1999/xhtml" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<title>
J.D. Vance Killed Pope Francis Theory | Know Your Meme
</title>
<link href="https://knowyourmeme.com/memes/jd-vance-killed-pope-francis-theory" rel="canonical"/>
<link href="https://a.kym-cdn.com" rel="preconnect">
<link href="https://i.kym-cdn.com" rel="preconnect"/>
<link href="https://ads.blogherads.com" rel="preconnect"/>
<link href="https://a.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://i.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://ads.blogherads.com" rel="dns-prefetch"/>
<link as="image" href="https://i.kym-cdn.com/entries/icons/mobile/000/053/892/JD_Vance_antichrist_meme_cover.jpg" rel="preload"/>
<link as="image" href="https://i.kym-cdn.com/featured_items/icons/wide/000/028/421/cover16.jpg" rel="preload"/>
<link as="image" href="https://i.kym-cdn.com/featured_items/icons/wide/000/028/420/maxresdefault.jpg" rel="preload"/>
<link 

In [14]:
# kym_scraper.py  ―  KnowYourMeme entry 수집기 (2025-04 리뉴얼 대응판)
import re, json, csv, time
from pathlib import Path
from typing import List, Dict

import cloudscraper              # cf-bypass
from bs4 import BeautifulSoup
import pandas as pd


# ────────────────────────────── 공통 유틸 ──────────────────────────────
SECTION_NAMES = {
    'about'            : 'about',
    'origin'           : 'origin_text',      # ← ‘origin’과 이름 충돌 방지
    'precursors'       : 'precursors',
    'spread'           : 'spread',
    'search interest'  : 'search_interest',
    'notable examples' : 'notable_examples',
}

STATS_FIELDS = {                     # dt 텍스트 → csv 컬럼
    'status' : 'status',
    'type'   : 'type',
    'year'   : 'year',
    'origin' : 'origin',
    'region' : 'region',
    'added'  : 'added',
    'views'  : 'views',
}

CSV_COL_ORDER = [
    'url', 'title',
    'type', 'status', 'year', 'origin', 'region', 'added', 'views',
    'tags',
    'about', 'origin_text', 'precursors', 'spread',
    'search_interest', 'notable_examples',
    'header_image',
]

def text_normalize(txt: str) -> str:
    """여러 줄·연속 공백 → 단일 space 로 압축"""
    return re.sub(r'\s+', ' ', txt).strip()


# ────────────────────────────── 파싱 함수 ──────────────────────────────
def extract_sections(soup: BeautifulSoup) -> Dict[str, str]:
    data = {}
    for human, machine in SECTION_NAMES.items():
        h = soup.find(
            lambda t: t.name in ('h2', 'h3') and
            text_normalize(t.get_text()).lower() == human
        )
        if not h:           # 없는 섹션은 빈 문자열
            data[machine] = ''
            continue
        chunks = []
        for sib in h.find_next_siblings():
            if sib.name in ('h2', 'h3'):
                break
            chunks.append(text_normalize(sib.get_text(' ', strip=True)))
        data[machine] = ' '.join(chunks).strip()
    return data


def extract_stats(soup: BeautifulSoup) -> Dict[str, str]:
    """사이드바(Stats) 정보 추출 – OLD · NEW 레이아웃 모두 지원"""
    found = {v: '' for v in STATS_FIELDS.values()}

    for dl in soup.find_all('dl'):
        dts = dl.find_all('dt')
        for dt in dts:
            key = text_normalize(dt.get_text()).rstrip(':').lower()
            if key in STATS_FIELDS:
                dd = dt.find_next_sibling('dd')
                if dd:
                    found[STATS_FIELDS[key]] = text_normalize(dd.get_text())
        # 모든 항목을 채웠으면 더 이상 탐색 X
        if all(found.values()):
            break
    return found


def parse_entry(html: str, url: str) -> Dict[str, str]:
    soup = BeautifulSoup(html, 'lxml')

    # ── ① title · header image ────────────────────────────────────────
    title = (
        soup.find('meta', property='og:title') or
        soup.find('title') or
        soup.find('h1')
    )
    title_text = text_normalize(title.get('content', '') if title else title.get_text() if title else '')

    header = soup.find('meta', property='og:image')
    header_img = header['content'] if header else ''

    # ── ② stats 블록 -----------------------------------------------------------------
    stats = extract_stats(soup)

    # ── ③ tags -----------------------------------------------------------------------
    tags = ''
    for scr in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(scr.string)
            if isinstance(data, dict) and data.get('keywords'):
                tags = data['keywords']
                break
        except Exception:
            pass

    # ── ④ 본문 섹션 ------------------------------------------------------------------
    sections = extract_sections(soup)

    return {
        'url'          : url,
        'title'        : title_text,
        'header_image' : header_img,
        'tags'         : tags,
        **stats,
        **sections,
    }


# ────────────────────────────── 스크레이퍼 본체 ──────────────────────────
def scrape_urls(urls: List[str], delay: float = 1.0) -> List[Dict[str, str]]:
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "desktop": True},
        delay=delay,
    )
    rows = []
    for i, url in enumerate(urls, 1):
        try:
            print(f'[{i:>5}/{len(urls)}]  GET {url}')
            html = scraper.get(url, timeout=30).text
            rows.append(parse_entry(html, url))
        except Exception as e:
            print('  ↳ ERROR:', e)
    return rows


# ────────────────────────────── main CLI ───────────────────────────────
def main(in_csv: str = 'test.csv',
         out_csv: str = 'kym_entries.csv',
         delay  : float = 1.0):

    urls = pd.read_csv(in_csv)['url'].dropna().unique().tolist()
    rows = scrape_urls(urls, delay=delay)

    df = pd.DataFrame(rows)
    # 누락 컬럼 보충 & 순서 고정
    for col in CSV_COL_ORDER:
        if col not in df.columns:
            df[col] = ''
    df = df[CSV_COL_ORDER]

    df.to_csv(out_csv, index=False, encoding='utf-8')
    print(f'✓ saved → {out_csv}')


if __name__ == '__main__':
    main('kym_memes_all.csv', 'kym_entries_full.csv', delay=1.0)


[    1/3959]  GET https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face
[    2/3959]  GET https://knowyourmeme.com/memes/%D0%B4%D1%80%D1%83%D0%B3-apyr
[    3/3959]  GET https://knowyourmeme.com/memes/%D0%BD%D0%B5%D1%82-no-poster
[    4/3959]  GET https://knowyourmeme.com/memes/%E0%B2%A0_%E0%B2%A0-look-of-disapproval
[    5/3959]  GET https://knowyourmeme.com/memes/05x-a-presses-but-first-we-need-to-talk-about-parallel-universes
[    6/3959]  GET https://knowyourmeme.com/memes/09-f9-11-02-9d-74-e3-5b-d8-41-56-c5-63-56-88-c0
[    7/3959]  GET https://knowyourmeme.com/memes/1-2-buckle-my-shoe-3-4-buckle-some-more-5-6-nike-kicks
[    8/3959]  GET https://knowyourmeme.com/memes/1-2-oatmeal
[    9/3959]  GET https://knowyourmeme.com/memes/1-am-by-civ-pop-out-at-one-in-the-morning-ai-bear-song
[   10/3959]  GET https://knowyourmeme.com/memes/1-billion-lions-vs-1-of-every-pokemon
[   11/3959]  GET https://knowyourmeme.com/memes/1-guy-1-jar
[   12/3959]  GET https://knowyourmeme.

In [2]:
# kym_scraper.py  ―  KnowYourMeme entry 수집기 (2025-04 리뉴얼 대응판)
import re, json, csv, time
from pathlib import Path
from typing import List, Dict

import cloudscraper              # cf-bypass
from bs4 import BeautifulSoup
import pandas as pd


# ────────────────────────────── 공통 유틸 ──────────────────────────────
SECTION_NAMES = {
    'about'            : 'about',
    'origin'           : 'origin_text',      # ← ‘origin’과 이름 충돌 방지
    'precursors'       : 'precursors',
    'spread'           : 'spread',
    'search interest'  : 'search_interest',
    'notable examples' : 'notable_examples',
}

STATS_FIELDS = {                     # dt 텍스트 → csv 컬럼
    'status' : 'status',
    'type'   : 'type',
    'year'   : 'year',
    'origin' : 'origin',
    'region' : 'region',
    'added'  : 'added',
    'views'  : 'views',
}

CSV_COL_ORDER = [
    'url', 'title',
    'type', 'status', 'year', 'origin', 'region', 'added', 'views',
    'tags',
    'about', 'origin_text', 'precursors', 'spread',
    'search_interest', 'notable_examples',
    'header_image',
]

def text_normalize(txt: str) -> str:
    """여러 줄·연속 공백 → 단일 space 로 압축"""
    return re.sub(r'\s+', ' ', txt).strip()


# ────────────────────────────── 파싱 함수 ──────────────────────────────
def extract_sections(soup: BeautifulSoup) -> Dict[str, str]:
    data = {}
    for human, machine in SECTION_NAMES.items():
        h = soup.find(
            lambda t: t.name in ('h2', 'h3') and
            text_normalize(t.get_text()).lower() == human
        )
        if not h:           # 없는 섹션은 빈 문자열
            data[machine] = ''
            continue
        chunks = []
        for sib in h.find_next_siblings():
            if sib.name in ('h2', 'h3'):
                break
            chunks.append(text_normalize(sib.get_text(' ', strip=True)))
        data[machine] = ' '.join(chunks).strip()
    return data


def extract_stats(soup: BeautifulSoup) -> Dict[str, str]:
    """사이드바(Stats) 정보 추출 – OLD · NEW 레이아웃 모두 지원"""
    found = {v: '' for v in STATS_FIELDS.values()}

    for dl in soup.find_all('dl'):
        dts = dl.find_all('dt')
        for dt in dts:
            key = text_normalize(dt.get_text()).rstrip(':').lower()
            if key in STATS_FIELDS:
                dd = dt.find_next_sibling('dd')
                if dd:
                    found[STATS_FIELDS[key]] = text_normalize(dd.get_text())
        # 모든 항목을 채웠으면 더 이상 탐색 X
        if all(found.values()):
            break
    return found


def parse_entry(html: str, url: str) -> Dict[str, str]:
    soup = BeautifulSoup(html, 'lxml')

    # ── ① title · header image ────────────────────────────────────────
    title = (
        soup.find('meta', property='og:title') or
        soup.find('title') or
        soup.find('h1')
    )
    title_text = text_normalize(title.get('content', '') if title else title.get_text() if title else '')

    header = soup.find('meta', property='og:image')
    header_img = header['content'] if header else ''

    # ── ② stats 블록 -----------------------------------------------------------------
    stats = extract_stats(soup)

    # ── ③ tags -----------------------------------------------------------------------
    tags = ''
    for scr in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(scr.string)
            if isinstance(data, dict) and data.get('keywords'):
                tags = data['keywords']
                break
        except Exception:
            pass

    # ── ④ 본문 섹션 ------------------------------------------------------------------
    sections = extract_sections(soup)

    return {
        'url'          : url,
        'title'        : title_text,
        'header_image' : header_img,
        'tags'         : tags,
        **stats,
        **sections,
    }


# ────────────────────────────── 스크레이퍼 본체 ──────────────────────────
def scrape_urls(urls: List[str], delay: float = 1.0) -> List[Dict[str, str]]:
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "desktop": True},
        delay=delay,
    )
    rows = []
    for i, url in enumerate(urls, 1):
        try:
            print(f'[{i:>5}/{len(urls)}]  GET {url}')
            html = scraper.get(url, timeout=30).text
            rows.append(parse_entry(html, url))
        except Exception as e:
            print('  ↳ ERROR:', e)
    return rows


# ────────────────────────────── main CLI ───────────────────────────────
def main(in_csv: str = 'test.csv',
         out_csv: str = 'kym_entries.csv',
         delay  : float = 1.0):

    urls = pd.read_csv(in_csv)['url'].dropna().unique().tolist()
    rows = scrape_urls(urls, delay=delay)

    df = pd.DataFrame(rows)
    # 누락 컬럼 보충 & 순서 고정
    for col in CSV_COL_ORDER:
        if col not in df.columns:
            df[col] = ''
    df = df[CSV_COL_ORDER]

    df.to_csv(out_csv, index=False, encoding='utf-8')
    print(f'✓ saved → {out_csv}')


if __name__ == '__main__':
    main('kym_memes_all.csv', 'kym_entries_full_try2.csv', delay=1.0)


[    1/3959]  GET https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face
[    2/3959]  GET https://knowyourmeme.com/memes/%D0%B4%D1%80%D1%83%D0%B3-apyr
[    3/3959]  GET https://knowyourmeme.com/memes/%D0%BD%D0%B5%D1%82-no-poster
[    4/3959]  GET https://knowyourmeme.com/memes/%E0%B2%A0_%E0%B2%A0-look-of-disapproval
[    5/3959]  GET https://knowyourmeme.com/memes/05x-a-presses-but-first-we-need-to-talk-about-parallel-universes
[    6/3959]  GET https://knowyourmeme.com/memes/09-f9-11-02-9d-74-e3-5b-d8-41-56-c5-63-56-88-c0
[    7/3959]  GET https://knowyourmeme.com/memes/1-2-buckle-my-shoe-3-4-buckle-some-more-5-6-nike-kicks
[    8/3959]  GET https://knowyourmeme.com/memes/1-2-oatmeal
[    9/3959]  GET https://knowyourmeme.com/memes/1-am-by-civ-pop-out-at-one-in-the-morning-ai-bear-song
[   10/3959]  GET https://knowyourmeme.com/memes/1-billion-lions-vs-1-of-every-pokemon
[   11/3959]  GET https://knowyourmeme.com/memes/1-guy-1-jar
[   12/3959]  GET https://knowyourmeme.