In [7]:
import requests, pandas as pd, time
from lxml import html
from urllib.parse import urljoin
from tqdm import tqdm

# Site root and the category listing page the crawl starts from.
BASE = "https://knowyourmeme.com"
LIST = BASE + "/categories/meme"

# Headers sent with every request made by this cell.
HDRS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",
}

# CSS selector for the entry anchors on a listing page, written as a
# child-combinator chain from <body> down to the anchors.
SEL = " > ".join([
    "body",
    "main",
    "article",
    "article",
    "section",
    "div.contents-container",
    "div.contents",
    "section",
    "div:nth-child(1)",
    "a:nth-child(odd)",
])

def detail_urls(list_html: str):
    """Return the absolute URL of every anchor matched by ``SEL`` in a listing page.

    Args:
        list_html: Raw HTML of one listing page.

    Returns:
        A list of URLs, each anchor's ``href`` resolved against ``BASE``,
        in document order.
    """
    anchors = html.fromstring(list_html).cssselect(SEL)
    urls = []
    for anchor in anchors:
        urls.append(urljoin(BASE, anchor.get("href")))
    return urls

def parse_detail(url: str):
    """Download one entry page and extract a few fields from it.

    Args:
        url: Absolute URL of the entry page.

    Returns:
        A dict with keys ``title`` (whitespace-normalized text of the first
        ``<h1>``), ``about`` (first paragraph under the element with id
        ``entry_section_about``), ``added`` (first 10 characters of the
        ``datetime`` attribute of the time span inside ``#entry_about``, or
        an empty string) and ``url``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    resp = requests.get(url, headers=HDRS, timeout=20)
    # Fail loudly on error responses so a block/error page is not parsed
    # into a row of empty fields; the caller decides whether to skip it.
    resp.raise_for_status()
    t = html.fromstring(resp.text)
    return {
        "title": t.xpath('normalize-space(//h1)'),
        "about": t.xpath('normalize-space(//*[@id="entry_section_about"]/p[1])'),
        "added": (t.xpath('string(//section[@id="entry_about"]//span[@class="time"]/@datetime)') or "")[:10],
        "url":   url,
    }

def crawl(pages=1, pause=1.5):
    """Crawl listing pages and scrape every entry they link to.

    Args:
        pages: Number of listing pages to visit, starting at page 1.
        pause: Seconds to sleep after each entry request and after each
            listing page.

    Returns:
        A pandas DataFrame with one row per successfully parsed entry
        (empty if nothing could be scraped).
    """
    rows = []
    for p in range(1, pages + 1):
        url = LIST if p == 1 else f"{LIST}/page/{p}"
        res = requests.get(url, headers=HDRS, timeout=20)
        if not res.ok:
            # Report a blocked/failed listing page instead of silently
            # parsing its error body and ending up with zero entries.
            print("skip page:", url, "HTTP", res.status_code)
            time.sleep(pause)
            continue
        for durl in tqdm(detail_urls(res.text), desc=f"Page {p}"):
            try:
                rows.append(parse_detail(durl))
            except Exception as e:
                # Best effort: one bad entry must not abort the whole crawl.
                print("skip:", durl, e)
            time.sleep(pause)
        time.sleep(pause)
    return pd.DataFrame(rows)

if __name__ == "__main__":
    df = crawl(pages=1)          # increase `pages` to crawl more listing pages
    df.to_csv("kym_meme.csv", index=False, encoding="utf-8-sig")
    print("saved", len(df), "rows")


Page 1: 0it [00:00, ?it/s]


saved 0 rows


In [15]:
import requests
from bs4 import BeautifulSoup
BASE = 'https://knowyourmeme.com'
LIST   = f"{BASE}/categories/meme"

# Fetch the site root and parse it so the returned markup can be inspected.
# A timeout is set so an unresponsive server cannot hang the kernel.
page = requests.get(BASE, timeout=20)
soup = BeautifulSoup(page.text, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en" xmlns="https://www.w3.org/1999/xhtml" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<title>
Internet Meme Database | Know Your Meme
</title>
<link href="https://knowyourmeme.com/" rel="canonical"/>
<link href="https://a.kym-cdn.com" rel="preconnect">
<link href="https://i.kym-cdn.com" rel="preconnect"/>
<link href="https://ads.blogherads.com" rel="preconnect"/>
<link href="https://a.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://i.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://ads.blogherads.com" rel="dns-prefetch"/>
<meta content="Know Your Meme" property="og:title"/>
<meta content="Know Your Meme" property="og:site_name"/>
<meta content="https://a.kym-cdn.com/assets/kym-logo-large-2be3f3818691470a0369e154647ca0f0.png" property="og:image"/>
<meta content="article" property="og:type"/>
<meta content="104675392961482" property="fb:app_id"/>
<meta content="88519108736" property="fb:pages"/>
<meta content="https://www.facebook.com/kno

In [38]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper            # Cloudflare 우회용 (pip install cloudscraper)
from bs4 import BeautifulSoup

# Site root and the category listing page the crawl starts from.
BASE = "https://knowyourmeme.com"
LIST = BASE + "/categories/meme"
DELAY = 1.0  # courtesy delay between listing pages (seconds)

# ────────────────────────────────────────────────────────
# 1) Create the session used to get past Cloudflare
# ────────────────────────────────────────────────────────
scraper = cloudscraper.create_scraper(
    browser=dict(browser="chrome", platform="windows", desktop=True),
)

# Meme URL pattern: /memes/…  (to also include /memes/subcultures/…,
# use r"^/memes/")
HREF_RE = re.compile(r"^/memes/[^/?#]+")

def extract_links(html_text: str) -> list[str]:
    """Collect the meme detail links of one HTML page as absolute URLs.

    Every ``<a>`` whose ``href`` matches ``HREF_RE`` is kept; the result
    preserves document order and may contain duplicates.
    """
    document = BeautifulSoup(html_text, "html.parser")
    absolute = []
    for anchor in document.find_all("a", href=HREF_RE):
        absolute.append(urljoin(BASE, anchor["href"]))
    return absolute

def crawl_all() -> list[str]:
    """Follow the listing pagination and collect every meme URL.

    Pages are requested in order (``LIST``, then ``LIST/page/N``) until a
    request returns a non-200 status or a page contributes no URL that has
    not been seen before.

    Returns:
        The collected URLs, de-duplicated and sorted.
    """
    collected: set[str] = set()
    page = 1
    while True:
        url = LIST if page == 1 else f"{LIST}/page/{page}"
        print(f"📥  {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print("🚫  HTTP error", res.status_code)
            break

        # Stop when the page adds nothing new. Testing only for an empty
        # link list is not enough: HREF_RE accepts any anchor whose href
        # starts with /memes/, so a page past the end may still yield
        # links and the loop would never terminate.
        fresh = set(extract_links(res.text)) - collected
        if not fresh:
            break

        collected |= fresh
        page += 1
        time.sleep(DELAY)

    return sorted(collected)

if __name__ == "__main__":
    meme_urls = crawl_all()
    print(f"✅  collected {len(meme_urls):,} meme links")

    # Write the header row followed by one URL per row.
    with open("kym_memes.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerows([["url"]] + [[u] for u in meme_urls])

    print("💾  saved → kym_memes.csv")


📥  https://knowyourmeme.com/categories/meme
📥  https://knowyourmeme.com/categories/meme/page/2
📥  https://knowyourmeme.com/categories/meme/page/3
📥  https://knowyourmeme.com/categories/meme/page/4
📥  https://knowyourmeme.com/categories/meme/page/5
📥  https://knowyourmeme.com/categories/meme/page/6
📥  https://knowyourmeme.com/categories/meme/page/7
📥  https://knowyourmeme.com/categories/meme/page/8
📥  https://knowyourmeme.com/categories/meme/page/9
📥  https://knowyourmeme.com/categories/meme/page/10
📥  https://knowyourmeme.com/categories/meme/page/11
📥  https://knowyourmeme.com/categories/meme/page/12
📥  https://knowyourmeme.com/categories/meme/page/13
📥  https://knowyourmeme.com/categories/meme/page/14
📥  https://knowyourmeme.com/categories/meme/page/15
📥  https://knowyourmeme.com/categories/meme/page/16
📥  https://knowyourmeme.com/categories/meme/page/17
📥  https://knowyourmeme.com/categories/meme/page/18
📥  https://knowyourmeme.com/categories/meme/page/19
📥  https://knowyourmeme.com/

In [31]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper            # Cloudflare 우회용 (pip install cloudscraper)
from bs4 import BeautifulSoup

# Site root, listing path, and the query string appended to every listing URL.
BASE = "https://knowyourmeme.com"
PATH = "/categories/meme"
QS = "?sort=views&status=confirmed"
LIST = BASE + "/categories/meme"
DELAY = 1.0  # courtesy delay between listing pages (seconds)

# ────────────────────────────────────────────────────────
# 1) Create the session used to get past Cloudflare
# ────────────────────────────────────────────────────────
scraper = cloudscraper.create_scraper(
    browser=dict(browser="chrome", platform="windows", desktop=True),
)

# Meme URL pattern: /memes/…  (to also include /memes/subcultures/…,
# use r"^/memes/")
HREF_RE = re.compile(r"^/memes/[^/?#]+")

def extract_links(html_text: str) -> list[str]:
    """Return absolute URLs for the meme detail anchors found in one page.

    An anchor qualifies when its ``href`` matches ``HREF_RE``; each href is
    resolved against ``BASE``.
    """
    anchors = BeautifulSoup(html_text, "html.parser").find_all("a", href=HREF_RE)
    return [urljoin(BASE, tag["href"]) for tag in anchors]

def crawl_all() -> list[str]:
    """Follow the listing pagination and collect every meme URL.

    Page 1 is ``BASE + PATH + QS``; later pages insert ``/page/N`` before
    the query string. Crawling stops on a non-200 response or when a page
    contributes no URL that has not been seen before.

    Returns:
        The collected URLs, de-duplicated and sorted.
    """
    collected: set[str] = set()
    page = 1
    while True:
        if page == 1:
            url = f"{BASE}{PATH}{QS}"
        else:
            url = f"{BASE}{PATH}/page/{page}{QS}"
        print(f"📥  {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print("🚫  HTTP error", res.status_code)
            break

        # Stop when the page adds nothing new. Testing only for an empty
        # link list is not enough: HREF_RE accepts any anchor whose href
        # starts with /memes/, so a page past the end may still yield
        # links and the loop would never terminate.
        fresh = set(extract_links(res.text)) - collected
        if not fresh:
            break

        collected |= fresh
        page += 1
        time.sleep(DELAY)

    return sorted(collected)

if __name__ == "__main__":
    meme_urls = crawl_all()
    print(f"✅  collected {len(meme_urls):,} meme links")

    # Save the URLs as a single-column CSV.
    with open("kym_memes_views.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["url"])
        writer.writerows([[u] for u in meme_urls])

    # Report the file that was actually written (the message previously
    # named kym_memes.csv).
    print("💾  saved → kym_memes_views.csv")


📥  https://knowyourmeme.com/categories/meme?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/2?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/3?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/4?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/5?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/6?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/7?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/8?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/9?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/10?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/11?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/12?sort=views&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/13?sort

In [35]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper            # Cloudflare 우회용 (pip install cloudscraper)
from bs4 import BeautifulSoup

# Site root, listing path, and the query string appended to every listing URL.
BASE = "https://knowyourmeme.com"
PATH = "/categories/meme"
QS = "?sort=images&status=confirmed"
LIST = BASE + "/categories/meme"
DELAY = 1.0  # courtesy delay between listing pages (seconds)

# ────────────────────────────────────────────────────────
# 1) Create the session used to get past Cloudflare
# ────────────────────────────────────────────────────────
scraper = cloudscraper.create_scraper(
    browser=dict(browser="chrome", platform="windows", desktop=True),
)

# Meme URL pattern: /memes/…  (to also include /memes/subcultures/…,
# use r"^/memes/")
HREF_RE = re.compile(r"^/memes/[^/?#]+")

def extract_links(html_text: str) -> list[str]:
    """Turn the meme detail hrefs of one HTML page into absolute URLs.

    Anchors are selected by matching their ``href`` against ``HREF_RE``;
    order follows the document and duplicates are kept.
    """
    parsed = BeautifulSoup(html_text, "html.parser")
    matching = parsed.find_all("a", href=HREF_RE)
    return list(map(lambda anchor: urljoin(BASE, anchor["href"]), matching))

def crawl_all() -> list[str]:
    """Follow the listing pagination and collect every meme URL.

    Page 1 is ``BASE + PATH + QS``; later pages insert ``/page/N`` before
    the query string. Crawling stops on a non-200 response or when a page
    contributes no URL that has not been seen before.

    Returns:
        The collected URLs, de-duplicated and sorted.
    """
    collected: set[str] = set()
    page = 1
    while True:
        if page == 1:
            url = f"{BASE}{PATH}{QS}"
        else:
            url = f"{BASE}{PATH}/page/{page}{QS}"
        print(f"📥  {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print("🚫  HTTP error", res.status_code)
            break

        # Stop when the page adds nothing new. Testing only for an empty
        # link list is not enough: HREF_RE accepts any anchor whose href
        # starts with /memes/, so a page past the end may still yield
        # links and the loop would never terminate.
        fresh = set(extract_links(res.text)) - collected
        if not fresh:
            break

        collected |= fresh
        page += 1
        time.sleep(DELAY)

    return sorted(collected)

if __name__ == "__main__":
    meme_urls = crawl_all()
    print(f"✅  collected {len(meme_urls):,} meme links")

    # Save the URLs as a single-column CSV.
    with open("kym_memes_images.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["url"])
        writer.writerows([[u] for u in meme_urls])

    # Report the file that was actually written (the message previously
    # named kym_memes.csv).
    print("💾  saved → kym_memes_images.csv")


📥  https://knowyourmeme.com/categories/meme?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/2?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/3?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/4?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/5?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/6?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/7?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/8?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/9?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/10?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/11?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/page/12?sort=images&status=confirmed
📥  https://knowyourmeme.com/categories/meme/

In [40]:
import csv, re, time
from urllib.parse import urljoin

import cloudscraper
from bs4 import BeautifulSoup

##############################################################################
# 설정
##############################################################################
BASE = "https://knowyourmeme.com"
PATH = "/categories/meme"
# Values tried for the `sort` query parameter, one full crawl per value.
CATS = [
    "views",
    "oldest",
    "chronological",
    "reverse-chronological",
    "comments",
    "images",
    "videos",
]
DELAY = 1.0  # delay between requests (seconds)
OUTCSV = "kym_memes_all.csv"

# Keep only /memes/… links (subcultures included).
HREF_RE = re.compile(r"^/memes(?:/subcultures)?/[^/?#]+")

##############################################################################
# Shared session (works around Cloudflare)
##############################################################################
scraper = cloudscraper.create_scraper(
    browser=dict(browser="chrome", platform="windows", desktop=True),
)

##############################################################################
# 헬퍼 함수
##############################################################################
def extract_links(html_text: str) -> list[str]:
    """Return absolute URLs for every anchor whose href matches ``HREF_RE``."""
    found = []
    for anchor in BeautifulSoup(html_text, "html.parser").find_all("a", href=HREF_RE):
        found.append(urljoin(BASE, anchor["href"]))
    return found

def crawl_one_sort(sort_key: str) -> set[str]:
    """Walk every listing page for one ``sort`` value and return its links.

    Args:
        sort_key: Value placed in the ``sort`` query parameter.

    Returns:
        The set of absolute URLs collected for this sort order. Crawling
        stops on a non-200 response or when a page contributes no URL that
        has not been seen before.
    """
    collected = set()
    page = 1
    qs = f"?sort={sort_key}&status=confirmed"  # same for every page
    while True:
        url = (f"{BASE}{PATH}{qs}" if page == 1
               else f"{BASE}{PATH}/page/{page}{qs}")
        print(f"📥  [{sort_key}] {url}")
        res = scraper.get(url, timeout=20)
        if res.status_code != 200:
            print(" 🚫  HTTP", res.status_code, "→ 중단")
            break

        # Stop when the page adds nothing new. Testing only for an empty
        # link list is not enough: HREF_RE accepts any anchor whose href
        # starts with /memes/, so a page past the end may still yield
        # links and the loop would never terminate.
        fresh = set(extract_links(res.text)) - collected
        if not fresh:
            break

        collected |= fresh
        page += 1
        time.sleep(DELAY)

    print(f" ✅  {sort_key}: {len(collected):,} links")
    return collected

##############################################################################
# 메인 크롤링 루프
##############################################################################
if __name__ == "__main__":
    all_links: set[str] = set()
    for cat in CATS:
        # Merge each sort order's links; the set drops duplicates.
        all_links.update(crawl_one_sort(cat))

    print(f"\n🎉  TOTAL unique links: {len(all_links):,}")

    # Write the header row followed by one sorted URL per row.
    with open(OUTCSV, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerows([["url"]] + [[u] for u in sorted(all_links)])

    print(f"💾  saved → {OUTCSV}")


📥  [views] https://knowyourmeme.com/categories/meme?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/2?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/3?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/4?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/5?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/6?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/7?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/8?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/9?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/10?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/meme/page/11?sort=views&status=confirmed
📥  [views] https://knowyourmeme.com/categories/m

In [39]:
# Sort keys, one per line.
pages = [
    "views",
    "oldest",
    "chronological",
    "reverse-chronological",
    "comments",
    "images",
    "videos",
]

In [46]:
# kym_scraper.py  ── 핵심 부분만 발췌
import re, json, csv, time, datetime as dt
from pathlib import Path
from bs4 import BeautifulSoup
import cloudscraper, pandas as pd


################################################################################
# 1)  엔트리 HTML → dict  (필드가 없으면 '' 반환)
################################################################################
# Maps a lower-cased section heading to the output column that receives the
# section's text.
SECTION_NAMES = {
    'about'            : 'about',
    # Stored as 'origin_text': parse_entry already fills 'origin' from the
    # stats block and writes sections afterwards, so reusing 'origin' here
    # would overwrite that value; main() also lists 'origin_text' as a column.
    'origin'           : 'origin_text',
    'precursors'       : 'precursors',
    'spread'           : 'spread',
    'search interest'  : 'search_interest',
    'notable examples' : 'notable_examples'
}

def text_normalize(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    words = s.split()
    return ' '.join(words)

def extract_section(soup: BeautifulSoup, header: str) -> str:
    """Return the text that follows the h2/h3 heading named ``header``.

    The first h2/h3 whose normalized, lower-cased text equals ``header`` is
    located; the text of its following siblings is joined with spaces up
    to (not including) the next h2/h3. Returns '' when no such heading
    exists.
    """
    heading_tags = ('h2', 'h3')

    def is_wanted_heading(tag):
        return (tag.name in heading_tags and
                text_normalize(tag.get_text()).lower() == header)

    heading = soup.find(is_wanted_heading)
    if not heading:
        return ''

    parts = []
    for node in heading.find_next_siblings():
        if node.name in heading_tags:
            # The next heading starts a different section.
            break
        parts.append(text_normalize(node.get_text(' ', strip=True)))
    joined = ' '.join(parts)
    return joined.strip()

def parse_entry(html: str, url: str) -> dict:
    """Convert one entry page into a flat dict (missing fields become '').

    Args:
        html: Raw HTML of the entry page.
        url: URL the page was fetched from; copied into the result.

    Returns:
        A dict with ``url``, ``title``, ``header_image``, the stats fields
        ``type``/``status``/``year``/``origin``/``added``/``views``,
        ``tags``, and one key per value of ``SECTION_NAMES``.
    """
    soup = BeautifulSoup(html, 'lxml')

    def meta_content(prop: str) -> str:
        # '' when the tag or its content attribute is missing.
        tag = soup.find('meta', property=prop)
        return tag.get('content', '') if tag else ''

    # ── 1. Title / header image --------------------------------------------------
    result = {
        'url'          : url,
        'title'        : meta_content('og:title'),
        'header_image' : meta_content('og:image')
    }

    # ── 2. Stats block: pair each <dt> label with its <dd> value -----------------
    stats = soup.select_one('aside.stats dl')
    meta = {}
    if stats:
        dts = [d.get_text(strip=True) for d in stats.find_all('dt')]
        dds = [d.get_text(strip=True) for d in stats.find_all('dd')]
        meta = dict(zip(dts, dds))
    for field in ('Type', 'Status', 'Year', 'Origin', 'Added', 'Views'):
        result[field.lower()] = meta.get(field, '')

    # ── 3. Tags (keywords from the first JSON-LD block that has them) -----------
    tags = ''
    for scr in soup.find_all('script', type='application/ld+json'):
        raw = scr.string
        if not raw:
            # Empty script (or one without a single string child): skip.
            continue
        try:
            data = json.loads(raw)
        except ValueError:
            # Malformed JSON-LD is ignored; other errors are not hidden.
            continue
        if isinstance(data, dict) and data.get('keywords'):
            keywords = data['keywords']
            # A keyword list is flattened to one string so the column stays
            # textual like every other field.
            if isinstance(keywords, list):
                keywords = ', '.join(str(k) for k in keywords)
            tags = keywords
            break
    result['tags'] = tags

    # ── 4. Body sections ---------------------------------------------------------
    # Written last: a SECTION_NAMES column equal to an earlier key replaces
    # that key's value.
    for h_txt, col in SECTION_NAMES.items():
        result[col] = extract_section(soup, h_txt)

    return result


################################################################################
# 2)  URL 리스트 → CSV
################################################################################
def scrape_urls(urls, delay: float = 1.0) -> list[dict]:
    """Fetch every URL and parse it with ``parse_entry``.

    Args:
        urls: Iterable of entry URLs.
        delay: Seconds to wait between consecutive requests; also passed
            to the cloudscraper session.

    Returns:
        One parsed dict per URL that was fetched and parsed successfully.
        Failures are printed and skipped.
    """
    # NOTE(review): cloudscraper documents `delay` as the wait before
    # answering a Cloudflare challenge, not a pause between requests —
    # confirm. The per-request pause is done explicitly below.
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "desktop": True},
        delay=delay
    )
    urls = list(urls)  # allow any iterable; len() is needed for progress
    total = len(urls)
    rows = []
    for i, url in enumerate(urls, 1):
        try:
            print(f'[{i:>5}/{total}]  GET {url}')
            resp = scraper.get(url, timeout=30)
            # Do not turn an error/block page into a row of empty fields.
            resp.raise_for_status()
            rows.append(parse_entry(resp.text, url))
        except Exception as e:
            # Best effort: report and move on to the next URL.
            print('  ↳ ERROR:', e)
        if i < total and delay > 0:
            time.sleep(delay)
    return rows


################################################################################
# 3)  main
################################################################################
def main(in_csv: str = 'kym_memes_all.csv',
         out_csv: str = 'kym_entries.csv',
         delay : float = 1.0):
    """Scrape every unique URL listed in ``in_csv`` and write the rows to ``out_csv``.

    Args:
        in_csv: CSV file with a ``url`` column.
        out_csv: Destination CSV path.
        delay: Forwarded to ``scrape_urls``.
    """
    url_column = pd.read_csv(in_csv)['url']
    unique_urls = url_column.dropna().unique().tolist()
    records = scrape_urls(unique_urls, delay=delay)

    # Fixed column order for the output file.
    column_order = [
        'url', 'title', 'type', 'status', 'year', 'origin', 'added', 'views',
        'tags', 'about', 'origin_text', 'precursors', 'spread',
        'search_interest', 'notable_examples', 'header_image',
    ]
    table = pd.DataFrame(records).reindex(columns=column_order)
    table.to_csv(out_csv, index=False)
    print(f'✓ saved → {out_csv}')


if __name__ == '__main__':
    # Run on the small test list of URLs.
    main(in_csv='test.csv', out_csv='kym_entries.csv', delay=1.0)



[    1/10]  GET https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face
[    2/10]  GET https://knowyourmeme.com/memes/%D0%B4%D1%80%D1%83%D0%B3-apyr
[    3/10]  GET https://knowyourmeme.com/memes/%D0%BD%D0%B5%D1%82-no-poster
[    4/10]  GET https://knowyourmeme.com/memes/%E0%B2%A0_%E0%B2%A0-look-of-disapproval
[    5/10]  GET https://knowyourmeme.com/memes/05x-a-presses-but-first-we-need-to-talk-about-parallel-universes
[    6/10]  GET https://knowyourmeme.com/memes/09-f9-11-02-9d-74-e3-5b-d8-41-56-c5-63-56-88-c0
[    7/10]  GET https://knowyourmeme.com/memes/1-2-buckle-my-shoe-3-4-buckle-some-more-5-6-nike-kicks
[    8/10]  GET https://knowyourmeme.com/memes/1-2-oatmeal
[    9/10]  GET https://knowyourmeme.com/memes/1-am-by-civ-pop-out-at-one-in-the-morning-ai-bear-song
[   10/10]  GET https://knowyourmeme.com/memes/1-billion-lions-vs-1-of-every-pokemon
✓ saved → kym_entries.csv


In [44]:
# Fetch one entry page and parse it so its markup can be inspected.
# A timeout is set so an unresponsive server cannot hang the kernel.
page = requests.get("https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face", timeout=20)
soup = BeautifulSoup(page.text, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en" xmlns="https://www.w3.org/1999/xhtml" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<title>
( ͡° ͜ʖ ͡°) / Lenny Face | Know Your Meme
</title>
<link href="https://knowyourmeme.com/memes/%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face" rel="canonical"/>
<link href="https://a.kym-cdn.com" rel="preconnect">
<link href="https://i.kym-cdn.com" rel="preconnect"/>
<link href="https://ads.blogherads.com" rel="preconnect"/>
<link href="https://a.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://i.kym-cdn.com" rel="dns-prefetch"/>
<link href="https://ads.blogherads.com" rel="dns-prefetch"/>
<link as="image" href="https://i.kym-cdn.com/entries/icons/mobile/000/011/764/LennyFace.jpg" rel="preload"/>
<link as="image" href="https://i.kym-cdn.com/featured_items/icons/wide/000/028/421/cover16.jpg" rel="preload"/>
<link as="image" href="https://i.kym-cdn.com/featured_items/icons/wide/000/028/420/maxresdefault.jpg" rel="preload"/>
<link as="image" href="https://i.kym-c