In [None]:
from bs4 import BeautifulSoup
import requests
import re
import os
from collections import deque
import json

In [None]:
# --- Crawl configuration ---
# Default depth budget handed to crawl_singer_info.
DEPTH_LIMIT = 3
# Directory (relative to the notebook) where crawl artefacts are written.
SAVE_DIR = "../raw_data"
# JSON file caching the links that were already rejected by the crawler.
EXCLUDED_PATH = os.path.join(SAVE_DIR, "excluded_links.json")

In [None]:
# Browser-like User-Agent header; crawl_singer_info sends it with every page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 11.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

In [None]:
# Seed articles (Vietnamese Wikipedia) from which the crawl starts.
# Titles are kept percent-encoded exactly as they appear in the article URLs.
SEED = [
    "https://vi.wikipedia.org/wiki/" + title
    for title in (
        "L%E1%BB%87_Quy%C3%AAn_(ca_s%C4%A9,_sinh_1981)",
        "Miu_L%C3%AA",
        "H%C3%B2a_Minzy",
        "M%E1%BB%B9_Linh",
        "Only_C",
        "JustaTee",
        "Ch%E1%BA%BF_Linh",
        "%C4%90%C3%A0m_V%C4%A9nh_H%C6%B0ng",
        "Mr._Siro",
    )
]

In [None]:
def remove_characters(text):
    """Strip Wikipedia edit markers and surrounding punctuation from a string.

    Args:
        text: Any value. Non-string values are passed through untouched.

    Returns:
        For strings: the text with the section-edit marker removed and with
        leading/trailing whitespace, round/square brackets and quote
        characters trimmed. For anything else: the value itself.
    """
    if not isinstance(text, str):
        return text

    # Drop the "[edit | edit source]" marker that Wikipedia appends to headings.
    without_marker = text.replace("[s·ª≠a|s·ª≠a m√£ ngu·ªìn]", "")
    # Trim brackets, quotes and whitespace, but only at the two ends.
    trimmed = re.sub(r'^[\s\(\)\[\]\'"]+|[\s\(\)\[\]\'"]+$', '', without_marker)
    return trimmed.strip()


In [None]:
def get_years(active_years):
    """Derive the first and last active year from "years active" infobox text.

    Args:
        active_years: A single period string (e.g. "2005-2010") or a list of
            such strings. Falsy input yields ``(None, None)``. Items that are
            empty or not strings are ignored.

    Returns:
        A ``(start, end)`` tuple.
        ``start`` is the earliest year that opens a period, or ``None`` when
        no period begins with a year (in which case ``end`` is ``None`` too).
        ``end`` is the latest closing year, or ``None`` when at least one
        period is open-ended, i.e. it ends with a "present" keyword or with a
        dangling dash such as "2015-".
    """
    if not active_years:
        return None, None

    if isinstance(active_years, str):
        active_years = [active_years]

    year_pattern = r'(?:19|20)\d{2}'
    is_active = False
    start_years = []
    end_years = []

    for period in active_years:
        if not period or not isinstance(period, str):
            continue

        # Normalise the various dash characters to a plain hyphen.
        p = re.sub(r'[‚Äì‚Äî‚àí]', '-', period.strip())

        # Collect every four-digit year plus any "present"-style keyword, in order.
        tokens = re.findall(r'\b(?:19|20)\d{2}\b|\b(?:nay|hi·ªán t·∫°i|present|now)\b', p, re.IGNORECASE)
        if not tokens:
            continue

        first, last = tokens[0], tokens[-1]

        # First token: the year the period starts (if it is a year at all).
        if re.match(year_pattern, first):
            start_years.append(int(first))

        # Last token: decides whether the period is closed or still running.
        if not re.match(year_pattern, last):
            # A "present" keyword closes the text: still active, no end year.
            is_active = True
        elif re.search(r'\b(?:19|20)\d{2}\b\s*-\s*$', p):
            # "2015-" : a year followed by a dangling dash is an open range.
            # The year is itself a token, so this must be checked here rather
            # than in a "no tokens" branch, which could never match.
            is_active = True
        else:
            # Closed range, or a lone year (active only within that year).
            end_years.append(int(last))

    if not start_years:
        return None, None

    start = min(start_years)
    end = None if is_active or not end_years else max(end_years)
    return start, end


In [None]:
def load_excluded_links():
    """Read the cached set of rejected links.

    Returns:
        A set built from the JSON array stored at EXCLUDED_PATH, or an empty
        set when that file does not exist.
    """
    if not os.path.exists(EXCLUDED_PATH):
        return set()

    with open(EXCLUDED_PATH, "r", encoding="utf-8") as cache_file:
        stored_links = json.load(cache_file)
    return set(stored_links)

In [None]:
def save_excluded_links(excluded_links):
    """Persist the rejected links to EXCLUDED_PATH as a sorted JSON array.

    Args:
        excluded_links: Iterable of link strings to store.

    The parent directory of EXCLUDED_PATH is created when missing, so the
    first run on a fresh checkout does not fail with FileNotFoundError.
    """
    target_dir = os.path.dirname(EXCLUDED_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(EXCLUDED_PATH, "w", encoding="utf-8") as f:
        # Sorted output keeps the file stable between runs (readable diffs).
        json.dump(sorted(excluded_links), f, ensure_ascii=False, indent=2)

In [None]:
def crawl_valid_links(url, timeout=6):
    """Collect the article links on a page that point to Vietnamese singers.

    The page body is scanned in document order for internal ``/wiki/`` links
    (namespace links containing ``:`` and anchors containing ``#`` are
    skipped). Scanning stops at the first h2/h3 heading whose text contains
    one of the stop headings. Each candidate is then fetched and kept only if
    its category box mentions one of the singer keywords.

    Links rejected by that category check are added to the exclusion cache
    (see load_excluded_links / save_excluded_links) so they are not fetched
    again. Links that merely failed to download are NOT cached, so a timeout
    or an HTTP error does not permanently drop a valid page.

    Args:
        url: Address of the page whose links are examined.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        List of absolute URLs that passed the category check, in page order
        and without duplicates. Empty when the page has no main content div.

    Raises:
        requests.exceptions.RequestException: if the page at ``url`` itself
            cannot be fetched (network failure, timeout or HTTP error status).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Sections to ignore: footnotes, references, external links.
    stop_headings = ["Ch√∫ th√≠ch", "Tham kh·∫£o", "Li√™n k·∫øt ngo√†i"]

    content = soup.find('div', id='mw-content-text')
    if not content:
        print("‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y n·ªôi dung ch√≠nh trong trang!")
        return []

    all_links = []

    # Walk anchors and headings together, in document order. Iterating over
    # container tags (p/ul/div/...) instead would visit the wrapper <div>
    # first and harvest every link of the page before any heading is seen,
    # which makes the stop headings ineffective.
    for element in content.find_all(['a', 'h2', 'h3']):
        if element.name in ('h2', 'h3'):
            heading_text = element.get_text(strip=True)
            if any(stop in heading_text for stop in stop_headings):
                print(f"üõë D·ª´ng t·∫°i m·ª•c: {heading_text}")
                break
            continue

        href = element.get('href')
        if href and href.startswith('/wiki/') and not any(x in href for x in [':', '#']):
            all_links.append("https://vi.wikipedia.org" + href)

    # Remove duplicates while keeping first-seen order.
    all_links = list(dict.fromkeys(all_links))
    print(f"üîç T√¨m th·∫•y {len(all_links)} ƒë∆∞·ªùng d·∫´n h·ª£p l·ªá trong trang ch√≠nh.")

    excluded_links = load_excluded_links()
    print(f"üìÇ B·ªè qua {len(excluded_links)} link ƒë√£ b·ªã lo·∫°i tr∆∞·ªõc ƒë√≥...")

    valid_links = []
    new_excluded = set()
    keywords = [
        "ca sƒ© vi·ªát nam", "nam ca sƒ© vi·ªát nam", "n·ªØ ca sƒ© vi·ªát nam",
        "ca sƒ© g·ªëc vi·ªát", "ca sƒ© h·∫£i ngo·∫°i", "nh·∫°c sƒ© vi·ªát nam"
    ]

    # Compare case-insensitively: keywords and category text are both lowered.
    keywords_lower = [k.lower() for k in keywords]

    for link in all_links:
        if link in excluded_links:
            print(f"‚è© B·ªè qua (ƒë√£ lo·∫°i tr∆∞·ªõc): {link}")
            continue

        try:
            sub_resp = requests.get(link, headers=headers, timeout=timeout)
            # An error page has no category box; without this check it would
            # be classified as "not a singer" and excluded for good.
            sub_resp.raise_for_status()
            sub_soup = BeautifulSoup(sub_resp.text, 'html.parser')
            cat_div = sub_soup.find('div', id='mw-normal-catlinks')

            if cat_div:
                cat_text = cat_div.get_text(strip=True).lower()
                if any(k in cat_text for k in keywords_lower):
                    valid_links.append(link)
                    print(f"‚úÖ Gi·ªØ l·∫°i: {link}")
                else:
                    print(f"‚ùå Lo·∫°i b·ªè: {link}")
                    new_excluded.add(link)
            else:
                print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y danh m·ª•c: {link}")
                new_excluded.add(link)

        except Exception as e:
            # Best effort per link: report and move on. The failure may be
            # transient, so the link is deliberately not added to the
            # persisted exclusion list and will be retried on a later run.
            print(f"‚ö†Ô∏è L·ªói khi truy c·∫≠p {link}: {e}")

    excluded_links.update(new_excluded)
    save_excluded_links(excluded_links)

    return valid_links


In [None]:
def _parse_infobox_rows(info_box):
    """Convert the <th>/<td> rows of an infobox table into a {label: value} dict."""
    fields = {}
    for row in info_box.find_all("tr"):
        header = row.find("th")
        data = row.find("td")
        if not (header and data):
            continue
        key = header.get_text(strip=True)

        # Case 1: the cell holds a <div class="hlist"> -> one entry per <li>.
        hlist_div = data.find("div", {"class": "hlist"})
        if hlist_div:
            fields[key] = [li.get_text(strip=True) for li in hlist_div.find_all("li")]
            continue

        # Case 2: the cell holds a plain <ul> -> one entry per <li>.
        ul_tag = data.find("ul")
        if ul_tag:
            fields[key] = [li.get_text(strip=True) for li in ul_tag.find_all("li")]
            continue

        # Case 3: values separated by <br> -> one entry per text fragment.
        if data.find("br"):
            parts = [text.strip() for text in data.stripped_strings]
            fields[key] = [p for p in parts if p]
        else:
            # Otherwise keep the cell as a single string.
            fields[key] = data.get_text(separator=' ', strip=True)
    return fields


def crawl_singer_info(start_urls, depth_limit=DEPTH_LIMIT, timeout=10):
    """Breadth-first crawl of singer pages, starting from ``start_urls``.

    Every visited page that has an infobox table produces one dict with:
    ``depth`` (remaining depth budget when the page was visited), ``name``
    (page heading, or None if the heading is missing), one entry per infobox
    row, the start/end years derived by get_years from the "years active"
    row, ``link``, an empty ``relations`` list and, when the depth budget
    still allows following links, ``collaborated_singers`` (the result of
    crawl_valid_links for that page).

    Args:
        start_urls: Iterable of page URLs used as BFS roots.
        depth_limit: Depth budget given to each root; pages reached through a
            link get the parent's budget minus one, and budget 0 is not crawled.
        timeout: Seconds to wait for each page request, so one stalled
            connection cannot hang the whole crawl.

    Returns:
        List of singer dicts, in the order the pages were crawled.
    """
    singers = []
    visited = set()  # URLs already processed, to avoid crawling a page twice
    queue = deque([(url, depth_limit) for url in start_urls])

    while queue:
        url, depth = queue.popleft()  # FIFO order -> breadth-first traversal
        if url in visited or depth <= 0:
            continue
        visited.add(url)

        try:
            print(f"Crawling {url} (depth={depth})...")
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Pages without an infobox are skipped entirely.
            info_box = soup.find("table", {"class": "infobox"})
            if not info_box:
                continue

            # A missing heading must not abort the crawl (and lose everything
            # collected so far) with an AttributeError.
            heading = soup.find("h1", {"id": "firstHeading"})
            singer_info = {
                'depth': depth,
                'name': heading.get_text(strip=True) if heading else None,
            }
            singer_info.update(_parse_infobox_rows(info_box))

            # --- Derived / bookkeeping fields ---
            singer_info['nƒÉm th√†nh l·∫≠p'], singer_info['nƒÉm tan r√£'] = get_years(singer_info.get('NƒÉm ho·∫°t ƒë·ªông'))
            singer_info['link'] = url
            singer_info['relations'] = []

            singers.append(singer_info)

            # --- Enqueue linked singers while there is depth budget left ---
            if depth - 1 > 0:
                colab_links = crawl_valid_links(url)
                singer_info['collaborated_singers'] = colab_links
                for link in colab_links:
                    if link not in visited:
                        queue.append((link, depth - 1))

        except requests.exceptions.RequestException as e:
            # Network errors, timeouts and HTTP error statuses: report, move on.
            print(f"Error fetching {url}: {e}")

    return singers

In [None]:
# Run the crawl from the seed pages and print everything that was collected.
# Nothing is written to disk here; the JSON dump happens in a later cell.
singers = crawl_singer_info(start_urls=SEED)
print(singers)


In [None]:
def _genre_set(singer):
    """Return a singer's genres as a set, whether stored as a list or one string."""
    genres = singer.get("Th·ªÉ lo·∫°i")
    if isinstance(genres, str):
        # A lone string is ONE genre; iterating it would compare characters.
        return {genres} if genres else set()
    if isinstance(genres, (list, tuple, set)):
        return {g for g in genres if g}
    return set()


def _add_relation(singer, other_link, relation_type):
    """Append a relation entry to ``singer`` unless an identical one exists."""
    relation = {'singer_link': other_link, 'type': relation_type}
    relations = singer.setdefault('relations', [])
    if relation not in relations:
        relations.append(relation)


def identify_relationships(singers):
    """Fill each singer's ``relations`` list from their ``collaborated_singers``.

    For every link in a singer's ``collaborated_singers`` that belongs to
    another crawled singer, a relation is recorded on BOTH singers:
    ``same_genre`` when they share at least one genre, otherwise
    ``collaborated``. Links to pages that were not crawled, and links from a
    singer to itself, are ignored. Identical relations are stored only once,
    so reciprocal links and re-running the function do not create duplicates.

    Args:
        singers: List of singer dicts (each with a ``link`` key). The dicts
            are modified in place.

    Returns:
        None.
    """
    # Index by link once instead of scanning the whole list for every link.
    by_link = {}
    for candidate in singers:
        by_link.setdefault(candidate['link'], candidate)

    for singer in singers:
        own_genres = _genre_set(singer)
        for link in singer.get('collaborated_singers', []):
            colab_singer = by_link.get(link)
            if colab_singer is None or colab_singer is singer:
                continue

            shares_genre = bool(own_genres & _genre_set(colab_singer))
            relation_type = 'same_genre' if shares_genre else 'collaborated'

            _add_relation(singer, link, relation_type)
            _add_relation(colab_singer, singer['link'], relation_type)

In [None]:

# Annotate the crawled singers in place: fills each singer's 'relations' list.
identify_relationships(singers)

In [None]:
import json

# Persist the crawled singers as pretty-printed UTF-8 JSON under SAVE_DIR.
# The directory is created first so a fresh checkout does not fail on open(),
# and the path is derived from SAVE_DIR rather than repeating it as a literal.
os.makedirs(SAVE_DIR, exist_ok=True)
singers_path = os.path.join(SAVE_DIR, 'singer_demo.json')
with open(singers_path, 'w', encoding='utf-8') as f:
    json.dump(singers, f, ensure_ascii=False, indent=4)

In [None]:
# Number of singer records collected by the crawl.
print(len(singers))

In [None]:
# Keywords used to spot singers tied to VNCH (the Republic of Vietnam, i.e. South Vietnam).
south_vietnamese_keyword = [
    "vi·ªát nam c·ªông ho√†",
    "s√†i g√≤n",
    "nam k√¨",
    "nam k·ª≥",
]

In [2]:
# Find the singers associated with VNCH: keep a singer when any of the
# nationality / birthplace / place-of-activity fields mentions a keyword.
def _mentions_south_vietnam(value):
    """True when a string, or any item of a list, contains a keyword (case-insensitive)."""
    if isinstance(value, list):
        return any(
            any(kw.lower() in item.lower() for kw in south_vietnamese_keyword)
            for item in value
        )
    if isinstance(value, str):
        return any(kw.lower() in value.lower() for kw in south_vietnamese_keyword)
    return False


south_vietnamese_singers = [
    singer
    for singer in singers
    if any(
        key in singer and _mentions_south_vietnam(singer[key])
        for key in ["Qu·ªëc t·ªãch", "N∆°i sinh", "N∆°i ho·∫°t ƒë·ªông"]
    )
]

NameError: name 'singers' is not defined