In [None]:
!pip install bs4 requests

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import os
from collections import deque
import json

In [None]:
# Directory, relative to the notebook's working directory, used for crawl output.
SAVE_DIR = r"../raw_data"
# JSON file read/written by load_excluded_links / save_excluded_links.
EXCLUDED_PATH = os.path.join(SAVE_DIR, "excluded_links.json")
# Default depth budget crawl_singer_info assigns to every start URL.
DEPTH_LIMIT = 5

In [None]:
# Browser-like User-Agent header sent by crawl_singer_info when it fetches a page.
# (crawl_valid_links builds its own local `headers` dict and does not use this one.)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [None]:
# Start URLs (vi.wikipedia.org articles) passed to crawl_singer_info as the crawl frontier.
SEED = ["https://vi.wikipedia.org/wiki/L%E1%BB%87_Quy%C3%AAn_(ca_s%C4%A9,_sinh_1981)", 
        "https://vi.wikipedia.org/wiki/Miu_L%C3%AA", 
        "https://vi.wikipedia.org/wiki/H%C3%B2a_Minzy",
        "https://vi.wikipedia.org/wiki/M%E1%BB%B9_Linh",
        "https://vi.wikipedia.org/wiki/Only_C",
        "https://vi.wikipedia.org/wiki/JustaTee",
        "https://vi.wikipedia.org/wiki/Ch%E1%BA%BF_Linh",
        "https://vi.wikipedia.org/wiki/%C4%90%C3%A0m_V%C4%A9nh_H%C6%B0ng",
        "https://vi.wikipedia.org/wiki/Mr._Siro"
        ]

In [None]:
def remove_characters(text):
    """Strip the section-edit marker and surrounding brackets/quotes from a string.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Drop the literal edit-link marker wherever it occurs.
    without_marker = text.replace("[s·ª≠a|s·ª≠a m√£ ngu·ªìn]", "")
    # Trim runs of whitespace, parentheses, square brackets and quotes at both ends.
    trimmed = re.sub(r'^[\s\(\)\[\]\'"]+|[\s\(\)\[\]\'"]+$', '', without_marker)
    return trimmed.strip()


In [None]:
def get_years(active_years):
    """Extract the first and last active year from "years active" text.

    Each period is scanned for four-digit years (19xx/20xx) and for the
    "still active" keywords. The first token of a period supplies a start
    year and the last token supplies an end year. A period that ends with a
    keyword, or with a dangling dash such as "2015-", marks the subject as
    still active.

    Args:
        active_years: One period string or an iterable of period strings.
            Falsy input yields ``(None, None)``. Entries that are empty or
            not strings are skipped.

    Returns:
        tuple: ``(start, end)`` where ``start`` is the smallest start year and
        ``end`` is the largest end year, or ``None`` when any period is
        open-ended. ``(None, None)`` when no start year was found.
    """
    if not active_years:
        return None, None

    if isinstance(active_years, str):
        active_years = [active_years]

    is_active = False
    start_years = []
    end_years = []

    for period in active_years:
        if not period or not isinstance(period, str):
            continue
        p = period.strip()

        # Normalize the various dash characters to a plain hyphen.
        p = re.sub(r'[‚Äì‚Äî‚àí]', '-', p)

        # Collect every year and every "still active" keyword, in order.
        tokens = re.findall(r'\b(?:19|20)\d{2}\b|\b(?:nay|hi·ªán t·∫°i|present|now)\b', p, re.IGNORECASE)
        if not tokens:
            continue

        # First token: the start year of this period (if it is a year).
        first = tokens[0]
        if re.match(r'(?:19|20)\d{2}', first):
            start_years.append(int(first))

        # Last token: either an end year or an open-ended marker.
        last = tokens[-1]
        if re.match(r'(nay|hi·ªán t·∫°i|present|now)', last, re.IGNORECASE):
            is_active = True
        elif re.search(r'-\s*$', p):
            # "2015-" style: a year followed by a dangling dash has no end year.
            # The original special case for this form could never run, so the
            # start year was wrongly reported as the end year.
            is_active = True
        elif re.match(r'(?:19|20)\d{2}', last):
            # Also covers a lone year, which counts as both start and end.
            end_years.append(int(last))

    if not start_years:
        return None, None

    start = min(start_years)
    # Guard against an empty list so max() can never raise here.
    end = None if is_active or not end_years else max(end_years)
    return start, end


In [None]:
def load_excluded_links():
    """Return the links stored in ``EXCLUDED_PATH`` as a set.

    Returns:
        set: Contents of the JSON file, or an empty set when the file does
        not exist.
    """
    if not os.path.exists(EXCLUDED_PATH):
        return set()

    with open(EXCLUDED_PATH, "r", encoding="utf-8") as handle:
        stored = json.load(handle)
    return set(stored)

In [None]:
def save_excluded_links(excluded_links):
    """Write the excluded links to ``EXCLUDED_PATH`` as a sorted JSON array.

    Args:
        excluded_links: Iterable of link strings to persist.
    """
    # open(..., "w") does not create missing parent directories, so make sure
    # the target directory exists before the first save.
    target_dir = os.path.dirname(EXCLUDED_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(EXCLUDED_PATH, "w", encoding="utf-8") as f:
        json.dump(sorted(excluded_links), f, ensure_ascii=False, indent=2)

In [None]:
def load_crawled_links():
    """Read the set of previously crawled links from ``crawled_links.json``.

    NOTE: a later cell of this notebook defines a function with the same name;
    in a top-to-bottom run that later definition replaces this one.

    Returns:
        set: Links stored in the file, or an empty set when it does not exist.
    """
    path = "crawled_links.json"
    if not os.path.exists(path):
        return set()
    with open(path, "r", encoding="utf-8") as fh:
        return set(json.load(fh))

def save_crawled_links(data):
    """Write the crawled links to ``crawled_links.json`` as a JSON array.

    NOTE: a later cell of this notebook defines a function with the same name;
    in a top-to-bottom run that later definition replaces this one.

    Args:
        data: Iterable of link strings to persist.
    """
    with open("crawled_links.json", "w", encoding="utf-8") as fh:
        links = list(data)
        json.dump(links, fh, ensure_ascii=False, indent=2)


# ====== H√ÄM CH√çNH ======
def crawl_valid_links(url):
    """Fetch ``url`` and return the links on it that pass the category filter.

    Links already present in the excluded or crawled JSON files are skipped.
    Every other ``/wiki/`` link is fetched and kept when the text of its
    ``mw-normal-catlinks`` box contains one of the keywords. Rejected links,
    links without that box, and links whose fetch raised are added to the
    excluded file; every checked link is added to the crawled file.

    NOTE: a later cell of this notebook defines ``crawl_valid_links`` again;
    in a top-to-bottom run that later definition replaces this one.

    Args:
        url: Address of the page whose outgoing links are inspected.

    Returns:
        list: Links kept during this call only (empty when the page has no
        ``mw-content-text`` div).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Heading texts at which the element loop below breaks.
    stop_headings = ["Ch√∫ th√≠ch", "Tham kh·∫£o", "Li√™n k·∫øt ngo√†i"]

    content = soup.find('div', id='mw-content-text')
    if not content:
        print("‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y n·ªôi dung ch√≠nh trong trang!")
        return []

    all_links = []

    for element in content.find_all(['p', 'ul', 'ol', 'div', 'h2', 'h3'], recursive=True):
        if element.name in ['h2', 'h3']:
            heading_text = element.get_text(strip=True)
            if any(stop in heading_text for stop in stop_headings):
                print(f"üõë D·ª´ng t·∫°i m·ª•c: {heading_text}")
                break

        # NOTE(review): find_all('a') also returns anchors nested deeper inside
        # this element, so a wrapping div contributes all of its descendants'
        # links before the headings inside it are reached — confirm intended.
        for link in element.find_all('a', href=True):
            href = link['href']
            # Keep article links only: no namespace prefix (':') and no fragment ('#').
            if href.startswith('/wiki/') and not any(x in href for x in [':', '#']):
                full_url = "https://vi.wikipedia.org" + href
                all_links.append(full_url)

    # Remove duplicates within the current page while keeping first-seen order.
    all_links = list(dict.fromkeys(all_links))
    print(f"üîç T√¨m th·∫•y {len(all_links)} ƒë∆∞·ªùng d·∫´n h·ª£p l·ªá trong trang ch√≠nh.")

    excluded_links = load_excluded_links()
    crawled_links = load_crawled_links()

    print(f"üìÇ B·ªè qua {len(excluded_links)} link ƒë√£ b·ªã lo·∫°i tr∆∞·ªõc ƒë√≥...")
    print(f"üß≠ B·ªè qua {len(crawled_links)} link ƒë√£ ƒë∆∞·ª£c crawl tr∆∞·ªõc ƒë√≥...")

    valid_links = []
    new_excluded = set()
    new_crawled = set()

    # Category phrases that mark a page as relevant; compared in lowercase.
    keywords = [
        "ca sƒ© vi·ªát nam", "nam ca sƒ© vi·ªát nam", "n·ªØ ca sƒ© vi·ªát nam",
        "ca sƒ© g·ªëc vi·ªát", "ca sƒ© h·∫£i ngo·∫°i", "nh·∫°c sƒ© vi·ªát nam",
        "ban nh·∫°c vi·ªát nam", "ban nh·∫°c rock vi·ªát nam",
        "nh√† s·∫£n xu·∫•t thu √¢m vi·ªát nam", "nh√† s·∫£n xu·∫•t √¢m nh·∫°c vi·ªát nam",
        "nh·∫°c sƒ© h√≤a √¢m ph·ªëi kh√≠ vi·ªát nam", "rapper vi·ªát nam"
    ]
    keywords_lower = [k.lower() for k in keywords]

    for link in all_links:
        if link in excluded_links or link in crawled_links:
            print(f"‚è© B·ªè qua (ƒë√£ lo·∫°i ho·∫∑c ƒë√£ crawl): {link}")
            continue

        try:
            sub_resp = requests.get(link, headers=headers, timeout=6)
            sub_soup = BeautifulSoup(sub_resp.text, 'html.parser')
            cat_div = sub_soup.find('div', id='mw-normal-catlinks')

            if cat_div:
                cat_text = cat_div.get_text(strip=True).lower()
                if any(k in cat_text for k in keywords_lower):
                    valid_links.append(link)
                    print(f"‚úÖ Gi·ªØ l·∫°i: {link}")
                else:
                    print(f"‚ùå Lo·∫°i b·ªè: {link}")
                    new_excluded.add(link)
            else:
                print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y danh m·ª•c: {link}")
                new_excluded.add(link)

        except Exception as e:
            # Any failure while fetching/parsing also puts the link on the excluded list.
            print(f"‚ö†Ô∏è L·ªói khi truy c·∫≠p {link}: {e}")
            new_excluded.add(link)

        # Whether kept or rejected, the link is still marked as crawled.
        new_crawled.add(link)

    # Merge this call's results into the persisted sets and write them back.
    excluded_links.update(new_excluded)
    crawled_links.update(new_crawled)
    save_excluded_links(excluded_links)
    save_crawled_links(crawled_links)

    print(f"üíæ ƒê√£ l∆∞u {len(new_crawled)} link m·ªõi v√†o danh s√°ch ƒë√£ crawl.")
    print(f"‚úÖ T·ªïng s·ªë link h·ª£p l·ªá m·ªõi: {len(valid_links)}")

    return valid_links

In [None]:
def crawl_singer_award(soup):
    """Collect award names from the ``wikitable`` tables of a parsed page.

    For every table row with at least two cells, the second cell is used when
    it contains a ``<sup>`` tag. The award name is the text of that cell's
    direct children up to (not including) the first direct ``<sup>`` child,
    with runs of whitespace collapsed to single spaces.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        list: Distinct, non-empty award names (order not guaranteed).
    """

    def text_before_first_sup(cell):
        # Walk the direct children and stop at the first <sup>.
        pieces = []
        for node in cell.contents:
            if node.name == "sup":
                break
            if isinstance(node, str):
                pieces.append(node)
            elif hasattr(node, "get_text"):
                pieces.append(node.get_text())
        # Joining on split() collapses newlines and repeated spaces.
        return " ".join("".join(pieces).split())

    found = set()  # a set removes duplicate award names
    for table in soup.find_all("table", class_="wikitable"):
        for row in table.find_all("tr"):
            cells = row.find_all(["td", "th"])
            # Need a second cell, and only cells carrying a <sup> are used.
            if len(cells) >= 2 and cells[1].find("sup"):
                label = text_before_first_sup(cells[1])
                if label:
                    found.add(label)

    return list(found)


In [None]:
def load_crawled_links():
    """Read the set of links that have already been crawled.

    Returns:
        set: Links stored in ``crawled_links.json``, or an empty set when the
        file does not exist.
    """
    crawled_file = "crawled_links.json"
    if not os.path.exists(crawled_file):
        return set()

    with open(crawled_file, "r", encoding="utf-8") as source:
        links = json.load(source)
    return set(links)

def save_crawled_links(data):
    """Save the crawled links to ``crawled_links.json`` as a sorted JSON array.

    Sorting gives the file a stable order from run to run, matching how
    ``save_excluded_links`` writes its file.

    Args:
        data: Iterable of link strings to persist.
    """
    with open("crawled_links.json", "w", encoding="utf-8") as f:
        json.dump(sorted(data), f, ensure_ascii=False, indent=2)

In [None]:
def crawl_valid_links(url):
    """Return the links on ``url`` that point to pages passing the category filter.

    The article body is scanned in document order; scanning stops at the first
    h2/h3 heading whose text contains one of ``stop_headings``. Each collected
    ``/wiki/`` link that is not yet in the excluded/crawled caches is fetched
    and kept when the text of its ``mw-normal-catlinks`` box contains one of
    the keywords.

    Side effects: rewrites the excluded-links file, ``crawled_links.json`` and
    ``valid_links.json``.

    Args:
        url: Address of the page whose outgoing links are inspected.

    Returns:
        list: Valid links that appear on this page, in page order. Includes
        links validated by this call and links already recorded in
        ``valid_links.json``. Empty when the page has no ``mw-content-text``
        div.

    Raises:
        requests.exceptions.RequestException: If fetching ``url`` itself fails
            (including a timeout).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    stop_headings = ["Ch√∫ th√≠ch", "Tham kh·∫£o", "Li√™n k·∫øt ngo√†i"]
    content = soup.find('div', id='mw-content-text')
    if not content:
        print("‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y n·ªôi dung ch√≠nh trong trang!")
        return []

    # Walk anchors and headings together, in document order. Iterating over
    # container elements and calling find_all('a') on each (as before) let the
    # outermost wrapper div contribute every anchor of the page before any
    # heading was reached, so the stop-heading break never had an effect.
    all_links = []
    for element in content.find_all(['a', 'h2', 'h3']):
        if element.name in ('h2', 'h3'):
            heading_text = element.get_text(strip=True)
            if any(stop in heading_text for stop in stop_headings):
                print(f"üõë D·ª´ng t·∫°i m·ª•c: {heading_text}")
                break
            continue

        href = element.get('href')
        # Keep article links only: no namespace prefix (':') and no fragment ('#').
        if href and href.startswith('/wiki/') and not any(x in href for x in [':', '#']):
            all_links.append("https://vi.wikipedia.org" + href)

    # De-duplicate while keeping first-seen order.
    all_links = list(dict.fromkeys(all_links))
    print(f"üîç T√¨m th·∫•y {len(all_links)} ƒë∆∞·ªùng d·∫´n h·ª£p l·ªá trong trang ch√≠nh.")

    excluded_links = load_excluded_links()
    crawled_links = load_crawled_links()

    print(f"üìÇ B·ªè qua {len(excluded_links)} link ƒë√£ b·ªã lo·∫°i tr∆∞·ªõc ƒë√≥...")
    print(f"üß≠ B·ªè qua {len(crawled_links)} link ƒë√£ ƒë∆∞·ª£c crawl tr∆∞·ªõc ƒë√≥...")

    valid_links_new = []
    new_excluded = set()
    new_crawled = set()

    # Category phrases that mark a page as relevant; compared in lowercase.
    keywords = [
        "ca sƒ© vi·ªát nam", "nam ca sƒ© vi·ªát nam", "n·ªØ ca sƒ© vi·ªát nam",
        "ca sƒ© g·ªëc vi·ªát", "ca sƒ© h·∫£i ngo·∫°i", "nh·∫°c sƒ© vi·ªát nam",
        "ban nh·∫°c vi·ªát nam", "ban nh·∫°c rock vi·ªát nam",
        "nh√† s·∫£n xu·∫•t thu √¢m vi·ªát nam", "nh√† s·∫£n xu·∫•t √¢m nh·∫°c vi·ªát nam",
        "nh·∫°c sƒ© h√≤a √¢m ph·ªëi kh√≠ vi·ªát nam", "rapper vi·ªát nam"
    ]
    keywords_lower = [k.lower() for k in keywords]

    for link in all_links:
        if link in excluded_links or link in crawled_links:
            print(f"‚è© B·ªè qua (ƒë√£ lo·∫°i ho·∫∑c ƒë√£ crawl): {link}")
            continue

        try:
            sub_resp = requests.get(link, headers=headers, timeout=6)
            # An HTTP error page has no category box; treat it as a failed
            # fetch instead of a page that failed the filter.
            sub_resp.raise_for_status()
            sub_soup = BeautifulSoup(sub_resp.text, 'html.parser')
            cat_div = sub_soup.find('div', id='mw-normal-catlinks')
        except Exception as e:
            print(f"‚ö†Ô∏è L·ªói khi truy c·∫≠p {link}: {e}")
            # A failed fetch says nothing about the page itself, so the link is
            # recorded as neither excluded nor crawled and is retried next time.
            continue

        if cat_div:
            cat_text = cat_div.get_text(strip=True).lower()
            if any(k in cat_text for k in keywords_lower):
                valid_links_new.append(link)
                print(f"‚úÖ Gi·ªØ l·∫°i: {link}")
            else:
                print(f"‚ùå Lo·∫°i b·ªè: {link}")
                new_excluded.add(link)
        else:
            print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y danh m·ª•c: {link}")
            new_excluded.add(link)

        # Whether kept or rejected, a successfully inspected link is marked as crawled.
        new_crawled.add(link)

    excluded_links.update(new_excluded)
    crawled_links.update(new_crawled)
    save_excluded_links(excluded_links)
    save_crawled_links(crawled_links)

    print(f"üíæ ƒê√£ l∆∞u {len(new_crawled)} link m·ªõi v√†o danh s√°ch ƒë√£ crawl.")
    print(f"‚úÖ T·ªïng s·ªë link h·ª£p l·ªá m·ªõi: {len(valid_links_new)}")

    # Merge with the valid links recorded by earlier calls and persist the union.
    valid_links_all = set(valid_links_new)
    if os.path.exists("valid_links.json"):
        with open("valid_links.json", "r", encoding="utf-8") as f:
            valid_links_all.update(json.load(f))

    with open("valid_links.json", "w", encoding="utf-8") as f:
        json.dump(list(valid_links_all), f, ensure_ascii=False, indent=2)

    print(f"üì¶ T·ªïng h·ª£p t·∫•t c·∫£ link h·ª£p l·ªá: {len(valid_links_all)}")

    # Return only the valid links that occur on THIS page. Returning the whole
    # accumulated set would attribute links found on other pages to this one.
    return [link for link in all_links if link in valid_links_all]


In [None]:
def _infobox_value(data):
    """Convert one infobox ``<td>`` into a list (multi-valued cell) or a string."""
    # Case 1: the cell contains <div class="hlist"> -> one entry per <li>.
    hlist_div = data.find("div", {"class": "hlist"})
    if hlist_div:
        return [li.get_text(strip=True) for li in hlist_div.find_all("li")]

    # Case 2: the cell contains a <ul> -> one entry per <li>.
    ul_tag = data.find("ul")
    if ul_tag:
        return [li.get_text(strip=True) for li in ul_tag.find_all("li")]

    # Case 3: <br>-separated content -> one entry per non-empty text fragment.
    if data.find("br"):
        return [part for part in (text.strip() for text in data.stripped_strings) if part]

    return data.get_text(separator=' ', strip=True)


def _gender_from_categories(cat_div):
    """Return the gender marker given by the first word of a category name, or None.

    Each category link is inspected on its own and only its first word is
    compared. A plain substring test on the concatenated text matched the
    "nam" inside "... nam" country suffixes and so tagged every page as male.
    """
    first_words = set()
    for cat_link in cat_div.find_all('a'):
        words = cat_link.get_text(strip=True).lower().split()
        if words:
            first_words.add(words[0])

    if 'nam' in first_words:
        return 'nam'
    if 'n·ªØ' in first_words:
        return 'n·ªØ'
    return None


def crawl_singer_info(start_urls, depth_limit=DEPTH_LIMIT):
    """Breadth-first crawl of pages starting from ``start_urls``.

    Every visited page that has an ``infobox`` table yields one record holding
    the infobox fields, the derived start/end years, the awards found by
    ``crawl_singer_award``, the page link and, when available, a gender marker.
    While depth remains, the links returned by ``crawl_valid_links`` are stored
    under ``collaborated_singers`` and queued with ``depth - 1``.

    Args:
        start_urls: Iterable of page URLs to start from.
        depth_limit: Depth budget assigned to each start URL; a page is only
            processed while its remaining depth is positive.

    Returns:
        list[dict]: One record per page that had an infobox.
    """
    singers = []
    visited = set()  # avoids processing the same URL twice
    queue = deque([(url, depth_limit) for url in start_urls])

    while queue:
        url, depth = queue.popleft()  # FIFO -> breadth-first order
        if url in visited or depth <= 0:
            continue
        visited.add(url)

        try:
            print(f"Crawling {url} (depth={depth})...")
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Pages without an infobox are skipped entirely.
            info_box = soup.find("table", {"class": "infobox"})
            if not info_box:
                continue

            # Guard against a missing heading instead of raising AttributeError.
            heading = soup.find("h1", {"id": "firstHeading"})
            singer_info = {
                'depth': depth,
                'name': heading.get_text(strip=True) if heading else "",
            }

            for row in info_box.find_all("tr"):
                header = row.find("th")
                data = row.find("td")
                if header and data:
                    singer_info[header.get_text(strip=True)] = _infobox_value(data)

            # --- Derived / additional fields ---
            singer_info['nƒÉm th√†nh l·∫≠p'], singer_info['nƒÉm tan r√£'] = get_years(singer_info.get('NƒÉm ho·∫°t ƒë·ªông'))
            singer_info['link'] = url
            singer_info['relations'] = []
            singer_info['awards'] = crawl_singer_award(soup)

            cat_div = soup.find('div', id='mw-normal-catlinks')
            if cat_div:
                gender = _gender_from_categories(cat_div)
                if gender:
                    singer_info['gi·ªõi t√≠nh'] = gender

            singers.append(singer_info)

            # --- Queue the related pages while depth remains ---
            if depth - 1 > 0:
                colab_links = crawl_valid_links(url)
                singer_info['collaborated_singers'] = colab_links
                for link in colab_links:
                    if link not in visited:
                        queue.append((link, depth - 1))

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")

    return singers

In [None]:
# Run the crawl from the seed pages and print every collected record.
singers = crawl_singer_info(SEED)
print(singers)
# NOTE(review): nothing in this cell writes `singers` to disk; in the visible
# notebook the only write to singers.json is in the final cell.


In [None]:
import json

In [None]:
# Rename fields for clarity and derive a normalized origin for every record.
# Written so that re-running the cell does not destroy already-renamed values.
for singer in singers:
    if "nƒÉm th√†nh l·∫≠p" in singer and "nƒÉm tan r√£" in singer:
        singer["nƒÉm ra m·∫Øt"] = singer.pop("nƒÉm th√†nh l·∫≠p", None)
        singer["nƒÉm gi·∫£i ngh·ªá"] = singer.pop("nƒÉm tan r√£", None)

    # Move the occupation field only when the source key still exists. An
    # unconditional pop(..., None) overwrote the renamed value with None on
    # the second run of this cell.
    if "Ngh·ªÅ nghi·ªáp" in singer:
        singer["Ngh·ªÅ nghi·ªáp kh√°c"] = singer.pop("Ngh·ªÅ nghi·ªáp")
    else:
        singer.setdefault("Ngh·ªÅ nghi·ªáp kh√°c", None)

    # Fill the origin (if not present) from the first related field that exists.
    if "Nguy√™n qu√°n" not in singer:
        for source_key in ("Qu√™ qu√°n", "qu√™ qu√°n", "N∆°i sinh", "n∆°i sinh"):
            if source_key in singer:
                singer["Nguy√™n qu√°n"] = singer.pop(source_key)
                break
        else:
            singer["Nguy√™n qu√°n"] = ""

    # Normalize: keep only the part before the first comma.
    origin = singer.get("Nguy√™n qu√°n", "")
    if isinstance(origin, str) and "," in origin:
        singer["Nguy√™n qu√°n"] = origin.split(",")[0].strip()


In [None]:
import re
from datetime import datetime 

# Matches "<day> <month-word> <month>[,] [<year-word>] <year>" with optional
# (non-breaking) spaces between the parts.
date_pattern = re.compile(
    r'(?P<day>\d{1,2})[\s\xa0]*th√°ng[\s\xa0]*(?P<month>\d{1,2})[,Ôºå\s\xa0]*(?:nƒÉm[\s\xa0]*)?(?P<year>\d{4})',
    re.IGNORECASE
)


def _parse_birth_date(text):
    """Return the ISO date found in ``text``.

    Returns ``None`` when ``text`` holds no date, the ISO string for a valid
    date, and ``""`` when the matched numbers do not form a valid calendar date.
    """
    match = date_pattern.search(text)
    if not match:
        return None
    print(f"T√¨m th·∫•y: {match.group(0)}")
    try:
        return datetime(
            int(match.group("year")),
            int(match.group("month")),
            int(match.group("day")),
        ).date().isoformat()
    except ValueError:
        return ""


for singer in singers:
    if "ng√†y sinh" in singer:
        s = singer["ng√†y sinh"]
        if isinstance(s, str):
            parsed = _parse_birth_date(s.replace('\xa0', ' '))
            # A value without a recognizable date is left untouched.
            if parsed is not None:
                singer["ng√†y sinh"] = parsed

    elif "Sinh" in singer:
        sinh_data = singer.get("Sinh", [])
        # The infobox parser stores single-valued cells as a plain string;
        # treat that like a one-element list instead of ignoring it.
        if isinstance(sinh_data, str):
            sinh_data = [sinh_data]
        if isinstance(sinh_data, list):
            for s in sinh_data:
                if not isinstance(s, str):
                    continue
                s = s.replace('\xa0', ' ')  # drop non-breaking spaces
                print(f"Ki·ªÉm tra chu·ªói: {s}")
                parsed = _parse_birth_date(s)
                if parsed is not None:
                    singer["ng√†y sinh"] = parsed
                    break
        # Every record gets the key, even when no date could be extracted.
        singer.setdefault("ng√†y sinh", "")
    else:
        singer["ng√†y sinh"] = ""


In [None]:

# Matches one (...) or [...] group together with the whitespace around it.
clean_pattern = re.compile(r'\s*(\([^)]*\)|\[[^\]]*\])\s*')

for singer in singers:
    name = singer.get("name", "")
    if not isinstance(name, str):
        continue

    # Replace each bracketed group with a single space rather than nothing.
    # The pattern also consumes the surrounding whitespace, so substituting ''
    # would glue neighbouring words together ("A (x) B" -> "AB").
    cleaned = clean_pattern.sub(' ', name)

    # Collapse repeated whitespace and trim the ends.
    singer["name"] = re.sub(r'\s+', ' ', cleaned).strip()

    

In [None]:
import re

# Matches one (...) or [...] group together with the whitespace around it.
clean_pattern = re.compile(r'\s*(\([^)]*\)|\[[^\]]*\])\s*')


def _clean_label(text):
    """Return ``text`` without bracketed groups, or "" when nothing usable remains."""
    # Entries made only of brackets, whitespace, digits, dashes and commas carry no name.
    if re.fullmatch(r'[\(\)\[\]\s\d\-‚Äì,]*', text.strip()):
        return ""
    # Substitute a space (not '') so words on both sides of a removed group
    # stay separated; then collapse whitespace and trim.
    cleaned = clean_pattern.sub(' ', text)
    return re.sub(r'\s+', ' ', cleaned).strip()


for singer in singers:
    data = singer.get("H√£ng ƒëƒ©a", None)
    if data is None:
        # No label field at all: record the default value.
        singer["H√£ng ƒëƒ©a"] = "ƒê·ªôc l·∫≠p"
        continue

    if isinstance(data, list):
        # Clean every string entry and drop the ones that end up empty.
        cleaned_list = []
        for item in data:
            if not isinstance(item, str):
                continue
            cleaned = _clean_label(item)
            if cleaned:
                cleaned_list.append(cleaned)
        singer["H√£ng ƒëƒ©a"] = cleaned_list

    elif isinstance(data, str):
        # A string that cleans down to nothing becomes the empty string.
        singer["H√£ng ƒëƒ©a"] = _clean_label(data)


In [None]:
# Rename the 'dong nhac' key to 'the loai' on every record that has it.
# NOTE(review): both keys are unaccented ASCII, whereas the other cells read
# accented field names taken from the page; verify this key ever occurs.
for singer in singers:
    try:
        singer['the loai'] = singer.pop('dong nhac')
    except KeyError:
        # Record has no such field; nothing to rename.
        pass

In [None]:
def identify_relationships(singers):
    """Fill each record's ``relations`` list from its ``collaborated_singers`` links.

    For every link in a record's ``collaborated_singers`` that matches the
    ``link`` of another record in ``singers``, a relation is stored on both
    records. It always has type ``collaborated``, plus ``same_genre``,
    ``same_origin``, ``same_label`` and ``same_institution`` when the
    corresponding fields agree. Links to unknown pages and links back to the
    record itself are ignored. Records are modified in place.

    Args:
        singers: List of record dicts; each must have a ``link`` key.

    Returns:
        None
    """

    def as_set(value):
        # A single-valued field is stored as a plain string. set("Pop") would
        # split it into characters and produce bogus overlaps, so wrap it.
        if not value:
            return set()
        if isinstance(value, str):
            return {value}
        return set(value)

    def normalize_label(label):
        # Lower-cased, trimmed text; for a list only the first entry is used.
        if not label:
            return ""
        if isinstance(label, list):
            label = label[0] if label else ""
        return str(label).strip().lower()

    def add_relation(owner, target_link, types):
        # Merge into an existing relation with the same target, else append a new one.
        existing = next((r for r in owner['relations'] if r['singer_link'] == target_link), None)
        if existing:
            for t in types:
                if t not in existing['type']:
                    existing['type'].append(t)
        else:
            owner['relations'].append({'singer_link': target_link, 'type': list(types)})

    # Make sure every record has a 'relations' list.
    for s in singers:
        if 'relations' not in s:
            s['relations'] = []

    # Index by link once instead of scanning the list for every link.
    # setdefault keeps the first record when a link occurs more than once.
    by_link = {}
    for s in singers:
        by_link.setdefault(s['link'], s)

    for singer in singers:
        for link in singer.get('collaborated_singers', []):
            colab_singer = by_link.get(link)
            # Skip unknown pages and links that point back to the record itself.
            if colab_singer is None or link == singer['link']:
                continue

            relation_types = []

            # --- Same genre ---
            if as_set(singer.get("Th·ªÉ lo·∫°i")) & as_set(colab_singer.get("Th·ªÉ lo·∫°i")):
                relation_types.append("same_genre")

            # --- Collaboration (always present for a matched link) ---
            relation_types.append("collaborated")

            # --- Same origin ---
            if singer.get("Nguy√™n qu√°n") and colab_singer.get("Nguy√™n qu√°n"):
                if singer["Nguy√™n qu√°n"] == colab_singer["Nguy√™n qu√°n"]:
                    relation_types.append("same_origin")

            # --- Same record label (the default label value does not count) ---
            label_a_norm = normalize_label(singer.get("H√£ng ƒëƒ©a"))
            label_b_norm = normalize_label(colab_singer.get("H√£ng ƒëƒ©a"))
            if label_a_norm and label_b_norm and label_a_norm == label_b_norm and label_a_norm != "ƒë·ªôc l·∫≠p":
                relation_types.append("same_label")

            # --- Same training institution ---
            if singer.get("ƒê√†o t·∫°o") and colab_singer.get("ƒê√†o t·∫°o"):
                if singer["ƒê√†o t·∫°o"] == colab_singer["ƒê√†o t·∫°o"]:
                    relation_types.append("same_institution")

            # Store the relation in both directions.
            add_relation(singer, link, relation_types)
            add_relation(colab_singer, singer['link'], relation_types)


In [None]:
# Populate each record's 'relations' list in place (the function returns nothing).
identify_relationships(singers)

In [None]:
import json

In [None]:
# Load singer records from the JSON file on disk.
# NOTE(review): this rebinds `singers`, so in a top-to-bottom run the in-memory
# results of the crawl, cleaning and relationship cells above are replaced by
# whatever this file contains. No earlier cell in the visible notebook writes
# this file — confirm the intended cell order.
with open('../raw_data/singers.json', 'r', encoding='utf-8') as f:
    singers = json.load(f)



In [None]:
# Remove the fields that are not wanted in the exported records.
# pop(..., None) makes a missing field a no-op.
for singer in singers:
    for field in (
        "Sinh",
        "Ph·ªëi ng·∫´u",
        "Nh·∫°c c·ª•",
        "B·∫°n ƒë·ªùi",
        "Con c√°i",
        "Gi·∫£i th∆∞·ªüng",
        "Website",
        "collaborated_singers",
    ):
        singer.pop(field, None)

In [None]:
# Write the records back to disk as UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters readable in the file.
with open('../raw_data/singers.json', 'w', encoding='utf-8') as out_file:
    json.dump(
        singers,
        out_file,
        ensure_ascii=False,
        indent=4,
    )