In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime
import time
import csv

class VnExpressCrawler:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.base_url = "https://vnexpress.net"
    
    def get_political_news_links(self, limit=5, timeout=10):
        """Return up to ``limit`` article URLs from the politics listing page.

        The listing is searched first for ``<article class="item-news">``
        elements; if that yields nothing, ``<h3 class="title-news">`` elements
        are tried instead.

        Args:
            limit: Maximum number of links to return. Zero or a negative
                value yields an empty list.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A list of absolute URLs without duplicates, in page order.
            An empty list is returned (and the error printed) if the request
            or the parsing fails.
        """
        try:
            if limit <= 0:
                return []

            url = "https://vnexpress.net/thoi-su/chinh-tri"
            # Without a timeout an unresponsive server would block forever.
            response = requests.get(url, headers=self.headers, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Preferred selector first, then the older markup as a fallback.
            article_links = self._collect_links(
                soup.find_all('article', class_='item-news'), limit)
            if not article_links:
                article_links = self._collect_links(
                    soup.find_all('h3', class_='title-news'), limit)
            return article_links

        except Exception as e:
            # Best-effort: report the problem and let the caller see "no links".
            print(f"L·ªói khi l·∫•y danh s√°ch b√†i b√°o: {e}")
            return []

    def _collect_links(self, containers, limit):
        """Return up to ``limit`` unique absolute URLs, one per container's first link."""
        from urllib.parse import urljoin

        links = []
        for container in containers:
            link_tag = container.find('a', href=True)
            if not link_tag or not link_tag['href']:
                continue
            # urljoin leaves absolute URLs alone and resolves relative,
            # protocol-relative and slash-less paths against the site root.
            full_url = urljoin(self.base_url, link_tag['href'])
            if full_url not in links:
                links.append(full_url)
            if len(links) >= limit:
                break
        return links
    
    def extract_article_info(self, url, timeout=10):
        """Download one article page and extract its main fields.

        Args:
            url: URL of the article page.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A dict with the keys ``url``, ``title``, ``summary``,
            ``publish_time``, ``content_preview``, ``mentioned_entities`` and
            ``crawl_time`` (local time formatted ``YYYY-MM-DD HH:MM:SS``).
            A text field is an empty string when its element is not found.
            ``None`` is returned (and the error printed) if the request or
            the parsing fails.
        """
        try:
            # Without a timeout an unresponsive server would block forever.
            response = requests.get(url, headers=self.headers, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            title = self._text_of(soup.find('h1', class_='title-detail'))
            summary = self._text_of(soup.find('p', class_='description'))
            publish_time = self._text_of(soup.find('span', class_='date'))

            # Main content: only the first three body paragraphs are kept.
            content = ""
            content_div = soup.find('article', class_='fck_detail')
            if content_div:
                paragraphs = content_div.find_all('p', class_='Normal')
                content = ' '.join(p.get_text(strip=True) for p in paragraphs[:3])

            # Entities (people, organizations, places) are searched for in the
            # title, summary and content preview together.
            mentioned_entities = self.extract_entities(title + " " + summary + " " + content)
            # A 'positions' entry, if one is ever present, is not reported.
            mentioned_entities.pop('positions', None)

            return {
                'url': url,
                'title': title,
                'summary': summary,
                'publish_time': publish_time,
                'content_preview': content,
                'mentioned_entities': mentioned_entities,
                'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }

        except Exception as e:
            # Best-effort: report the problem and signal failure with None.
            print(f"L·ªói khi crawl b√†i b√°o {url}: {e}")
            return None

    @staticmethod
    def _text_of(tag):
        """Return the stripped text of ``tag``, or an empty string if it is missing."""
        return tag.get_text(strip=True) if tag else ""
    
    def extract_entities(self, text):
        """Find leaders, organizations and locations mentioned in ``text``.

        Matching is purely regex-based: a leader is a known title followed by
        a capitalised name, an organization or location is either one of a
        fixed list of names or a keyword followed by a capitalised phrase.

        Args:
            text: The text to search.

        Returns:
            A dict with the keys ``leaders``, ``organizations`` and
            ``locations``. Each value is a list of matched strings with
            surrounding whitespace removed and duplicates dropped, in order
            of first appearance.
        """
        # Character classes for the first letter and the remaining letters of
        # a capitalised word. Every pattern below is built from these two
        # strings so that all of them accept exactly the same letters.
        upper = 'A-Z√Ä√Å·∫¢√É·∫†ƒÇ·∫Æ·∫∞·∫≤·∫¥·∫∂√Ç·∫§·∫¶·∫®·∫™·∫¨ƒê√à√â·∫∫·∫º·∫∏√ä·∫æ·ªÄ·ªÇ·ªÑ·ªÜ√å√ç·ªàƒ®·ªä√í√ì·ªé√ï·ªå√î·ªê·ªí·ªî·ªñ·ªò∆†·ªö·ªú·ªû·ª†·ª¢√ô√ö·ª¶≈®·ª§∆Ø·ª®·ª™·ª¨·ªÆ·ª∞'
        lower = 'a-z√†√°·∫£√£·∫°ƒÉ·∫Ø·∫±·∫≥·∫µ·∫∑√¢·∫•·∫ß·∫©·∫´·∫≠ƒë√®√©·∫ª·∫Ω·∫π√™·∫ø·ªÅ·ªÉ·ªÖ·ªá√¨√≠·ªâƒ©·ªã√≤√≥·ªè√µ·ªç√¥·ªë·ªì·ªï·ªó·ªô∆°·ªõ·ªù·ªü·ª°·ª£√π√∫·ªß≈©·ª•∆∞·ª©·ª´·ª≠·ªØ·ª±'

        # A two-word capitalised name, captured as one group.
        two_word_name = rf'([{upper}][{lower}]+ [{upper}][{lower}]+)'
        # A capitalised letter followed by lower-case letters and whitespace,
        # i.e. possibly several words, captured as one group.
        phrase = rf'([{upper}][{lower}\s]+)'

        # Titles that are followed by a two-word name.
        leader_titles = [
            r'T·ªïng [Bb]√≠ th∆∞',
            r'Ch·ªß t·ªãch n∆∞·ªõc',
            r'Th·ªß t∆∞·ªõng',
            r'B·ªô tr∆∞·ªüng',
            r'ƒê·∫°i bi·ªÉu',
        ]
        leader_patterns = [rf'{title} {two_word_name}' for title in leader_titles]
        # This title is followed by a name written with unaccented letters,
        # which may span several words.
        leader_patterns.append(rf'T·ªïng th·ªëng ([{upper}][a-zA-Z\s]+)')

        org_patterns = [
            r'(Qu·ªëc h·ªôi|Ch√≠nh ph·ªß|H·ªôi ƒë·ªìng Qu·ªëc gia|B·ªô Ch√≠nh tr·ªã|Ban Ch·∫•p h√†nh Trung ∆∞∆°ng|ASEAN|ƒê·∫£ng C·ªông s·∫£n Vi·ªát Nam)',
            # NOTE(review): only the phrase after the keyword is captured, so
            # the keyword itself is not part of the reported name; confirm
            # that this is intended.
            rf'B·ªô {phrase}',
        ]

        location_patterns = [
            r'(Vi·ªát Nam|H√† N·ªôi|TP HCM|Qu·∫£ng Ng√£i|Hungary|Ph√°p|Malaysia|ƒê√¥ng Nam √Å)',
            rf'(t·ªânh|th√†nh ph·ªë) {phrase}',
        ]

        def find_all(patterns):
            """Return every match of ``patterns`` in ``text`` as a stripped string."""
            found = []
            for pattern in patterns:
                for match in re.findall(pattern, text):
                    # findall yields a tuple when the pattern has several
                    # groups; those are joined back into one string.
                    if isinstance(match, tuple):
                        match = ' '.join(match)
                    found.append(match.strip())
            return found

        groups = {
            'leaders': leader_patterns,
            'organizations': org_patterns,
            'locations': location_patterns,
        }
        # dict.fromkeys drops duplicates while keeping first-seen order.
        return {
            key: list(dict.fromkeys(find_all(patterns)))
            for key, patterns in groups.items()
        }
    
    def crawl_news(self, limit=5):
        """Crawl tin t·ª©c ch√≠nh tr·ªã t·ª´ VnExpress"""
        print(f"B·∫Øt ƒë·∫ßu crawl {limit} b√†i b√°o ch√≠nh tr·ªã t·ª´ VnExpress...")
        
        # L·∫•y danh s√°ch link
        article_links = self.get_political_news_links(limit)
        
        if not article_links:
            print("Kh√¥ng t√¨m th·∫•y b√†i b√°o n√†o!")
            return []
        
        print(f"T√¨m th·∫•y {len(article_links)} b√†i b√°o. B·∫Øt ƒë·∫ßu crawl...")
        
        articles_data = []
        for i, link in enumerate(article_links, 1):
            print(f"ƒêang crawl b√†i {i}/{len(article_links)}: {link}")
            article_data = self.extract_article_info(link)
            
            if article_data:
                articles_data.append(article_data)
                print(f"‚úì Crawl th√†nh c√¥ng: {article_data['title'][:50]}...")
            else:
                print(f"‚úó L·ªói khi crawl b√†i b√°o")
            
            # Ngh·ªâ 1 gi√¢y gi·ªØa c√°c request ƒë·ªÉ tr√°nh b·ªã block
            time.sleep(1)
        
        return articles_data
    
    
    def save_to_csv(self, data, filename="vnexpress_political_news.csv"):
        """Write the crawled articles to a UTF-8 CSV file.

        Args:
            data: Iterable of article dicts. Missing keys are written as
                empty cells; the entity lists are joined with ``", "``.
            filename: Path of the CSV file to create or overwrite.

        Any error raised while writing is printed rather than propagated.
        """
        columns = [
            'url', 'title', 'summary', 'publish_time', 'content_preview', 'leaders', 'organizations', 'locations', 'crawl_time'
        ]
        # These columns come from the nested 'mentioned_entities' dict.
        entity_columns = ('leaders', 'organizations', 'locations')

        def to_row(article):
            """Flatten one article dict into a list of cells in column order."""
            entities = article.get('mentioned_entities', {})
            cells = []
            for column in columns:
                if column in entity_columns:
                    cells.append(', '.join(entities.get(column, [])))
                else:
                    cells.append(article.get(column, ''))
            return cells

        try:
            with open(filename, 'w', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                # Header row first, then one row per article.
                writer.writerow(columns)
                for article in data:
                    writer.writerow(to_row(article))
            print(f"ƒê√£ l∆∞u d·ªØ li·ªáu v√†o file {filename}")
        except Exception as e:
            print(f"L·ªói khi l∆∞u file CSV: {e}")
    
    def print_summary(self, articles_data):
        """Print a readable report of the crawled articles to stdout.

        For each article the title, URL, publish time and the first 100
        characters of the summary are shown, followed by one line per
        non-empty entity list.

        Args:
            articles_data: List of article dicts as produced by
                ``extract_article_info``.
        """
        banner = "=" * 80
        print()
        print(banner)
        print("T√ìM T·∫ÆT K·∫æT QU·∫¢ CRAWL TIN T·ª®C CH√çNH TR·ªä VNEXPRESS")
        print(banner)

        # Entity key paired with the label printed in front of its values.
        entity_labels = (
            ('leaders', "L√£nh ƒë·∫°o ƒë∆∞·ª£c ƒë·ªÅ c·∫≠p"),
            ('organizations', "T·ªï ch·ª©c ƒë∆∞·ª£c ƒë·ªÅ c·∫≠p"),
            ('locations', "ƒê·ªãa danh ƒë∆∞·ª£c ƒë·ªÅ c·∫≠p"),
        )

        for number, article in enumerate(articles_data, start=1):
            print(f"\nüî∏ B√ÄI {number}:")
            print(f"Ti√™u ƒë·ªÅ: {article['title']}")
            print(f"URL: {article['url']}")
            print(f"Th·ªùi gian: {article['publish_time']}")
            print(f"T√≥m t·∫Øt: {article['summary'][:100]}...")

            entities = article['mentioned_entities']
            for key, label in entity_labels:
                names = entities[key]
                # Empty lists are skipped so no blank entity lines appear.
                if names:
                    print(f"{label}: {', '.join(names)}")

            print("-" * 60)

# Run the crawler when this file is executed as a script.
if __name__ == "__main__":
    crawler = VnExpressCrawler()

    # Crawl the 5 most recent political articles.
    articles = crawler.crawl_news(limit=5)

    if articles:
        # Print a summary of what was crawled.
        crawler.print_summary(articles)

        # Save the results; CSV is the only file that is written.
        crawler.save_to_csv(articles)

        print(f"\n‚úÖ Ho√†n th√†nh crawl {len(articles)} b√†i b√°o!")
        # The message names only the CSV file because no JSON file is produced.
        print("üìÑ D·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o file 'vnexpress_political_news.csv'")
    else:
        print("‚ùå Kh√¥ng crawl ƒë∆∞·ª£c b√†i b√°o n√†o!")

Bắt đầu crawl 5 bài báo chính trị từ VnExpress...
Tìm thấy 5 bài báo. Bắt đầu crawl...
Đang crawl bài 1/5: https://vnexpress.net/ong-nguyen-van-tho-lam-pho-chu-tich-thuong-truc-ubnd-tp-hcm-4910346.html
✓ Crawl thành công: Ông Nguyễn Văn Thọ làm Phó chủ tịch thường trực UB...
Đang crawl bài 2/5: https://vnexpress.net/viet-nam-thai-lan-thuc-day-hop-tac-quoc-phong-thuc-chat-di-vao-chieu-sau-4910310.html
✓ Crawl thành công: Việt Nam - Thái Lan thúc đẩy hợp tác quốc phòng th...
Đang crawl bài 3/5: https://vnexpress.net/tong-bi-thu-yeu-cau-sua-chinh-sach-tien-luong-can-bo-cong-chuc-phu-hop-voi-mo-hinh-moi-4910138.html
✓ Crawl thành công: Tổng Bí thư yêu cầu sửa chính sách tiền lương cán ...
Đang crawl bài 4/5: https://vnexpress.net/cong-bo-6-bai-toan-lon-ve-khoa-hoc-cong-nghe-trong-linh-vuc-quoc-phong-4909909.html
✓ Crawl thành công: Công bố 6 bài toán lớn về khoa học công nghệ trong...
