In [None]:
import pandas as pd
import os
import time
import re
import json
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
import google.generativeai as genai
import networkx as nx

# Load environment variables
load_dotenv()

In [None]:
class APIKeyManager:
    """
    Rotate between several Google API keys when calls keep failing.

    Policy:
    - each key tolerates up to MAX_RETRIES_PER_KEY recorded errors
    - once that limit is reached, the next configured key is activated
    - when no further key exists, ``on_error`` reports exhaustion
    """

    MAX_RETRIES_PER_KEY = 2  # recorded errors allowed per key before rotating

    def __init__(self):
        """Read the configured keys from the environment and activate the first.

        Raises:
            ValueError: if none of the expected environment variables is set.
        """
        env_names = (
            "GOOGLE_API_KEY",
            "GOOGLE_API_KEY_2",
            "GOOGLE_API_KEY_3",
            "GOOGLE_API_KEY_4",
        )
        # Keep (name, value) pairs only for variables that are set and non-empty.
        loaded = [(env_name, os.getenv(env_name)) for env_name in env_names]
        self.keys = [pair for pair in loaded if pair[1]]

        if not self.keys:
            raise ValueError("‚ùå Kh√¥ng t√¨m th·∫•y API key! Ki·ªÉm tra file .env")

        # Rotation state: active position plus one error counter per key name.
        self.current_index = 0
        self.error_counts = dict.fromkeys((env_name for env_name, _ in self.keys), 0)

        print(f"‚úì Ph√°t hi·ªán {len(self.keys)} API keys")
        self._activate_key(0)

    def _activate_key(self, index):
        """Make the key at ``index`` active and configure ``genai`` with it.

        Raises:
            Exception: if ``index`` is past the last configured key.
        """
        total = len(self.keys)
        if index >= total:
            raise Exception("‚ùå ƒê√£ h·∫øt t·∫•t c·∫£ API keys!")

        self.current_index = index
        active_name, active_value = self.keys[index]

        # Point the google.generativeai module at the newly selected key.
        genai.configure(api_key=active_value)

        print(f"üîë ƒêang s·ª≠ d·ª•ng: {active_name} (Key {index + 1}/{total})")

    def get_current_key(self):
        """Return the value of the currently active API key."""
        _, active_value = self.keys[self.current_index]
        return active_value

    def get_models(self):
        """
        Create the chat models bound to the currently active API key.

        Returns:
            tuple: (model, model_more_temp, model_pro) - two
            "gemini-2.0-flash-lite" models at temperature 0.02 and 0.1, and one
            "gemini-2.5-pro-exp-03-25" model at temperature 0.1.
        """
        api_key = self.get_current_key()

        def build(model_name, temperature):
            return ChatGoogleGenerativeAI(
                model=model_name,
                temperature=temperature,
                google_api_key=api_key,
            )

        model = build("gemini-2.0-flash-lite", 0.02)
        model_more_temp = build("gemini-2.0-flash-lite", 0.1)
        model_pro = build("gemini-2.5-pro-exp-03-25", 0.1)
        return model, model_more_temp, model_pro

    def on_error(self):
        """
        Record one API failure for the active key and rotate if it is used up.

        Returns:
            bool: True if the caller may keep going (the active key still has
                  retries left, or the next key was activated), False if the
                  last key has reached its error limit.
        """
        active_name = self.keys[self.current_index][0]
        self.error_counts[active_name] += 1
        failures = self.error_counts[active_name]
        print(f"‚ö† L·ªói l·∫ßn {failures} v·ªõi {active_name}")

        # Below the limit: stay on the same key.
        if failures < self.MAX_RETRIES_PER_KEY:
            return True

        print(f"‚õî {active_name} ƒë√£ l·ªói {failures}/{self.MAX_RETRIES_PER_KEY} l·∫ßn")

        following = self.current_index + 1
        if following >= len(self.keys):
            print("‚ùå ƒê√É H·∫æT T·∫§T C·∫¢ API KEYS!")
            return False

        print("üîÑ Chuy·ªÉn sang key ti·∫øp theo...")
        self._activate_key(following)
        return True

    def reset_error_count(self):
        """Zero the error counter of the active key (called after a success)."""
        active_name = self.keys[self.current_index][0]
        self.error_counts[active_name] = 0

# Create the shared API key manager used by the cells below.
api_manager = APIKeyManager()

# Report the number of keys actually loaded; unset keys are dropped by the
# manager, so the count is not necessarily 4.
print(f"‚úì ƒê√£ kh·ªüi t·∫°o APIKeyManager v·ªõi {len(api_manager.keys)} API keys")

In [None]:
# ============================================================
# PROMPT TEMPLATES - Entity and relationship extraction
# ============================================================

# Template for extracting entities from news articles
entity_extraction_template = PromptTemplate.from_template("""B·∫°n ƒëang l√†m vi·ªác d∆∞·ªõi b·ªëi c·∫£nh ph√¢n t√≠ch kinh t·∫ø. 
B·∫°n ƒë∆∞·ª£c cho m·ªôt ho·∫∑c nhi·ªÅu b√†i b√°o, bao g·ªìm t·ª±a ƒë·ªÅ v√† m√¥ t·∫£ ng·∫Øn g·ªçn v·ªÅ b√†i b√°o ƒë√≥, ngo√†i ra b·∫°n c√≥
th√¥ng tin v·ªÅ ng√†y xu·∫•t b·∫£n c·ªßa b√†i b√°o, v√† lo·∫°i ch·ªß ƒë·ªÅ m√† b√†i b√°o ƒëang ƒë·ªÅ c·∫≠p t·ªõi.

H·∫°n ch·∫ø t·∫°o m·ªõi m·ªôt th·ª±c th·ªÉ, ch·ªâ t·∫°o li√™n k·∫øt t·ªõi 5 th·ª±c th·ªÉ. Lu√¥n ∆∞u ti√™n li√™n k·∫øt v·ªõi c√°c th·ª±c th·ªÉ ƒë√£ c√≥: {existing_entities}

B·∫°n c·∫ßn ph√¢n t√≠ch b√†i b√°o, ƒë∆∞a ra t√™n c·ªßa nh·ªØng th·ª±c th·ªÉ (v√≠ d·ª• nh∆∞ c·ªï phi·∫øu, ng√†nh ngh·ªÅ, c√¥ng ty, qu·ªëc gia, t·ªânh th√†nh...)
s·∫Ω b·ªã ·∫£nh h∆∞·ªüng tr·ª±c ti·∫øp b·ªüi th√¥ng tin c·ªßa b√†i b√°o, theo h∆∞·ªõng t√≠ch c·ª±c ho·∫∑c ti√™u c·ª±c.

V·ªõi m·ªói th·ª±c th·ªÉ, ·ªü ph·∫ßn T√™n th·ª±c th·ªÉ, h·∫°n ch·∫ø d√πng d·∫•u ch·∫•m, g·∫°ch ngang, d·∫•u v√† &, d·∫•u ch·∫•m ph·∫©y ;. V√† c·∫ßn ghi th√™m qu·ªëc gia, ƒë·ªãa ph∆∞∆°ng c·ª• th·ªÉ v√† ng√†nh ngh·ªÅ c·ªßa n√≥ (n·∫øu c√≥).
T√™n ch·ªâ n√≥i t·ªõi m·ªôt th·ª±c th·ªÉ duy nh·∫•t. Ph·∫ßn T√™n kh√¥ng ƒë∆∞·ª£c qu√° ph·ª©c t·∫°p, ƒë∆°n gi·∫£n nh·∫•t c√≥ th·ªÉ.
N·∫øu th·ª±c th·ªÉ n√†o thu·ªôc danh m·ª•c c·ªï phi·∫øu sau: {portfolio}, h√£y ghi r√µ t√™n c·ªï phi·∫øu.
V√≠ d·ª•: SSI-Ch·ª©ng kho√°n; Ng√†nh c√¥ng nghi·ªáp Vi·ªát Nam; Ng∆∞·ªùi d√πng M·ªπ; Ng√†nh th√©p Ch√¢u √Å; Ng√†nh du l·ªãch H·∫° Long, ...

Ghi nh·ªõ, H·∫°n ch·∫ø t·∫°o m·ªõi m·ªôt th·ª±c th·ªÉ, ch·ªâ t·∫°o li√™n k·∫øt t·ªõi 5 th·ª±c th·ªÉ. Lu√¥n c·ªë li√™n k·∫øt v·ªõi c√°c th·ª±c th·ªÉ ƒë√£ c√≥.

Ph·∫ßn gi·∫£i th√≠ch m·ªói th·ª±c th·ªÉ, b·∫Øt bu·ªôc ƒë√°nh gi√° s·ªë li·ªáu ƒë∆∞·ª£c ghi, nhi·ªÅu ho·∫∑c √≠t, tƒÉng ho·∫∑c gi·∫£m, g·∫•p bao nhi√™u l·∫ßn, ...
C·∫ßn c·ªë g·∫Øng li√™n k·∫øt v·ªõi nhi·ªÅu th·ª±c th·ªÉ kh√°c. Tuy nhi√™n kh√¥ng suy ngo√†i ph·∫°m vi b√†i b√°o. Kh√¥ng t·ª± ch√®n s·ªë li·ªáu ngo√†i b√†i b√°o.
Kh√¥ng d√πng d·∫•u hai ch·∫•m trong ph·∫ßn gi·∫£i th√≠ch, ch·ªâ d√πng hai ch·∫•m : ƒë·ªÉ t√°ch gi·ªØa T√™n th·ª±c th·ªÉ v√† ph·∫ßn gi·∫£i th√≠ch.
                                                          
ƒê∆∞a ra theo ƒë·ªãnh d·∫°ng sau:
[[POSITIVE]]
[Entity 1]: [Explanation]
...
[Entity N]: [Explanation]

[[NEGATIVE]]
[Entity A]: [Explanation]
..
[Entity Z]: [Explanation]
                                                          
M·ªôt v√≠ d·ª• cho b√†i b√°o:

(B·∫ÆT ƒê·∫¶U V√ç D·ª§)

Ng√†y ƒëƒÉng: 2025-04-07T22:51:00+07:00
Lo·∫°i ch·ªß ƒë·ªÅ: Kinh t·∫ø
T·ª±a ƒë·ªÅ: N·ªó l·ª±c hi·ªán th·ª±c h√≥a m·ª•c ti√™u th√¥ng tuy·∫øn cao t·ªëc t·ª´ Cao B·∫±ng ƒë·∫øn C√† Mau 

M√¥ t·∫£: Nh·∫±m ho√†n th√†nh m·ª•c ti√™u ƒë·∫øn nƒÉm 2025 c·∫£ n∆∞·ªõc c√≥ tr√™n 3.000 km ƒë∆∞·ªùng cao t·ªëc, B·ªô X√¢y d·ª±ng, c√°c ƒë·ªãa ph∆∞∆°ng v√† doanh nghi·ªáp ƒëang tri·ªÉn khai thi c√¥ng 28 d·ª± √°n/d·ª± √°n th√†nh ph·∫ßn v·ªõi t·ªïng chi·ªÅu d√†i kho·∫£ng 1.188 km. 
ƒê·∫øn nay, ti·∫øn ƒë·ªô ƒëa s·ªë c√°c d·ª± √°n b√°m s√°t k·∫ø ho·∫°ch, nhi·ªÅu d·ª± √°n ƒëƒÉng k√Ω ho√†n th√†nh th√¥ng tuy·∫øn trong nƒÉm 2025. C√≥ th·ªÉ n√≥i ng√†nh giao th√¥ng v·∫≠n t·∫£i ƒëang c·ªë g·∫Øng h·∫øt s·ª©c.

Danh s√°ch th·ª±c th·ªÉ s·∫Ω b·ªã ·∫£nh h∆∞·ªüng:

[[POSITIVE]]
B·ªô X√¢y d·ª±ng Vi·ªát Nam: √Åp l·ª±c qu·∫£n l√Ω 28 d·ª± √°n v·ªõi t·ªïng chi·ªÅu d√†i 1188 km, nh·∫±m hi·ªán th·ª±c h√≥a m·ª•c ti√™u ƒë·∫°t 3000 km cao t·ªëc v√†o nƒÉm 2025. S·ªë l∆∞·ª£ng d·ª± √°n tƒÉng g·∫•p nhi·ªÅu l·∫ßn so v·ªõi giai ƒëo·∫°n tr∆∞·ªõc, ƒë√≤i h·ªèi ƒëi·ªÅu ph·ªëi ngu·ªìn l·ª±c v√† ki·ªÉm so√°t ti·∫øn ƒë·ªô ch·∫∑t ch·∫Ω h∆°n.
Ch√≠nh quy·ªÅn ƒë·ªãa ph∆∞∆°ng Vi·ªát Nam: Tr·ª±c ti·∫øp ph·ªëi h·ª£p tri·ªÉn khai c√°c d·ª± √°n t·∫°i t·ª´ng t·ªânh th√†nh. C·∫ßn n√¢ng cao nƒÉng l·ª±c qu·∫£n l√Ω v√† s·ª≠ d·ª•ng ng√¢n s√°ch c√¥ng hi·ªáu qu·∫£ ƒë·ªÉ ƒë·∫£m b·∫£o ti·∫øn ƒë·ªô thi c√¥ng theo k·∫ø ho·∫°ch chung qu·ªëc gia.
Doanh nghi·ªáp x√¢y d·ª±ng Vi·ªát Nam: ƒê∆∞·ª£c h∆∞·ªüng l·ª£i tr·ª±c ti·∫øp khi nh·∫≠n kh·ªëi l∆∞·ª£ng h·ª£p ƒë·ªìng thi c√¥ng l·ªõn. Doanh thu v√† nƒÉng l·ª±c thi c√¥ng c√≥ th·ªÉ tƒÉng nhanh h∆°n so v·ªõi c√°c giai ƒëo·∫°n tr∆∞·ªõc ƒë√¢y, nh·ªù nhu c·∫ßu ƒë·∫ßu t∆∞ h·∫° t·∫ßng tƒÉng m·∫°nh.

[[NEGATIVE]]
B·ªô X√¢y d·ª±ng Vi·ªát Nam: R·ªßi ro ch·∫≠m ti·∫øn ƒë·ªô v√† ƒë·ªôi v·ªën n·∫øu ƒëi·ªÅu ph·ªëi kh√¥ng hi·ªáu qu·∫£ do s·ªë l∆∞·ª£ng d·ª± √°n tƒÉng g·∫•p nhi·ªÅu l·∫ßn.
Ch√≠nh quy·ªÅn ƒë·ªãa ph∆∞∆°ng Vi·ªát Nam: C√≥ th·ªÉ g·∫∑p kh√≥ khƒÉn trong gi·∫£i ph√≥ng m·∫∑t b·∫±ng v√† qu·∫£n l√Ω v·ªën ƒë·∫ßu t∆∞ n·∫øu nƒÉng l·ª±c t·ªï ch·ª©c y·∫øu.

(K·∫æT TH√öC V√ç D·ª§)

Ng√†y ƒëƒÉng: {date}
Lo·∫°i ch·ªß ƒë·ªÅ: {group}
T·ª±a ƒë·ªÅ: {title}

M√¥ t·∫£: {description}


Danh s√°ch th·ª±c th·ªÉ s·∫Ω b·ªã ·∫£nh h∆∞·ªüng:
""")

# Progress marker: confirms this cell ran and the template is defined.
print("‚úì ƒê√£ ƒë·ªãnh nghƒ©a entity_extraction_template")

In [None]:
# Tickers tracked in the portfolio. PORTFOLIO_SECTOR is index-aligned with this
# list (the two are zipped together when the prompt's portfolio string is built).
PORTFOLIO_STOCKS = ["FPT", "SSI", "VCB", "VHM", "HPG", "GAS", "MSN", "MWG", "GVR", "VIC"]
# Sector label for each ticker above, in the same order. The last label (VIC)
# previously carried a spelling typo; it now matches the label used for VHM.
PORTFOLIO_SECTOR = ["C√¥ng ngh·ªá", "Ch·ª©ng kho√°n", "Ng√¢n h√†ng", "B·∫•t ƒë·ªông s·∫£n", "V·∫≠t li·ªáu c∆° b·∫£n", 
                     "D·ªãch v·ª• H·∫° t·∫ßng", "Ti√™u d√πng c∆° b·∫£n", "B√°n l·∫ª", "Ch·∫ø bi·∫øn", "B·∫•t ƒë·ªông s·∫£n"]
# Base wait, in seconds, between retries (passed to time.sleep).
BASE_DELAY = 30
# Maximum number of re-prompts when a reply contains no entities.
MAX_RETRIES = 3

def create_chains(api_manager):
    """
    Build the entity-extraction chain from the manager's current models.

    Args:
        api_manager: object whose ``get_models()`` returns a 3-tuple of models.

    Returns:
        ``entity_extraction_template`` piped into the first model of that tuple.
    """
    # Only the first (lowest-temperature) model is used for entity extraction.
    base_model, _more_temp_model, _pro_model = api_manager.get_models()
    return entity_extraction_template | base_model

# Build the module-level entity-extraction chain with the currently active key.
chain_entity = create_chains(api_manager)

In [None]:
def invoke_chain_with_retry(chain, prompt, api_manager, base_delay=None):
    """
    Invoke ``chain`` with ``prompt``, retrying and rotating API keys on failure.

    Args:
        chain: object exposing ``invoke(prompt)``.
        prompt: input passed unchanged to ``chain.invoke``.
        api_manager: key manager exposing ``keys``, ``current_index``,
            ``error_counts``, ``MAX_RETRIES_PER_KEY``, ``on_error()`` and
            ``reset_error_count()``.
        base_delay: base wait in seconds between attempts. ``None`` (default)
            means "use the module-level BASE_DELAY", looked up at call time.

    Returns:
        The chain's response, or ``None`` once the manager reports that every
        key is exhausted or the overall attempt budget
        (number of keys * MAX_RETRIES_PER_KEY) is used up.

    NOTE(review): ``chain`` is used as given on every attempt. If its model was
    constructed with a fixed API key, rotating keys inside ``api_manager`` does
    not change the key this chain sends - confirm and rebuild the chain in the
    caller if needed.
    """
    if base_delay is None:
        base_delay = BASE_DELAY

    total_attempts = 0
    max_total_attempts = len(api_manager.keys) * api_manager.MAX_RETRIES_PER_KEY

    while total_attempts < max_total_attempts:
        try:
            response = chain.invoke(prompt)
        except Exception as e:
            total_attempts += 1
            error_msg = str(e)

            # on_error() returns "may we continue?", not "did we switch?", so a
            # switch is detected by comparing the active index before and after.
            index_before = api_manager.current_index
            can_continue = api_manager.on_error()

            if not can_continue or total_attempts >= max_total_attempts:
                print(f"‚ùå ƒê√£ th·ª≠ t·∫•t c·∫£ {len(api_manager.keys)} API keys ({total_attempts} l·∫ßn) nh∆∞ng v·∫´n l·ªói")
                print(f"   L·ªói cu·ªëi: {error_msg}")
                return None

            if api_manager.current_index != index_before:
                # A fresh key was activated: wait the plain base delay.
                delay = base_delay
                print(f"‚è≥ ƒê·ª£i {delay}s tr∆∞·ªõc khi th·ª≠ key m·ªõi...")
            else:
                # Same key: back off according to its error count. The counts
                # are keyed by key *name*, not by index.
                key_name = api_manager.keys[api_manager.current_index][0]
                retry_num = api_manager.error_counts.get(key_name, 0)
                delay = base_delay * (1.5 ** max(retry_num - 1, 0))
                print(f"‚è≥ ƒê·ª£i {delay:.0f}s tr∆∞·ªõc khi retry ({retry_num}/{api_manager.MAX_RETRIES_PER_KEY})...")

            time.sleep(delay)
        else:
            # Success: clear the active key's error counter.
            api_manager.reset_error_count()
            return response

    return None

def parse_entity_response(response):
    """
    Parse the model reply of the entity-extraction prompt into two sections.

    Lines are read after a ``[[POSITIVE]]`` or ``[[NEGATIVE]]`` header
    (matched case-insensitively). Each such line containing a colon is split at
    the first colon into an entity name and its explanation; lines without a
    colon, or appearing before any header, are ignored.

    Args:
        response: object with a ``content`` attribute, or ``None``.

    Returns:
        dict: {"POSITIVE": [(entity, explanation), ...],
               "NEGATIVE": [(entity, explanation), ...]}
    """
    sections = {"POSITIVE": [], "NEGATIVE": []}
    if response is None:
        print("Response is None")
        return sections

    current_section = None
    for raw_line in str(response.content).splitlines():
        line = raw_line.strip()
        if not line:
            continue

        header = line.upper()
        if "[[POSITIVE]]" in header:
            current_section = "POSITIVE"
            continue
        if "[[NEGATIVE]]" in header:
            current_section = "NEGATIVE"
            continue

        if current_section is None or ":" not in line:
            continue

        # Split at the first colon only; partition() also copes with whitespace
        # before the colon ("Name : text"), which previously left ": " at the
        # start of the explanation.
        name_part, _, rest = line.partition(":")
        entity = name_part.strip()

        # Skip empty names and the model's "no entity" placeholder.
        if not entity or "kh√¥ng c√≥ th·ª±c th·ªÉ n√†o" in entity.lower():
            continue

        # Surrounding colons are trimmed, as before.
        content = rest.strip().strip(":").strip()
        sections[current_section].append((entity, content))

    return sections

def merge_entity(entity, canonical_set):
    """
    Map ``entity`` onto its canonical form, registering it if it is new.

    The name is trimmed, bracket pairs wrapping the whole name are removed
    ("[[FPT]]" -> "FPT"), stray unbalanced brackets at either end are dropped,
    and the result is lower-cased. Brackets inside or at the end of a longer
    name ("Hoa Phat [HPG]") are kept intact.

    Args:
        entity: raw entity name (converted with ``str``).
        canonical_set: set of known canonical names; mutated when the entity is new.

    Returns:
        The existing member of ``canonical_set`` that equals the normalized name
        case-insensitively, otherwise the normalized name (which is then added).
    """
    text = str(entity).strip()

    # Peel off bracket pairs that enclose the entire name.
    while len(text) >= 2 and text[0] == "[" and text[-1] == "]":
        text = text[1:-1].strip()

    # Drop a stray opening/closing bracket that has no counterpart.
    if "]" not in text:
        text = text.lstrip("[ ")
    if "[" not in text:
        text = text.rstrip("] ")

    normalized_entity = text.lower()

    # Fast path: names added by this function are already lower-case.
    if normalized_entity in canonical_set:
        return normalized_entity

    # Fallback for members that were stored with different casing.
    for exist in canonical_set:
        if exist.lower() == normalized_entity:
            return exist

    canonical_set.add(normalized_entity)
    return normalized_entity

def graph_entities_to_str(G, max_entities=50):
    """
    Render the graph's non-article node names as one comma-separated string.

    Nodes whose name starts with "Article_" are skipped, and only the first
    ``max_entities`` remaining names are kept so the prompt stays short. A
    fixed placeholder sentence is returned when nothing is left.
    """
    names = []
    for node in G.nodes():
        if node.startswith("Article_"):
            continue
        names.append(node)

    # Cap the list length to bound the prompt size.
    names = names[:max_entities]
    if not names:
        return "Ch∆∞a c√≥ th·ª±c th·ªÉ n√†o"
    return ", ".join(names)

# Progress marker: confirms the helper functions in this cell are defined.
print("‚úì ƒê√£ ƒë·ªãnh nghƒ©a c√°c h√†m ti·ªán √≠ch")

In [None]:
# ============================================================
# MAIN FUNCTION - Extract entities from news articles
# ============================================================

def extract_entities_from_news(
    csv_path="summarized_news_with_stocks.csv",
    output_path="entities_extracted.csv",
    start_date=None,
    end_date=None,
    max_articles=None
):
    """
    Extract impacted entities from summarized news and build an article graph.

    Each article is sent to the entity-extraction prompt; every returned entity
    becomes a graph node linked from the article node, and one result row.

    Parameters:
    -----------
    csv_path : str
        Path to the CSV of summarized news. The code reads the columns
        ``date``, ``title``, ``description`` and, if present, ``stockCodes``.
    output_path : str
        Path of the CSV file the result rows are written to.
    start_date : str, optional
        First date to include, inclusive (format: YYYY-MM-DD).
    end_date : str, optional
        Last date to include, inclusive (format: YYYY-MM-DD).
    max_articles : int, optional
        Keep only the first N rows (in file order, after date filtering);
        meant for test runs.

    Returns:
    --------
    tuple: (entities_df, graph, canonical_entities)
        - entities_df: DataFrame with one row per extracted (article, entity, impact)
        - graph: NetworkX DiGraph with article -> entity edges
        - canonical_entities: set of canonical entity names

    Notes:
    ------
    The graph is a DiGraph, so when an article lists the same entity as both
    POSITIVE and NEGATIVE only the first impact is stored on the edge;
    ``entities_df`` keeps both rows.
    """
    print(f"üìñ ƒêang ƒë·ªçc d·ªØ li·ªáu t·ª´ {csv_path}...")

    df = pd.read_csv(csv_path)
    print(f"‚úì ƒê√£ ƒë·ªçc {len(df)} tin t·ª©c")

    # Parse timestamps once; the date-only column is used for range filtering.
    df['parsed_date'] = pd.to_datetime(df['date'])
    df['only_date'] = df['parsed_date'].dt.date

    if start_date:
        start_dt = pd.to_datetime(start_date).date()
        df = df[df['only_date'] >= start_dt]
        print(f"‚úì L·ªçc t·ª´ ng√†y {start_date}: c√≤n {len(df)} tin")

    if end_date:
        end_dt = pd.to_datetime(end_date).date()
        df = df[df['only_date'] <= end_dt]
        print(f"‚úì L·ªçc ƒë·∫øn ng√†y {end_date}: c√≤n {len(df)} tin")

    if max_articles:
        df = df.head(max_articles)
        print(f"‚úì Gi·ªõi h·∫°n xu·ªëng {len(df)} tin ƒë·ªÉ x·ª≠ l√Ω")

    # Process chronologically; sort on the parsed timestamps rather than on
    # the raw strings so differently formatted dates still order correctly.
    df = df.sort_values('parsed_date')

    G = nx.DiGraph()
    canonical_entities = set()

    # "TICKER-Sector" pairs injected into the prompt.
    portfolio_str_full = ", ".join(
        f"{stock}-{sector}" for stock, sector in zip(PORTFOLIO_STOCKS, PORTFOLIO_SECTOR)
    )

    # Match tickers as whole words only, so that e.g. "vic" does not fire on
    # "service". Compiled once outside the loops.
    stock_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(stock) for stock in PORTFOLIO_STOCKS) + r")\b",
        re.IGNORECASE,
    )

    # Build the chain from the currently active key and remember which key it
    # belongs to, so it can be rebuilt after the manager rotates keys.
    # NOTE(review): this assumes the chat model keeps the API key it was
    # constructed with - confirm against the langchain-google-genai docs.
    chain = create_chains(api_manager)
    chain_key_index = api_manager.current_index

    result_columns = [
        "article_id", "article_title", "date",
        "entity", "entity_type", "impact", "explanation",
    ]
    all_entities = []
    total_articles = len(df)

    print(f"\n{'='*60}")
    print(f"üîç B·∫ÆT ƒê·∫¶U TR√çCH XU·∫§T TH·ª∞C TH·ªÇ")
    print(f"{'='*60}\n")

    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        # article_idx is the stable ID (original row label + 1); ``position``
        # is only the progress counter within this filtered/sorted run.
        article_idx = idx + 1
        article_node = f"Article_{article_idx}: {row['title']}"
        article_timestamp = row['parsed_date']

        if not G.has_node(article_node):
            G.add_node(article_node, type="article", timestamp=article_timestamp)

        # str() guards against a missing (NaN) title, which is not sliceable.
        print(f"[{position}/{total_articles}] üì∞ {str(row['title'])[:60]}...")

        # Topic group: the article's stock codes when present, else "Chung".
        # An empty CSV cell is read as NaN, which is truthy, so test it explicitly.
        stock_codes = row.get('stockCodes')
        group = stock_codes if pd.notna(stock_codes) and stock_codes else 'Chung'

        entities_dict = {"POSITIVE": [], "NEGATIVE": []}
        total_entities = 0
        entity_retry_count = 0

        while entity_retry_count < MAX_RETRIES:
            # Pick up a key rotation that happened during an earlier call.
            if api_manager.current_index != chain_key_index:
                chain = create_chains(api_manager)
                chain_key_index = api_manager.current_index

            prompt_text = {
                "portfolio": portfolio_str_full,
                "date": row['date'],
                "group": group,
                "title": row['title'],
                "description": row['description'],
                "existing_entities": graph_entities_to_str(G)
            }

            response_text = invoke_chain_with_retry(chain, prompt_text, api_manager)
            time.sleep(1)  # short pause between API calls

            if response_text is None:
                print(f"   ‚ùå B·ªè qua tin {article_idx} do l·ªói API")
                break

            entities_dict = parse_entity_response(response_text)
            total_entities = (
                len(entities_dict.get("POSITIVE", []))
                + len(entities_dict.get("NEGATIVE", []))
            )
            if total_entities > 0:
                print(f"   ‚úì Tr√≠ch xu·∫•t ƒë∆∞·ª£c {total_entities} th·ª±c th·ªÉ")
                break

            entity_retry_count += 1
            print(f"   ‚ö† Kh√¥ng c√≥ th·ª±c th·ªÉ. Th·ª≠ l·∫°i {entity_retry_count}/{MAX_RETRIES}")
            # No point waiting after the final attempt.
            if entity_retry_count < MAX_RETRIES:
                time.sleep(BASE_DELAY)

        if entity_retry_count == MAX_RETRIES and total_entities == 0:
            print(f"   ‚ùå Th·∫•t b·∫°i sau {MAX_RETRIES} l·∫ßn th·ª≠")
            continue

        for impact in ["POSITIVE", "NEGATIVE"]:
            for ent, content in entities_dict.get(impact, []):
                # Skip empty names and the model's "no entity" placeholder.
                if not ent or "kh√¥ng c√≥ th·ª±c th·ªÉ n√†o" in ent.lower():
                    continue

                canon_ent = merge_entity(ent, canonical_entities)

                node_type = "stock" if stock_pattern.search(str(canon_ent)) else "entity"

                if not G.has_node(canon_ent):
                    G.add_node(canon_ent, type=node_type, timestamp=article_timestamp)

                if not G.has_edge(article_node, canon_ent):
                    G.add_edge(article_node, canon_ent, impact=impact, timestamp=article_timestamp)

                all_entities.append({
                    "article_id": article_idx,
                    "article_title": row['title'],
                    "date": row['date'],
                    "entity": canon_ent,
                    "entity_type": node_type,
                    "impact": impact,
                    "explanation": content
                })

    # Passing the column list keeps the schema intact even when no entity was
    # extracted, so downstream cells can still index the expected columns.
    entities_df = pd.DataFrame(all_entities, columns=result_columns)

    entities_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"\n{'='*60}")
    print(f"‚úÖ HO√ÄN TH√ÄNH!")
    print(f"üìä T·ªïng s·ªë entities: {len(entities_df)}")
    print(f"üîπ Unique entities: {len(canonical_entities)}")
    print(f"üìà Graph nodes: {len(G.nodes())}")
    print(f"üîó Graph edges: {len(G.edges())}")
    print(f"üíæ ƒê√£ l∆∞u v√†o: {output_path}")
    print(f"{'='*60}")

    return entities_df, G, canonical_entities

# Progress marker: confirms extract_entities_from_news() is defined.
print("‚úì ƒê√£ ƒë·ªãnh nghƒ©a h√†m extract_entities_from_news()")

## 🚀 Chạy trích xuất thực thể

### Hướng dẫn sử dụng:

1. **Test với số lượng nhỏ:** Thử với `max_articles=10` để kiểm tra
2. **Chạy đầy đủ:** Bỏ `max_articles` để xử lý toàn bộ
3. **Lọc theo thời gian:** Dùng `start_date` và `end_date`

### Ví dụ:

In [None]:
# Run the extraction on every article dated from start_date onward.
# NOTE: max_articles is not passed here, so this is a full run; add e.g.
# max_articles=10 for a quick test.
entities_df, G, canonical_entities = extract_entities_from_news(
    csv_path="summarized_news_with_stocks.csv",
    output_path="entities_extracted.csv",
    start_date="2022-09-30",  # first date to include (inclusive)
    end_date=None,             # None = no upper bound
)

## 📊 Phân tích kết quả

Xem các thực thể đã trích xuất:

In [None]:
# Overview of the extraction run: totals, category breakdowns, then a sample.
print("üìä T·ªîNG QUAN K·∫æT QU·∫¢")
print("=" * 60 + "\n")

for summary_line in (
    f"T·ªïng s·ªë entities tr√≠ch xu·∫•t: {len(entities_df)}",
    f"Unique entities: {len(canonical_entities)}",
    f"Graph nodes: {len(G.nodes())}",
    f"Graph edges: {len(G.edges())}",
):
    print(summary_line)

# One value_counts() table per categorical column.
for heading, column in (
    ("\nüìà PH√ÇN LO·∫†I THEO IMPACT:\n", 'impact'),
    ("\nüìå PH√ÇN LO·∫†I THEO ENTITY TYPE:\n", 'entity_type'),
):
    print(heading)
    print(entities_df[column].value_counts())

print("\n" + "=" * 60)
print("M·∫™U 10 ENTITIES ƒê·∫¶U TI√äN:")
print("=" * 60 + "\n")

# Show the first ten extracted rows, with long text fields truncated.
for idx, row in entities_df.head(10).iterrows():
    impact_icon = '‚úÖ' if row['impact'] == 'POSITIVE' else '‚ùå'
    print(f"[{row['article_id']}] üì∞ {row['article_title'][:50]}...")
    print(f"üè∑Ô∏è  Entity: {row['entity']} ({row['entity_type']})")
    print(f"{impact_icon} Impact: {row['impact']}")
    print(f"üìù {row['explanation'][:100]}...")
    print("-" * 60 + "\n")

In [None]:
# Most frequently mentioned entities, each with a positive/negative breakdown.
from collections import Counter

entity_counts = Counter(entities_df['entity'])

print("üî• TOP 15 ENTITIES ƒê∆Ø·ª¢C ƒê·ªÄ C·∫¨P NHI·ªÄU NH·∫§T:\n")
print("=" * 60 + "\n")

# Impact masks do not depend on the entity, so build them once.
is_positive = entities_df['impact'] == 'POSITIVE'
is_negative = entities_df['impact'] == 'NEGATIVE'

for i, (entity, count) in enumerate(entity_counts.most_common(15), 1):
    is_entity = entities_df['entity'] == entity
    pos_count = int((is_entity & is_positive).sum())
    neg_count = int((is_entity & is_negative).sum())

    print(f"{i:2d}. {entity}")
    print(f"    T·ªïng: {count} l·∫ßn | ‚úÖ {pos_count} | ‚ùå {neg_count}")
    print()

## 🔍 Tìm kiếm và phân tích

### Tìm thông tin về một entity cụ thể:

In [None]:
def search_entity(entity_name, entities_df):
    """
    Print and return every extracted row whose entity contains ``entity_name``.

    The match is a case-insensitive *literal* substring test, so characters
    such as ".", "(" or "+" in ``entity_name`` are matched as themselves
    instead of being interpreted as a regular expression.

    Args:
        entity_name: text to look for inside the ``entity`` column.
        entities_df: DataFrame with the columns ``article_id``,
            ``article_title``, ``date``, ``entity``, ``entity_type``,
            ``impact`` and ``explanation``.

    Returns:
        The matching rows as a DataFrame, or ``None`` when nothing matches.
    """
    needle = entity_name.lower()
    # regex=False: treat the query as plain text; na=False: missing names never match.
    mask = entities_df['entity'].str.lower().str.contains(needle, regex=False, na=False)
    results = entities_df[mask]

    if len(results) == 0:
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y entity: {entity_name}")
        return None

    print(f"üîç T√åM TH·∫§Y {len(results)} MENTIONS V·ªÄ '{entity_name.upper()}'\n")
    print(f"{'='*60}\n")

    for idx, row in results.iterrows():
        print(f"[{row['article_id']}] üì∞ {row['article_title']}")
        print(f"üìÖ {row['date']}")
        print(f"üè∑Ô∏è  {row['entity']} ({row['entity_type']})")
        print(f"{'‚úÖ' if row['impact'] == 'POSITIVE' else '‚ùå'} {row['impact']}")
        print(f"üìù {row['explanation']}")
        print(f"{'-'*60}\n")

    return results

# Example: look up everything recorded about FPT
# search_entity("FPT", entities_df)

In [None]:
# Graph analysis: inspect how the entities are connected
def analyze_graph(G):
    """
    Print a summary of the knowledge graph.

    Reports the node and edge totals, the number of nodes per ``type``
    attribute ("unknown" when the attribute is missing), and the ten
    non-article nodes with the highest in-degree. Returns nothing.
    """
    rule = '=' * 60
    print("üìä PH√ÇN T√çCH KNOWLEDGE GRAPH\n")
    print(rule + "\n")

    print(f"T·ªïng s·ªë nodes: {len(G.nodes())}")
    print(f"T·ªïng s·ªë edges: {len(G.edges())}")

    # Single pass over the nodes: tally types and collect non-article nodes.
    type_tally = {}
    non_article_nodes = []
    for node, attrs in G.nodes(data=True):
        kind = attrs.get('type', 'unknown')
        type_tally[kind] = type_tally.get(kind, 0) + 1
        if attrs.get('type') != 'article':
            non_article_nodes.append(node)

    print("\nüìå Ph√¢n lo·∫°i nodes:")
    for kind, total in type_tally.items():
        print(f"   {kind}: {total}")

    if non_article_nodes:
        # In-degree = number of edges pointing at the node; highest first.
        ranked = sorted(
            ((node, G.in_degree(node)) for node in non_article_nodes),
            key=lambda pair: pair[1],
            reverse=True,
        )

        print("\nüî• TOP 10 ENTITIES ƒê∆Ø·ª¢C NH·∫ÆC ƒê·∫æN NHI·ªÄU NH·∫§T (b·ªüi c√°c b√†i b√°o):\n")
        for rank, (node, degree) in enumerate(ranked[:10], 1):
            print(f"{rank:2d}. {node}: {degree} b√†i b√°o")

    print("\n" + rule)

# Run the analysis on the graph built by extract_entities_from_news().
analyze_graph(G)

## 💾 Lưu Graph để sử dụng sau

Lưu graph vào file pickle để sử dụng cho các bước tiếp theo (relation extraction, attention mechanism...):

In [None]:
import pickle

# Persist the knowledge graph so later steps can reload it.
with open('knowledge_graph.pkl', 'wb') as f:
    pickle.dump(G, f)
print("‚úì ƒê√£ l∆∞u knowledge graph v√†o 'knowledge_graph.pkl'")

# Persist the set of canonical entity names alongside it.
with open('canonical_entities.pkl', 'wb') as f:
    pickle.dump(canonical_entities, f)
print("‚úì ƒê√£ l∆∞u canonical_entities v√†o 'canonical_entities.pkl'")

# To load them back later (only unpickle files you created yourself:
# pickle.load can execute arbitrary code from untrusted data):
# with open('knowledge_graph.pkl', 'rb') as f:
#     G = pickle.load(f)
# with open('canonical_entities.pkl', 'rb') as f:
#     canonical_entities = pickle.load(f)