In [9]:
import torch

# Report which compute backend PyTorch will use for the rest of the notebook.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch will use the CPU.")
else:
    # The device name is queried for device index 0.
    print("CUDA is available. PyTorch can use the GPU.")
    print(f"Number of available GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU Name: {torch.cuda.get_device_name(0)}")


CUDA is available. PyTorch can use the GPU.
Number of available GPUs: 1
Current GPU Name: NVIDIA A100-SXM4-40GB


## Scraping

In [10]:
import json
import re
import time
import random
import os
from urllib.parse import urljoin, urlsplit, urlunsplit

# Third-party imports
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Listing page for stock-market news. Page N is requested as f"{BASE_URL}/{N}",
# and the same URL is sent as the Referer header when fetching articles.
BASE_URL = "https://www.investing.com/news/stock-market-news"
# Site root used to turn relative article hrefs into absolute URLs.
DOMAIN = "https://www.investing.com"
# Browser profile handed to cloudscraper.create_scraper(browser=...).
BROWSER_CONFIG = {"browser": "chrome", "platform": "windows", "desktop": True}
# CSV of scraped articles: read for de-duplication, then appended to.
OUTPUT_FILENAME = "csv_checkpoint/investing_news_realtime.csv"

# ==========================================
# 2. UTILITY FUNCTIONS
# ==========================================
def normalize_link(url):
    """
    Return `url` reduced to its scheme, network location and path.

    The query string and the fragment are discarded, so links that differ
    only in those two components compare equal.
    """
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))

def extract_clean_text(raw_html):
    """
    Turn an article's raw HTML into plain paragraph text.

    Non-content tags are removed, the article container is located with a
    list of known selectors (falling back to <body>), and every <p> inside
    it is kept when it is longer than 30 characters and contains none of the
    boilerplate phrases. Kept paragraphs are joined with blank lines.

    Returns an empty string for empty/None input or when nothing qualifies.
    """
    if not raw_html:
        return ""

    soup = BeautifulSoup(raw_html, "html.parser")

    # Strip tags that never carry article text.
    for junk in soup.find_all(["script", "style", "noscript", "iframe", "header", "footer"]):
        junk.decompose()

    # A paragraph containing any of these phrases is treated as boilerplate.
    boilerplate = (
        "generated with the support of AI",
        "reviewed by an editor",
        "Join our investing challenges",
        "InvestingPro",
        "For more information see our T&C",
        "Position:",
    )

    # Candidate containers, tried in order; the first truthy hit wins and
    # <body> is the last resort.
    container_queries = (
        {"class_": "WYSIWYG articlePage"},
        {"class_": "article_container"},
        {"id": "articleContent"},
        {"class_": "article-content"},
    )
    article_body = None
    for query in container_queries:
        article_body = soup.find("div", **query)
        if article_body:
            break
    if not article_body:
        article_body = soup.body

    if not article_body:
        return ""

    kept = []
    for node in article_body.find_all("p"):
        text = node.get_text(" ", strip=True)
        if len(text) <= 30:
            continue
        if any(phrase in text for phrase in boilerplate):
            continue
        kept.append(text)

    return "\n\n".join(kept).strip()

def load_existing_links(filename):
    """
    Return the set of normalized article links already stored in `filename`.

    Args:
        filename: Path of the CSV written by the scraper.

    Returns:
        A set of normalized URLs. The set is empty when the file does not
        exist, cannot be read, or has no "Link" column.
    """
    if not os.path.exists(filename):
        return set()

    try:
        df = pd.read_csv(filename)
    except Exception as e:
        print(f"Warning: Could not read existing file {filename}: {e}")
        return set()

    if "Link" not in df.columns:
        return set()

    known_links = set()
    # Missing values are dropped BEFORE normalizing. Normalizing a NaN raises,
    # and a single bad row used to discard the whole set, which silently
    # disabled de-duplication.
    for link in df["Link"].dropna().astype(str):
        try:
            known_links.add(normalize_link(link))
        except ValueError:
            # Keep an unparseable link verbatim rather than losing every link.
            known_links.add(link)

    return known_links

# ==========================================
# 3. MAIN SCRAPER FUNCTION
# ==========================================
def run_incremental_scraper(max_pages=50):
    """
    Scrape new articles from the listing pages and append them to OUTPUT_FILENAME.

    Listing pages are walked from page 1. For each page the article links are
    collected, their bodies are fetched and cleaned, and scanning stops as soon
    as a listing link is already present in the CSV (older news reached).

    Args:
        max_pages: Upper bound on listing pages to visit, a safety net against
            scanning forever when the stop condition never triggers.

    Returns:
        A DataFrame holding only the articles added in this run (empty when
        nothing new was found).
    """
    scraper = cloudscraper.create_scraper(browser=BROWSER_CONFIG)
    scraper.headers.update({"Accept-Language": "en-US,en;q=0.9"})

    # 1. Load existing data to check for duplicates
    existing_links = load_existing_links(OUTPUT_FILENAME)
    print(f"Status: Loaded {len(existing_links)} existing articles from {OUTPUT_FILENAME}")

    new_articles = []
    seen_links_session = set()
    stop_scraping = False

    # Loop through pages (limited by max_pages to prevent infinite loops if something goes wrong)
    for page in range(1, max_pages + 1):
        if stop_scraping:
            break

        current_url = f"{BASE_URL}/{page}"
        print(f"\n[Page {page}] Scanning for new links -> {current_url}")

        try:
            response = scraper.get(current_url, timeout=20)
            if response.status_code != 200:
                print(f"Error: Could not access page {page} (Status: {response.status_code})")
                # Back off before the next page so a blocking/erroring server
                # is not hit again immediately.
                time.sleep(random.uniform(2, 4))
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            title_links = soup.find_all("a", attrs={"data-test": "article-title-link"})

            if not title_links:
                print("Info: No articles found on this page. Ending scrape.")
                break

            print(f"Info: Found {len(title_links)} links on page {page}")

            current_page_candidates = []

            # --- STEP 1: Filter Links ---
            for a_tag in title_links:
                href = a_tag.get("href")
                if not href:
                    continue

                full_link = normalize_link(href if href.startswith("http") else urljoin(DOMAIN, href))

                # CHECK: If we find a link that is already in our file, we have reached old news.
                if full_link in existing_links:
                    print(f"Stop Signal: Found existing article '{a_tag.get_text(strip=True)[:30]}...'. Stopping.")
                    stop_scraping = True
                    break  # Break the link loop

                # Check for session duplicates (e.g. pinned posts appearing on multiple pages)
                # NOTE(review): the "comment" test is a plain substring match on
                # the normalized URL, so an article whose slug contains
                # "comment" is skipped as well. Confirm this is intended.
                if full_link in seen_links_session or "comment" in full_link:
                    continue

                # Prepare item for scraping
                title = a_tag.get_text(strip=True)

                # Extract metadata from the closest enclosing article-like element
                container = (
                    a_tag.find_parent("article") or
                    a_tag.find_parent("li") or
                    a_tag.find_parent("div", class_=lambda x: x and "article" in x)
                )

                date_val, source_name = "Unknown", "Unknown"
                if container:
                    t = container.find("time", attrs={"data-test": "article-publish-date"})
                    s = container.find("span", attrs={"data-test": "news-provider-name"})
                    if t:
                        date_val = t.get("datetime") or t.get_text(strip=True)
                    if s:
                        source_name = s.get_text(strip=True)

                item = {
                    "Page": page,
                    "Date": date_val,
                    "Source": source_name,
                    "Title": title,
                    "Link": full_link
                }
                current_page_candidates.append(item)
                seen_links_session.add(full_link)

            # --- STEP 2: Scrape Content for New Links ---
            if current_page_candidates:
                print(f"Status: Found {len(current_page_candidates)} NEW articles on page {page}. Extracting content...")

                for i, item in enumerate(current_page_candidates, start=1):
                    print(f"    [{i}/{len(current_page_candidates)}] Fetching: {item['Title'][:50]}...")

                    try:
                        scraper.headers.update({"Referer": BASE_URL})
                        resp = scraper.get(item["Link"], timeout=20)

                        if resp.status_code != 200:
                            # Do not store the text of an error or challenge
                            # page as if it were the article body.
                            print(f"        Warning: Skipped article (Status: {resp.status_code}).")
                        else:
                            content = extract_clean_text(resp.text)

                            if content and len(content) > 100:
                                item["Content"] = content
                                new_articles.append(item)
                            else:
                                print(f"        Warning: Content too short/empty.")
                    except Exception as e:
                        print(f"        Error fetching article: {e}")

                    # Polite delay between article requests
                    time.sleep(random.uniform(1.5, 3))

            else:
                if not stop_scraping:
                    print("Info: No valid new links found on this page (might be duplicates or ads).")

        except Exception as e:
            print(f"Critical Error processing page {page}: {e}")
            continue

        # Delay between pages
        if not stop_scraping:
            time.sleep(random.uniform(2, 4))

    # --- STEP 3: Save New Data ---
    if new_articles:
        df_new = pd.DataFrame(new_articles)

        # Make sure the checkpoint directory exists so a long scrape is not
        # lost at the very last step.
        out_dir = os.path.dirname(OUTPUT_FILENAME)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # A header is needed only when the file is missing or still empty.
        has_data = os.path.isfile(OUTPUT_FILENAME) and os.path.getsize(OUTPUT_FILENAME) > 0

        # 'utf-8-sig' emits a BOM on the first write of every open(), so it is
        # used only when the file is created. On append, plain 'utf-8' keeps a
        # stray BOM out of the middle of the CSV.
        df_new.to_csv(
            OUTPUT_FILENAME,
            mode='a',
            header=not has_data,
            index=False,
            encoding='utf-8' if has_data else 'utf-8-sig',
        )
        print("\n" + "="*80)
        print(f"SUCCESS: Appended {len(df_new)} new articles to {OUTPUT_FILENAME}")
        print("="*80)
    else:
        print("\n" + "="*80)
        print("No new articles found. The file is up to date.")
        print("="*80)

    return pd.DataFrame(new_articles)

# ==========================================
# EXECUTION
# ==========================================
if __name__ == "__main__":
    # Scan at most 50 listing pages. The run ends earlier as soon as a listing
    # link is already present in the CSV, and new rows are appended to it.
    run_incremental_scraper(max_pages=50)

Status: Loaded 10816 existing articles from csv_checkpoint/investing_news_realtime.csv

[Page 1] Scanning for new links -> https://www.investing.com/news/stock-market-news/1
Info: Found 35 links on page 1
Stop Signal: Found existing article 'Investing.com‚Äôs stocks of the ...'. Stopping.

No new articles found. The file is up to date.


In [11]:
# Reload the scraped-articles CSV and inspect its shape and contents.
import pandas as pd
df = pd.read_csv('csv_checkpoint/investing_news_realtime.csv')
print(df.shape)
df

(10816, 6)


Unnamed: 0,Page,Date,Source,Title,Link,Content
0,1,2026-01-03 09:56:10,Investing.com,BofA unveils its top 10 U.S. ideas for Q1 2026,https://www.investing.com/news/stock-market-ne...,Investing.com -- Bank of America has released ...
1,1,2026-01-03 09:55:45,Investing.com,Canaccord‚Äôs says 2026 is likely to be ‚Äôa bount...,https://www.investing.com/news/stock-market-ne...,Investing.com --¬†Canaccord Genuity analyst Geo...
2,1,2026-01-03 09:05:02,Investing.com,Is Reddit the new homepage for the Open Web?,https://www.investing.com/news/stock-market-ne...,Investing.com -- Reddit is increasingly positi...
3,1,2026-01-03 03:24:29,Reuters,"Trump blocks chips deal, cites security, China...",https://www.investing.com/news/stock-market-ne...,"WASHINGTON, Jan 2 (Reuters) - President Donald..."
4,1,2026-01-03 01:12:24,Reuters,"Top hedge funds led by D.E.Shaw, Bridgewater a...",https://www.investing.com/news/stock-market-ne...,(Corrects Point72‚Äôs return figures in second b...
...,...,...,...,...,...,...
10811,1,2026-01-10 00:25:34,Reuters,US FTC wins ruling blocking Edwards Lifescienc...,https://www.investing.com/news/stock-market-ne...,Jan 9 (Reuters) - The U.S. Federal Trade Commi...
10812,1,2026-01-10 09:00:05,Investing.com,Investing.com‚Äôs stocks of the week,https://www.investing.com/news/stock-market-ne...,Investing.com ‚Äì With the first week of full tr...
10813,1,2026-01-10 09:00:03,Investing.com,Wolfe analysts say these will be key investmen...,https://www.investing.com/news/stock-market-ne...,Investing.com -- Wolfe Research expects U.S. e...
10814,1,2026-01-10 08:30:03,Investing.com,Will SMidCaps outperform in 2026?,https://www.investing.com/news/stock-market-ne...,Investing.com -- Small- and mid-cap (SMidCaps)...


## IDX

### IDX (TF-IDF Based)

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Tuple, Any

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Scraped-articles CSV used as classifier input.
CSV_PATH = 'csv_checkpoint/investing_news_realtime.csv'
# A sector is assigned only when its cosine similarity is strictly greater than this.
THRESHOLD = 0.02
# Maximum number of sectors kept per article in the multi-label result.
MAX_LABELS = 3

# Sector name -> one space-separated keyword "document". Each value is
# vectorized together with the news texts and acts as the reference vector
# for its sector.
# NOTE(review): the vectorizer is built with stop_words='english' and its
# default tokenizer, so very short or stop-word entries (e.g. "it", "r&d")
# presumably contribute nothing. Verify against the fitted vocabulary.
SECTOR_KEYWORDS = {
    "Technology": (
        "technology software semiconductor chip artificial intelligence ai cloud computing "
        "cybersecurity hardware electronics data center server processor gpu cpu saas "
        "it services digital platform quantum computing machine learning automation "
        "network infrastructure operating system application developer tech"
    ),
    "Communication Services": (
        "communication internet telecommunication telecom media entertainment streaming "
        "social media advertising broadcasting broadband wireless network cable satellite "
        "interactive media publishing movies gaming video content provider"
    ),
    "Consumer Cyclical": (
        "consumer discretionary retail e-commerce automotive vehicle electric vehicle ev "
        "car auto parts restaurant travel leisure hotel resort casino gambling apparel "
        "luxury goods home improvement department store textile footwear consumer services"
    ),
    "Financials": (
        "financial banking bank investment asset management insurance credit fintech "
        "capital markets wealth management interest rate monetary policy federal reserve "
        "fed loan mortgage equity trading brokerage payment system currency exchange "
        "private equity hedge fund venture capital audit tax"
    ),
    "Healthcare": (
        "healthcare health pharmaceutical biotech biotechnology medical device "
        "drug vaccine clinical trial fda approval hospital health insurance "
        "life sciences diagnosis therapy treatment genomics medical equipment "
        "managed care pharmacy research development r&d"
    ),
    "Energy": (
        "energy oil gas petroleum crude drilling exploration production pipeline "
        "refining refinery renewable energy solar wind biofuel carbon capture "
        "energy equipment services natural gas lng offshore onshore fuel power generation"
    ),
    "Industrials": (
        "industrial aerospace defense machinery transportation logistics airline "
        "freight railroad shipping trucking manufacturing construction engineering "
        "building products electrical equipment commercial services waste management "
        "infrastructure conglomerate supply chain"
    ),
    "Consumer Defensive": (
        "consumer staples food beverage household products personal care tobacco "
        "supermarket grocery hypermarket discount store agriculture products "
        "packaged food hygiene cleaning products soft drink alcohol brewing"
    ),
    "Real Estate": (
        "real estate reit property housing residential commercial industrial "
        "leasing tenant development management brokerage mortgage reit "
        "data center reit tower reit healthcare reit hotel reit office reit retail reit"
    ),
    "Utilities": (
        "utilities electric power water gas utility renewable utility grid "
        "transmission distribution energy infrastructure clean energy nuclear "
        "independent power producer multi-utilities"
    ),
    "Basic Materials": (
        "basic materials chemicals mining metals steel gold copper silver "
        "agriculture fertilizer construction materials packaging container "
        "paper forest products specialty chemicals industrial gases commodity "
        "aluminum iron ore lithium rare earth"
    )
}

# ==========================================
# 2. CLASSIFIER CLASS
# ==========================================
class SectorClassifier:
    """
    Labels texts with sectors by TF-IDF cosine similarity.

    Each sector is represented by one keyword document. Texts and keyword
    documents are vectorized together, and every text is compared with every
    sector vector.
    """

    def __init__(self, keywords: Dict[str, str]):
        """
        Args:
            keywords: Mapping of sector name -> space-separated keyword string.
        """
        self.sector_names = list(keywords.keys())
        self.sector_docs = list(keywords.values())
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def _label_scores(self, scores, threshold: float, max_labels: int) -> Tuple[str, float, Dict[str, float], int]:
        """
        Derive the labels for one text from its per-sector similarity scores.

        Args:
            scores: 1-D numpy array with one score per entry of `sector_names`.
            threshold: A sector qualifies only when its score is strictly greater.
            max_labels: Maximum number of sectors kept in the dictionary.

        Returns:
            (primary_sector, best_score, sector_dict, sector_count). When no
            sector qualifies these are "Other", the best score,
            {'Other': 0.0} and 0.
        """
        best_idx = int(scores.argmax())
        max_score = scores.max()
        primary = self.sector_names[best_idx] if max_score > threshold else "Other"

        qualified_indices = np.flatnonzero(scores > threshold)
        if qualified_indices.size == 0:
            return primary, max_score, {'Other': 0.0}, 0

        # A stable sort on the negated scores gives descending order and keeps
        # ties in index order. The first dictionary key therefore always
        # matches the primary sector chosen by argmax; a reversed ascending
        # argsort would flip tied entries.
        order = np.argsort(-scores[qualified_indices], kind="stable")[:max_labels]
        final_indices = qualified_indices[order]

        sector_dict = {
            self.sector_names[i]: round(float(scores[i]), 5)
            for i in final_indices
        }
        return primary, max_score, sector_dict, len(sector_dict)

    def classify(self, df: pd.DataFrame, text_col: str, threshold: float = 0.02, max_labels: int = 3) -> pd.DataFrame:
        """
        Performs TF-IDF vectorization and cosine similarity to assign sectors.

        Args:
            df: Frame holding the texts. It is modified in place: the columns
                'Sector', 'Confidence', 'Sector_Dict' and 'Sector_Count' are added.
            text_col: Name of the column with the text to classify.
            threshold: Minimum similarity (exclusive) for a sector to be assigned.
            max_labels: Maximum number of sectors stored in 'Sector_Dict'.

        Returns:
            The same `df` object with the four result columns added.
        """
        print("üßÆ Vectorizing text and calculating similarity...")

        # The vectorizer rejects NaN documents, so missing texts become "".
        texts = df[text_col].fillna("").astype(str).tolist()

        if texts:
            # Prepare Corpus: Combine Sector Keywords + News Content
            all_docs = self.sector_docs + texts
            tfidf_matrix = self.vectorizer.fit_transform(all_docs)

            # The first len(sector_names) rows are the sector documents.
            n_sectors = len(self.sector_names)
            similarity_scores = cosine_similarity(tfidf_matrix[n_sectors:], tfidf_matrix[:n_sectors])
        else:
            # Nothing to compare: skip vectorization, which fails on zero rows.
            similarity_scores = []

        print("üîç Analyzing sectors for each article...")

        labelled = [self._label_scores(scores, threshold, max_labels) for scores in similarity_scores]

        # Assign back to DataFrame
        df['Sector'] = [row[0] for row in labelled]
        df['Confidence'] = [row[1] for row in labelled]
        df['Sector_Dict'] = [row[2] for row in labelled]
        df['Sector_Count'] = [row[3] for row in labelled]

        return df

# ==========================================
# 3. MAIN EXECUTION
# ==========================================
def load_and_prep_data(filepath: str) -> pd.DataFrame:
    """
    Read the news CSV and add a combined 'full_text' column.

    'full_text' is the title and the content joined by a single space, with
    missing values treated as empty strings. When the file does not exist an
    error is printed and an empty DataFrame is returned.
    """
    try:
        news = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"‚ùå Error: File not found at {filepath}")
        return pd.DataFrame()

    titles = news['Title'].fillna('')
    bodies = news['Content'].fillna('')
    news['full_text'] = titles + " " + bodies
    return news

# Runs the TF-IDF classification end to end. `df_news`, `classifier` and
# `df_result` become notebook globals; the next cell saves `df_result`.
if __name__ == "__main__":
    # 1. Load Data (an empty frame is returned when the CSV is missing)
    df_news = load_and_prep_data(CSV_PATH)

    if not df_news.empty:
        # 2. Initialize Classifier
        classifier = SectorClassifier(SECTOR_KEYWORDS)

        # 3. Process Data
        # classify() adds its result columns to `df_news` and returns that
        # same object, so `df_result` and `df_news` refer to one frame.
        df_result = classifier.classify(
            df_news, 
            text_col='full_text', 
            threshold=THRESHOLD, 
            max_labels=MAX_LABELS
        )

        # 4. Display Results
        print("\n" + "="*50)
        print(f"‚úÖ Processing Complete. Total rows: {len(df_result)}")
        print("="*50)
        
        # Show Sample (first 10 rows of the result columns)
        cols_to_show = ['Date', 'Sector', 'Confidence', 'Sector_Dict', 'Sector_Count', 'Title']
        print(df_result[cols_to_show].head(10))
        
        # Number of articles per primary sector
        print("\nüìä Sector Distribution (Primary):")
        print(df_result['Sector'].value_counts())

üßÆ Vectorizing text and calculating similarity...
üîç Analyzing sectors for each article...

‚úÖ Processing Complete. Total rows: 10816
                  Date                  Sector  Confidence  \
0  2026-01-03 09:56:10              Financials    0.044771   
1  2026-01-03 09:55:45       Consumer Cyclical    0.049162   
2  2026-01-03 09:05:02              Technology    0.032434   
3  2026-01-03 03:24:29             Industrials    0.024838   
4  2026-01-03 01:12:24              Financials    0.146302   
5  2026-01-02 23:25:27  Communication Services    0.159893   
6  2026-01-02 22:13:29                   Other    0.014220   
7  2026-01-02 21:48:35               Utilities    0.134569   
8  2026-01-02 21:43:09                   Other    0.019699   
9  2026-01-02 21:31:23         Basic Materials    0.025417   

                                         Sector_Dict  Sector_Count  \
0  {'Financials': 0.04477, 'Healthcare': 0.02542,...             3   
1  {'Consumer Cyclical': 0.04916, 'Ene

In [13]:
# Drop the helper `full_text` column and save the TF-IDF results to CSV.
# `to_csv` returns None when it is given a path, so its result is not assigned
# back: `df_result_drop` stays a usable DataFrame instead of becoming None.
df_result_drop = df_result.drop(columns=["full_text"])
df_result_drop.to_csv("csv_checkpoint/investing_news_tfidf.csv", index=False)

In [14]:
# Reload the saved TF-IDF results to check what was written to disk.
df = pd.read_csv("csv_checkpoint/investing_news_tfidf.csv")
df

Unnamed: 0,Page,Date,Source,Title,Link,Content,Sector,Confidence,Sector_Dict,Sector_Count
0,1,2026-01-03 09:56:10,Investing.com,BofA unveils its top 10 U.S. ideas for Q1 2026,https://www.investing.com/news/stock-market-ne...,Investing.com -- Bank of America has released ...,Financials,0.044771,"{'Financials': 0.04477, 'Healthcare': 0.02542,...",3
1,1,2026-01-03 09:55:45,Investing.com,Canaccord‚Äôs says 2026 is likely to be ‚Äôa bount...,https://www.investing.com/news/stock-market-ne...,Investing.com --¬†Canaccord Genuity analyst Geo...,Consumer Cyclical,0.049162,"{'Consumer Cyclical': 0.04916, 'Energy': 0.02481}",2
2,1,2026-01-03 09:05:02,Investing.com,Is Reddit the new homepage for the Open Web?,https://www.investing.com/news/stock-market-ne...,Investing.com -- Reddit is increasingly positi...,Technology,0.032434,{'Technology': 0.03243},1
3,1,2026-01-03 03:24:29,Reuters,"Trump blocks chips deal, cites security, China...",https://www.investing.com/news/stock-market-ne...,"WASHINGTON, Jan 2 (Reuters) - President Donald...",Industrials,0.024838,{'Industrials': 0.02484},1
4,1,2026-01-03 01:12:24,Reuters,"Top hedge funds led by D.E.Shaw, Bridgewater a...",https://www.investing.com/news/stock-market-ne...,(Corrects Point72‚Äôs return figures in second b...,Financials,0.146302,"{'Financials': 0.1463, 'Utilities': 0.03505}",2
...,...,...,...,...,...,...,...,...,...,...
10811,1,2026-01-10 00:25:34,Reuters,US FTC wins ruling blocking Edwards Lifescienc...,https://www.investing.com/news/stock-market-ne...,Jan 9 (Reuters) - The U.S. Federal Trade Commi...,Healthcare,0.046365,{'Healthcare': 0.04636},1
10812,1,2026-01-10 09:00:05,Investing.com,Investing.com‚Äôs stocks of the week,https://www.investing.com/news/stock-market-ne...,Investing.com ‚Äì With the first week of full tr...,Utilities,0.098161,"{'Utilities': 0.09816, 'Energy': 0.05548, 'Tec...",3
10813,1,2026-01-10 09:00:03,Investing.com,Wolfe analysts say these will be key investmen...,https://www.investing.com/news/stock-market-ne...,Investing.com -- Wolfe Research expects U.S. e...,Financials,0.093048,"{'Financials': 0.09305, 'Consumer Cyclical': 0...",3
10814,1,2026-01-10 08:30:03,Investing.com,Will SMidCaps outperform in 2026?,https://www.investing.com/news/stock-market-ne...,Investing.com -- Small- and mid-cap (SMidCaps)...,Healthcare,0.024086,{'Healthcare': 0.02409},1


### IDX (LLM)

In [None]:
import torch
from transformers import pipeline
# MY_HUGGINGFACE_TOKEN = ""  # Use your own Hugging Face token (load it from an environment variable rather than hardcoding it)

In [16]:
import pandas as pd
import torch
import json
import os
import gc
import ast
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from typing import List, Any

# ==========================================
# 1. CONFIGURATION
# ==========================================
class Config:
    """Paths, model settings and the sector list used by the LLM classification step."""

    # Files
    TFIDF_FILE = 'csv_checkpoint/investing_news_tfidf.csv'   # Input 1: results of the TF-IDF step
    LLM_TEMP_FILE = 'csv_checkpoint/investing_news_llm.csv'  # Temp output: LLM results (also saved periodically during the run)
    # NOTE(review): the Sentiment section reads
    # 'csv_checkpoint/df_final_result_idx.csv', not this path. Confirm how
    # the final file gets there.
    FINAL_OUTPUT_FILE = 'df_final_result.csv'                # Final output: merged result
    
    # Model Settings
    MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"
    BATCH_SIZE = 16      # number of articles sent to the model per generate() call
    DEVICE = "cuda:0"    # passed as device_map and used to place the tokenized inputs
    
    # Sector names offered to the model in the prompt.
    EXISTING_SECTORS = [
        'Financials', 'Technology', 'Healthcare', 'Consumer Cyclical',
        'Energy', 'Industrials', 'Basic Materials', 'Communication Services',
        'Utilities', 'Consumer Defensive', 'Real Estate'
    ]

# ==========================================
# 2. STEP 1: LLM CLASSIFIER (AI Logic)
# ==========================================
def sanitize_sector_output(sector: Any) -> str:
    """
    Flatten a parsed sector value into a plain string.

    A list becomes its items joined by commas (each item stringified); any
    other value, dictionaries included, is returned as `str(value)`.
    """
    if isinstance(sector, list):
        return ",".join(map(str, sector))
    return str(sector)

def parse_llm_response(response: str) -> str:
    """
    Extract the "sector" value from a model reply that should contain JSON.

    Markdown code fences are stripped, then the first JSON object found in
    the text is decoded. Text after that object (explanations, a second
    object) is ignored instead of invalidating the whole reply.

    Args:
        response: Raw text generated by the model.

    Returns:
        The value stored under "sector" in the first decodable JSON object.
        This is normally a string but is returned as-is when the model emits
        another JSON type. "Other" is returned when the input is not a
        string, no JSON object can be decoded, the key is missing, or its
        value is null.
    """
    if not isinstance(response, str):
        return "Other"

    cleaned = response.replace("```json", "").replace("```", "").strip()
    decoder = json.JSONDecoder()

    # Try every '{' as a possible start, so a stray brace before the real
    # object does not hide it.
    start = cleaned.find('{')
    while start != -1:
        try:
            data, _ = decoder.raw_decode(cleaned, start)
        except ValueError:  # json.JSONDecodeError is a ValueError
            data = None

        if isinstance(data, dict):
            sector = data.get("sector", "Other")
            return "Other" if sector is None else sector

        start = cleaned.find('{', start + 1)

    return "Other"

class NewsClassifier:
    """Wraps a causal chat model that assigns one sector to each news item."""

    def __init__(self, model_name: str, device: str):
        """
        Load the tokenizer and the model.

        Args:
            model_name: Hugging Face model identifier.
            device: Device string used as `device_map` and for the model inputs.
        """
        print(f"üöÄ [Step 1] Loading AI Model: {model_name}...")
        # Left padding keeps every prompt's end adjacent to the generated
        # tokens in a batch.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map=device)
        self.device = device

    def batch_predict(self, titles: List[str], contents: List[str]) -> List[str]:
        """
        Generate one raw reply per (title, content) pair.

        Args:
            titles: Article titles.
            contents: Article bodies; only the first 500 characters are used.

        Returns:
            The decoded model replies (prompt removed), one per input pair.
        """
        prompts = []
        for t, c in zip(titles, contents):
            text = f"""Classify into JSON.
Sectors: {json.dumps(Config.EXISTING_SECTORS)}
If unrelated, use "Other".
News: "{t}"
Snippet: "{str(c)[:500]}..."
Format: {{"sector": "..."}}"""
            messages = [{"role": "user", "content": text}]
            prompts.append(self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(self.device)
        with torch.no_grad():
            # Greedy decoding. `temperature` is not passed because it only
            # applies when sampling; combined with do_sample=False it is an
            # inconsistent request.
            generated_ids = self.model.generate(**inputs, max_new_tokens=40, do_sample=False)

        # Every prompt is padded to the same length, so the new tokens start
        # at the same column for the whole batch.
        input_len = inputs.input_ids.shape[1]
        generated_ids = generated_ids[:, input_len:]
        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    def free_memory(self):
        """
        Release the model and tokenizer and empty the CUDA cache.

        Safe to call more than once: the references are set to None rather
        than deleted, so a second call cannot raise AttributeError.
        """
        print("üßπ [Cleanup] Clearing VRAM...")
        self.model = None
        self.tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print(f"‚úÖ VRAM Cleared. Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

def run_llm_process():
    """
    Classify with the LLM the articles that TF-IDF left without a sector.

    Reads Config.TFIDF_FILE, selects the rows whose 'Sector' is 'Other' or
    missing and that have no 'AI_Sector' yet, writes the model's answer to
    'AI_Sector', and saves the frame to Config.LLM_TEMP_FILE (periodically
    during the run and once at the end).

    Returns:
        False when the input file is missing, True otherwise.
    """
    if not os.path.exists(Config.TFIDF_FILE):
        print(f"‚ùå Error: Input file {Config.TFIDF_FILE} missing.")
        return False

    df = pd.read_csv(Config.TFIDF_FILE)
    if 'AI_Sector' not in df.columns:
        df['AI_Sector'] = None
    # An all-empty column is read back from CSV as float. Forcing object dtype
    # lets string labels be stored directly, with no fallback assignment.
    df['AI_Sector'] = df['AI_Sector'].astype(object)

    # Only rows without a TF-IDF sector, and not already labelled by the AI
    needs_ai = (df['Sector'] == 'Other') | (df['Sector'].isna())
    unlabelled = df['AI_Sector'].isna()
    target_indices = df[needs_ai & unlabelled].index.tolist()
    print(f"üìä Rows to classify by AI: {len(target_indices)}")

    if len(target_indices) > 0:
        classifier = NewsClassifier(Config.MODEL_NAME, Config.DEVICE)
        try:
            for i in tqdm(range(0, len(target_indices), Config.BATCH_SIZE), desc="ü§ñ AI Processing"):
                batch_idx = target_indices[i : i + Config.BATCH_SIZE]
                batch_titles = df.loc[batch_idx, 'Title'].tolist()
                batch_contents = df.loc[batch_idx, 'Content'].tolist()

                raw_responses = classifier.batch_predict(batch_titles, batch_contents)

                for idx, resp in zip(batch_idx, raw_responses):
                    df.at[idx, 'AI_Sector'] = sanitize_sector_output(parse_llm_response(resp))

                # Checkpoint every 5th batch (including the first)
                if (i // Config.BATCH_SIZE) % 5 == 0:
                    df.to_csv(Config.LLM_TEMP_FILE, index=False)
        finally:
            # Release GPU memory even when the loop is interrupted
            classifier.free_memory()

    # Save final LLM result
    df.to_csv(Config.LLM_TEMP_FILE, index=False)
    print(f"üíæ AI Results saved to {Config.LLM_TEMP_FILE}")
    return True

# ==========================================
# 3. STEP 2: MERGER & FINAL LOGIC
# ==========================================
class ResultMerger:
    """Combines the TF-IDF and LLM outputs into one final sector per article."""

    def _determine_sector(self, row):
        """
        Pick the final sector string for one row.

        Args:
            row: Mapping-like row (anything with `.get`) that may hold
                'Sector_Dict', 'Sector_Count' and 'AI_Sector'.

        Returns:
            The TF-IDF sector names joined by ", " when TF-IDF found at least
            one real sector; otherwise the non-empty 'AI_Sector' value;
            otherwise "Other".
        """
        # 1. Check TF-IDF result first
        sector_dict_str = row.get('Sector_Dict', '{}')
        sector_count = row.get('Sector_Count', 0)

        # Sector_Dict is a dict literal stored as text in the CSV. Malformed
        # text is treated as "no TF-IDF result" rather than hiding every
        # possible error behind a bare except.
        try:
            val_dict = ast.literal_eval(sector_dict_str) if isinstance(sector_dict_str, str) else sector_dict_str
        except (ValueError, SyntaxError, TypeError):
            val_dict = None

        valid_keys = []
        if isinstance(val_dict, dict):
            valid_keys = list(val_dict.keys())
            if len(valid_keys) > 1 and 'Other' in valid_keys:
                valid_keys.remove('Other')

        # A missing count (None/NaN) means no TF-IDF hit; check it before
        # comparing so that None > 0 cannot raise.
        has_tfidf_hit = bool(pd.notna(sector_count)) and sector_count > 0

        # Logic: If TF-IDF found valid sectors -> Use them. Else -> Use AI.
        if has_tfidf_hit and valid_keys and valid_keys != ['Other']:
            return ", ".join(str(key) for key in valid_keys)

        ai_val = row.get('AI_Sector')
        return str(ai_val) if pd.notna(ai_val) and str(ai_val).strip() != "" else "Other"

    def process(self):
        """
        Merge the TF-IDF and LLM files, add 'Combined_Sector', and save the result.

        Rows are de-duplicated on 'Link' keeping the last occurrence, so a row
        from the LLM file replaces the same article from the TF-IDF file.
        The result is written to Config.FINAL_OUTPUT_FILE.
        """
        print("\nüîó [Step 2] Merging & Finalizing Sectors...")

        # Load & Merge
        df_tfidf = pd.read_csv(Config.TFIDF_FILE)
        try:
            df_llm = pd.read_csv(Config.LLM_TEMP_FILE)
        except FileNotFoundError:
            print("‚ö†Ô∏è No LLM file found, using TF-IDF only.")
            df_llm = pd.DataFrame()

        # Vertical Concat & Deduplicate (Prioritize LLM/Last file).
        # Empty frames are left out of the concat.
        frames = [frame for frame in (df_tfidf, df_llm) if not frame.empty]
        df_combined = pd.concat(frames, ignore_index=True) if frames else df_tfidf.copy()
        df_combined = df_combined.drop_duplicates(subset=['Link'], keep='last')

        # Apply Logic. A row-wise apply on an empty frame does not yield a
        # Series, so the empty case gets an empty column directly.
        if df_combined.empty:
            df_combined['Combined_Sector'] = pd.Series(dtype=object)
        else:
            df_combined['Combined_Sector'] = df_combined.apply(self._determine_sector, axis=1)

        # Save Final
        df_combined.to_csv(Config.FINAL_OUTPUT_FILE, index=False)
        print(f"‚úÖ SUCCESS! Final data saved to: {Config.FINAL_OUTPUT_FILE}")
        print(f"   Total Rows: {len(df_combined)}")

# ==========================================
# 4. MAIN PIPELINE
# ==========================================
if __name__ == "__main__":
    # 1. Run AI Process (returns False when the TF-IDF input file is missing)
    success = run_llm_process()
    
    # 2. Run Merge Process only when step 1 reported success
    if success:
        merger = ResultMerger()
        merger.process()

üìä Rows to classify by AI: 1998
üöÄ [Step 1] Loading AI Model: Qwen/Qwen2.5-14B-Instruct...


Loading weights:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 378/579 [00:27<00:14, 13.76it/s, Materializing param=model.layers.31.mlp.up_proj.weight]             


OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 39.38 GiB of which 15.38 MiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 38.94 GiB is allocated by PyTorch, and 8.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Sentiment

In [None]:
# 1. Load the file (assuming the latest combined file is used)
import pandas as pd
df = pd.read_csv('csv_checkpoint/df_final_result_idx.csv')
# df = df[df["Sector_Count"] == 0]
# Number of articles per combined sector label
df["Combined_Sector"].value_counts()
# df = df[["Page", "Date", "Source",	"Title"	,"Link", "Content", "Combined_Sector"]]
# df

Combined_Sector
Financials                                         1344
Other                                              1099
Technology                                          893
Healthcare                                          724
Consumer Cyclical                                   578
                                                   ... 
Energy, Technology, Healthcare                        1
Healthcare, Communication Services, Utilities         1
Technology, Healthcare, Real Estate                   1
Industrials, Energy, Technology                       1
Industrials, Financials, Communication Services       1
Name: count, Length: 602, dtype: int64

In [1]:
import pandas as pd
import torch
import json
import gc
import warnings
import os
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from huggingface_hub import login

# Silence warnings: ignore every Python warning raised in this kernel ...
warnings.filterwarnings('ignore')
import logging
from transformers import logging as hf_logging
# ... restrict the transformers library's own logger to errors only ...
hf_logging.set_verbosity_error() 
# ... and raise the stdlib logger named "transformers.generation_utils" to ERROR as well.
logging.getLogger("transformers.generation_utils").setLevel(logging.ERROR)


# ==========================================
# ‚öôÔ∏è CONFIG
# ==========================================
# Models are run one after another in list order. The part of "name" after the
# last '/' becomes the result column "Score_<short name>".
# NOTE(review): "weight" is not read by run_consensus_pipeline in this cell;
# presumably a later consensus step uses it. The active weights sum to 0.6,
# not 1.0 -- confirm that is intended.
MODELS_CONFIG = [
    # {"name": "microsoft/Phi-3-mini-4k-instruct", "weight": 0.4},
    {"name": "Qwen/Qwen2.5-14B-Instruct", "weight": 0.2},
    {"name": "meta-llama/Meta-Llama-3.1-8B-Instruct", "weight": 0.2},
    {"name": "google/gemma-3-12b-it","weight": 0.2}
]

# Number of prompts sent to the model per generate() call.
BATCH_SIZE = 16
# Folder holding both the input CSV and the checkpoint/output CSV.
CSV_CHECKPOINT_DIR = "csv_checkpoint"
# Input: news rows with their combined sector labels.
SOURCE_FILE = os.path.join(CSV_CHECKPOINT_DIR, "df_final_result_idx.csv")
# Output: rewritten after every batch, and read back on start-up as the checkpoint.
OUTPUT_FILE = os.path.join(CSV_CHECKPOINT_DIR, "sentiment_final.csv")

# ==========================================
# üõ†Ô∏è UTILS
# ==========================================
def clear_gpu():
    """Drop the global model/tokenizer and release cached GPU memory.

    Removes the module-level names ``model`` and ``tokenizer`` when they exist,
    runs the garbage collector, and then asks PyTorch to release its cached
    CUDA blocks. Prints a confirmation line when done.
    """
    # Remove the global references (if any) so the objects become collectable.
    module_globals = globals()
    for name in ('model', 'tokenizer'):
        module_globals.pop(name, None)

    # Collect BEFORE emptying the cache: memory owned by objects that are still
    # waiting for the garbage collector is not free yet, and empty_cache() can
    # only hand back blocks that are already unused.
    gc.collect()
    torch.cuda.empty_cache()
    print("üßπ GPU Memory Cleared!")

def create_prompt(text):
    """Build the sentiment-analysis instruction for one news item.

    Args:
        text: News text; it is embedded verbatim between double quotes.

    Returns:
        The prompt string, which asks the model to answer with a JSON object
        holding a "category" and a "score".
    """
    prompt_lines = [
        "Analyze the sentiment of this financial news.",
        "Consider the impact on the company, sector, or economy mentioned.",
        "",
        f'News: "{text}"',
        "",
        "Return ONLY a JSON object with this format:",
        "{",
        '  "category": "Positive" or "Negative" or "Neutral",',
        '  "score": <float number between -1.0 to 1.0>',
        "}",
    ]
    return "\n".join(prompt_lines)

# ==========================================
# üöÄ MAIN PIPELINE (UPDATED)
# ==========================================
def _parse_sentiment_score(resp):
    """Turn one raw model answer into a sentiment score.

    A complete ``{...}`` span is parsed as JSON and its "score" is returned
    (0.0 when the key is missing, the JSON is malformed, or the value is not
    numeric). When no complete span exists (for example the answer was cut off
    before the closing brace), the text is searched for the words
    "positive" / "negative" and mapped to 0.5 / -0.5; otherwise 0.0.
    """
    clean = resp.replace("```json", "").replace("```", "").strip()
    start, end = clean.find('{'), clean.rfind('}')
    # Compare the raw rfind() result: adding 1 before the check would turn the
    # "not found" sentinel (-1) into 0 and make the fallback unreachable.
    if start != -1 and end > start:
        try:
            return float(json.loads(clean[start:end + 1]).get("score", 0.0))
        except (ValueError, TypeError, AttributeError):
            return 0.0  # unusable JSON counts as neutral

    lowered = resp.lower()
    if "positive" in lowered:
        return 0.5
    if "negative" in lowered:
        return -0.5
    return 0.0


def run_consensus_pipeline(df_pipe):
    """Score every news row with each model in MODELS_CONFIG, resuming from a checkpoint.

    For each configured model a column ``Score_<short model name>`` is filled
    for the rows where it is still NaN. OUTPUT_FILE is rewritten after every
    batch, and scores already stored there are restored on start-up.

    Args:
        df_pipe: DataFrame with 'Title' and 'Content' columns. A 'Link' column
            is used as the key for restoring checkpointed scores.

    Returns:
        A copy of ``df_pipe`` with a 'Full_Text' column and one score column
        per model. A row whose answer could not be parsed gets 0.0 and is
        therefore treated as processed on the next run.
    """
    # Module-level names, so that clear_gpu() can find and delete them.
    global model, tokenizer

    print(f"üìÇ Loading data...")
    df = df_pipe.copy()

    # ---------------------------------------------------------
    # CHECKPOINT SYSTEM: load existing results if available
    # ---------------------------------------------------------
    if os.path.exists(OUTPUT_FILE):
        print(f"‚ú® Found checkpoint: {OUTPUT_FILE}")
        try:
            df_existing = pd.read_csv(OUTPUT_FILE)

            # Score columns that the checkpoint already contains.
            score_cols = [c for c in df_existing.columns if c.startswith("Score_")]

            # Bring the old scores back using 'Link' as the key; this stays
            # correct even when row order differs between the two files.
            if 'Link' in df.columns and 'Link' in df_existing.columns:
                # One row per link on the right side, so the merge cannot add rows.
                df_existing = df_existing.drop_duplicates(subset=['Link'], keep='last')

                # Merge only the score columns.
                cols_to_merge = ['Link'] + score_cols
                df = df.merge(df_existing[cols_to_merge], on='Link', how='left', suffixes=('', '_old'))

                # Where the input already had the column, fill only its gaps
                # from the checkpoint and drop the temporary "_old" copy.
                for col in score_cols:
                    if f"{col}_old" in df.columns:
                        df[col] = df[col].fillna(df[f"{col}_old"])
                        df.drop(columns=[f"{col}_old"], inplace=True)

                print(f"‚úÖ Restored sentiment scores from checkpoint.")
            else:
                print("‚ö†Ô∏è No 'Link' column found for merging. Processing from scratch or using index alignment.")
                # Fallback: align by index (risky if the rows have shifted),
                # attempted only when both frames have the same length.
                if len(df) == len(df_existing):
                    for col in score_cols:
                        df[col] = df_existing[col]

        except Exception as e:
            print(f"‚ö†Ô∏è Error loading checkpoint: {e}")

    # Prepare the text sent to the models: title + content, capped at 3000 characters.
    df['Full_Text'] = (df['Title'].fillna('') + "\n" + df['Content'].fillna('')).str.slice(0, 3000)

    for config in MODELS_CONFIG:
        model_id = config['name']
        short_name = model_id.split('/')[-1]
        col_score = f"Score_{short_name}"

        # Create the column as NaN so "not processed yet" can be detected.
        if col_score not in df.columns:
            df[col_score] = np.nan

        # Only rows that are still NaN are processed; any stored value
        # (even 0.0) counts as done.
        unprocessed_indices = df[df[col_score].isna()].index.tolist()

        if len(unprocessed_indices) == 0:
            print(f"\n‚è© Skipping {short_name} (All items processed!)")
            continue

        print(f"\nü§ñ Starting Model: {model_id}")
        print(f"   üìã Remaining items: {len(unprocessed_indices)} / {len(df)}")

        clear_gpu()

        try:
            # Load tokenizer (fast implementation first, slow one as fallback).
            try:
                tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)
            except Exception:
                print("‚ö†Ô∏è Falling back to slow tokenizer...")
                tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

            # Left padding makes every prompt end at the same position, which
            # the slicing of the generated tokens below relies on.
            tokenizer.padding_side = 'left'
            if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

            # Load model
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.bfloat16,
                device_map="cuda:0",
                trust_remote_code=True
            )

            # Safety clamp: highest token id the embedding table can look up.
            real_vocab_size = model.get_input_embeddings().weight.shape[0]
            max_valid_id = real_vocab_size - 1

            # Loop over the unprocessed indices only.
            for i in tqdm(range(0, len(unprocessed_indices), BATCH_SIZE), desc=f"Analyzing {short_name}"):
                batch_idx = unprocessed_indices[i : i + BATCH_SIZE]
                batch_texts = df.loc[batch_idx, 'Full_Text'].tolist()

                prompts = []
                for text in batch_texts:
                    user_content = create_prompt(text)
                    msgs = [{"role": "user", "content": user_content}]
                    try:
                        prompts.append(
                            tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
                        )
                    except Exception:
                        # Tokenizer without a usable chat template: plain prompt.
                        prompts.append(f"User: {user_content}\nAssistant:")

                # Inference
                inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(model.device)

                # Replace ids outside the embedding table so the lookup cannot fail.
                input_ids = inputs['input_ids']
                input_ids[input_ids > max_valid_id] = 0
                inputs['input_ids'] = input_ids

                with torch.no_grad():
                    # Greedy decoding; no temperature is passed because it only
                    # applies when sampling is enabled.
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=80,
                        do_sample=False,
                        pad_token_id=tokenizer.pad_token_id,
                    )

                # Keep only the newly generated tokens.
                decoded = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

                # Store one score per row of the batch.
                for idx, resp in zip(batch_idx, decoded):
                    df.at[idx, col_score] = _parse_sentiment_score(resp)

                # SAVE CHECKPOINT: overwrite the file right after each batch so
                # it always holds the latest state.
                df.to_csv(OUTPUT_FILE, index=False)

        except Exception as e:
            print(f"‚ö†Ô∏è Failed {model_id}: {e}")
        finally:
            # Free the model whether it succeeded or failed, so the next model
            # never loads next to a leftover one.
            clear_gpu()

    return df

# ==========================================
# üèÅ EXECUTION
# ==========================================
# Entry point of the sentiment cell. `df` and `result` are bound at module
# level here, so they remain available to later cells.
if __name__ == "__main__":
    # Make sure the checkpoint folder exists.
    if not os.path.exists(CSV_CHECKPOINT_DIR):
        os.makedirs(CSV_CHECKPOINT_DIR)
        print(f"üìÅ Created directory: {CSV_CHECKPOINT_DIR}")

    # Load the source file (df_final_result_idx.csv).
    if os.path.exists(SOURCE_FILE):
        print(f"Reading source from: {SOURCE_FILE}")
        df = pd.read_csv(SOURCE_FILE)
        
        # Run the pipeline; it writes OUTPUT_FILE itself after every batch.
        result = run_consensus_pipeline(df)
        
        print("\nüéâ Analysis Completed!")
        print(f"üíæ Final result saved to: {OUTPUT_FILE}")
    else:
        print(f"‚ùå Source file not found: {SOURCE_FILE}")
        print("Please upload 'df_final_result_idx.csv' to the 'csv_checkpoint' folder.")

  from .autonotebook import tqdm as notebook_tqdm


Reading source from: csv_checkpoint/df_final_result_idx.csv
üìÇ Loading data...
‚ú® Found checkpoint: csv_checkpoint/sentiment_final.csv
‚úÖ Restored sentiment scores from checkpoint.

ü§ñ Starting Model: Qwen/Qwen2.5-14B-Instruct
   üìã Remaining items: 2781 / 10812
üßπ GPU Memory Cleared!


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 579/579 [01:03<00:00,  9.10it/s, Materializing param=model.norm.weight]                              
Analyzing Qwen2.5-14B-Instruct: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 174/174 [12:37<00:00,  4.35s/it]


üßπ GPU Memory Cleared!

‚è© Skipping Meta-Llama-3.1-8B-Instruct (All items processed!)

ü§ñ Starting Model: google/gemma-3-12b-it
   üìã Remaining items: 10812 / 10812
üßπ GPU Memory Cleared!


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1065/1065 [01:11<00:00, 14.83it/s, Materializing param=model.vision_tower.vision_model.post_layernorm.weight]                       
Analyzing gemma-3-12b-it: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 676/676 [58:02<00:00,  5.15s/it] 


üßπ GPU Memory Cleared!

üéâ Analysis Completed!
üíæ Final result saved to: csv_checkpoint/sentiment_final.csv


In [7]:
# Save the final sentiment table without the helper 'Full_Text' column.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
result = result.drop(columns=['Full_Text'], errors='ignore')
result.to_csv('csv_checkpoint/sentiment_final.csv', index=False)
# Only the last expression of a cell is displayed, so the null counts are
# printed explicitly instead of being computed and thrown away.
print(result.isnull().sum())
result.head()

Unnamed: 0,Page,Date,Source,Title,Link,Content,Sector,Confidence,Sector_Dict,Sector_Count,AI_Sector,Combined_Sector,Score_Qwen2.5-14B-Instruct,Score_Meta-Llama-3.1-8B-Instruct,Score_gemma-3-12b-it
0,1,2026-01-03 09:56:10,Investing.com,BofA unveils its top 10 U.S. ideas for Q1 2026,https://www.investing.com/news/stock-market-ne...,Investing.com -- Bank of America has released ...,Financials,0.044773,"{'Financials': 0.04477, 'Healthcare': 0.02544,...",3,,"Financials, Healthcare, Energy",0.6,0.65,0.6
1,1,2026-01-03 09:55:45,Investing.com,Canaccord‚Äôs says 2026 is likely to be ‚Äôa bount...,https://www.investing.com/news/stock-market-ne...,Investing.com --¬†Canaccord Genuity analyst Geo...,Consumer Cyclical,0.049161,"{'Consumer Cyclical': 0.04916, 'Energy': 0.02483}",2,,"Consumer Cyclical, Energy",0.85,0.85,0.85
2,1,2026-01-03 09:05:02,Investing.com,Is Reddit the new homepage for the Open Web?,https://www.investing.com/news/stock-market-ne...,Investing.com -- Reddit is increasingly positi...,Technology,0.032455,{'Technology': 0.03246},1,,Technology,0.85,0.83,0.85
3,1,2026-01-03 03:24:29,Reuters,"Trump blocks chips deal, cites security, China...",https://www.investing.com/news/stock-market-ne...,"WASHINGTON, Jan 2 (Reuters) - President Donald...",Industrials,0.024836,{'Industrials': 0.02484},1,,Industrials,-0.75,-0.7,-0.7
4,1,2026-01-03 01:12:24,Reuters,"Top hedge funds led by D.E.Shaw, Bridgewater a...",https://www.investing.com/news/stock-market-ne...,(Corrects Point72‚Äôs return figures in second b...,Financials,0.146323,"{'Financials': 0.14632, 'Utilities': 0.03504}",2,,"Financials, Utilities",0.85,0.85,0.9


## News Summary

In [11]:
import pandas as pd
import torch
import gc
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# ==========================================
# ‚öôÔ∏è SYSTEM CONFIGURATION
# ==========================================
# Model used to write the one-sentence summaries.
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct" 
# Prompts per generate() call.
# NOTE: BATCH_SIZE and OUTPUT_FILE below rebind names that the Sentiment cell
# also defines, so their values depend on which cell ran last.
BATCH_SIZE = 32
# Upper bound on newly generated tokens per summary.
MAX_OUTPUT_TOKENS = 60 

# Files
# Input: the sentiment step's result. Output: same rows plus 'Short_Ans';
# it is also read back as a cache of summaries that already exist.
INPUT_FILE = 'csv_checkpoint/sentiment_final.csv'
OUTPUT_FILE = 'csv_checkpoint/news_summary.csv'

# ==========================================
# üõ†Ô∏è UTILITIES: GPU MANAGER
# ==========================================
def clear_resources():
    """Run the garbage collector, then release PyTorch's cached GPU memory.

    The collector runs first because memory owned by objects that have not
    been collected yet is still in use, and ``empty_cache()`` can only release
    blocks that are already unused. Prints a confirmation line when done.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("üßπ GPU Memory Cleared")

# ==========================================
# 🧠 CORE AI ENGINE (unchanged)
# ==========================================
class NewsSummarizer:
    """One-sentence news summarizer backed by a causal language model.

    The tokenizer and model are loaded once in the constructor;
    ``generate_batch`` then summarizes any number of items in batches.
    """

    def __init__(self, model_name):
        """Load the tokenizer and model for ``model_name`` onto ``cuda:0``."""
        print(f"ü§ñ Loading Model: {model_name}...")
        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        # Reuse EOS as the pad token when the tokenizer defines none, and pad on
        # the left so every prompt in a batch ends at the same position.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token
        tok.padding_side = "left"
        self.tokenizer = tok

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="cuda:0",
            trust_remote_code=True,
        )
        self.model.eval()

    @staticmethod
    def _task_prompt(title, content):
        """Return the instruction text for one item (content cut to 1000 characters)."""
        return (
            "Task: Summarize the financial news into 1 sentence.\n"
            f"News: {title} - {str(content)[:1000]}...\n"
            "Summary:"
        )

    def generate_batch(self, titles, contents, batch_size):
        """Summarize each (title, content) pair.

        Args:
            titles: Iterable of news titles.
            contents: Iterable of news bodies, paired with ``titles`` by position.
            batch_size: Number of prompts per ``generate`` call.

        Returns:
            List of summaries in input order, stripped and with newlines
            replaced by spaces.
        """
        chat_prompts = [
            self.tokenizer.apply_chat_template(
                [{"role": "user", "content": self._task_prompt(title, content)}],
                tokenize=False,
                add_generation_prompt=True,
            )
            for title, content in zip(titles, contents)
        ]
        n_prompts = len(chat_prompts)

        print(f"üöÄ Starting Batch Processing: {n_prompts} items (Batch Size: {batch_size})")

        summaries = []
        for offset in tqdm(range(0, n_prompts, batch_size), desc="Summarizing"):
            encoded = self.tokenizer(
                chat_prompts[offset : offset + batch_size],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=2048,
            ).to(self.model.device)

            with torch.no_grad():
                generated = self.model.generate(
                    **encoded,
                    max_new_tokens=MAX_OUTPUT_TOKENS,
                    temperature=0.1,
                    do_sample=False,
                    pad_token_id=self.tokenizer.pad_token_id,
                )

            # Everything after the (padded) prompt length is newly generated text.
            prompt_len = encoded.input_ids.shape[1]
            texts = self.tokenizer.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)
            summaries.extend(text.strip().replace('\n', ' ') for text in texts)

        return summaries

# ==========================================
# üöÄ MAIN PIPELINE (UPDATED)
# ==========================================
def run_pipeline():
    """Add a one-sentence 'Short_Ans' summary to every row of INPUT_FILE.

    Summaries already stored in OUTPUT_FILE are reused (matched by 'Link');
    only rows without a summary are sent to the model. The merged table is
    written back to OUTPUT_FILE. Returns None in every case.
    """
    # 1. Load the main input data
    print(f"üìÇ Loading Main Data from {INPUT_FILE}...")
    if not os.path.exists(INPUT_FILE):
        print(f"‚ùå Input file {INPUT_FILE} not found. Please run the previous step first.")
        return

    df_main = pd.read_csv(INPUT_FILE)

    # 2. Reuse earlier output as a cache when it exists and has the expected columns
    if not os.path.exists(OUTPUT_FILE):
        print("   ‚ÑπÔ∏è No existing output found. Starting fresh.")
        df_main['Short_Ans'] = None
    else:
        print(f"üîé Found existing output file: {OUTPUT_FILE}")
        df_existing = pd.read_csv(OUTPUT_FILE)

        cache_usable = 'Link' in df_existing.columns and 'Short_Ans' in df_existing.columns
        if not cache_usable:
            print("   ‚ö†Ô∏è Existing file structure incorrect. Will re-process all.")
            df_main['Short_Ans'] = None
        else:
            # {Link: Short_Ans} from the old file; duplicates are dropped so
            # that 'Link' is a unique key.
            cached_summaries = (
                df_existing
                .dropna(subset=['Short_Ans'])
                .drop_duplicates(subset=['Link'])
                .set_index('Link')['Short_Ans']
                .to_dict()
            )

            # Rows whose link is in the cache get their old summary back.
            df_main['Short_Ans'] = df_main['Link'].map(cached_summaries)

            found_count = df_main['Short_Ans'].notna().sum()
            print(f"   ‚úÖ Recovered {found_count} existing summaries.")

    # 3. Rows still to do: summary is NaN or an empty string
    mask_todo = df_main['Short_Ans'].isna() | df_main['Short_Ans'].eq("")
    df_todo = df_main.loc[mask_todo]

    total_rows, todo_rows = len(df_main), len(df_todo)

    print(f"\nüìä Status Report:")
    print(f"   - Total News: {total_rows}")
    print(f"   - Already Done: {total_rows - todo_rows}")
    print(f"   - To Do (GPU): {todo_rows}")

    # 4. Nothing left: re-save so both files stay in sync, then stop
    if todo_rows == 0:
        print("\n‚ú® All news already summarized! Nothing to do.")
        df_main.to_csv(OUTPUT_FILE, index=False)
        return

    # The model is loaded only when there is work to do.
    summarizer = NewsSummarizer(MODEL_NAME)

    try:
        # 5. Summarize the pending rows only
        print("\nüöÄ Processing new items...")
        pending_titles = df_todo['Title'].tolist()
        pending_contents = df_todo['Content'].fillna('').tolist()
        new_summaries = summarizer.generate_batch(pending_titles, pending_contents, BATCH_SIZE)

        # 6. Write the results back into the same rows (selected by the mask)
        df_main.loc[mask_todo, 'Short_Ans'] = new_summaries

        # 7. Save
        df_main.to_csv(OUTPUT_FILE, index=False)
        print(f"\n‚úÖ Pipeline Complete! Saved updated data to {OUTPUT_FILE}")

        # Show a sample of the NEW summaries
        print("\nSample of NEW summaries:")
        print(df_main.loc[mask_todo, ['Title', 'Short_Ans']].head())

    except Exception as e:
        print(f"‚ùå Error during processing: {e}")

    finally:
        # 8. Cleanup: the summarizer is always bound once this point is reached
        del summarizer
        clear_resources()

# Run the summarization step when this code executes as the main module
# (which is the case inside a notebook kernel).
if __name__ == "__main__":
    run_pipeline()

üìÇ Loading Main Data from csv_checkpoint/sentiment_final.csv...
üîé Found existing output file: csv_checkpoint/news_summary.csv
   ‚úÖ Recovered 10812 existing summaries.

üìä Status Report:
   - Total News: 10812
   - Already Done: 10812
   - To Do (GPU): 0

‚ú® All news already summarized! Nothing to do.


In [12]:
# Preview the saved summary file. Note that `df` is rebound to just the
# first five rows here, not to the full table.
df = pd.read_csv("csv_checkpoint/news_summary.csv").head()
df

Unnamed: 0,Page,Date,Source,Title,Link,Content,Sector,Confidence,Sector_Dict,Sector_Count,AI_Sector,Combined_Sector,Score_Qwen2.5-14B-Instruct,Score_Meta-Llama-3.1-8B-Instruct,Score_gemma-3-12b-it,Short_Ans
0,1,2026-01-03 09:56:10,Investing.com,BofA unveils its top 10 U.S. ideas for Q1 2026,https://www.investing.com/news/stock-market-ne...,Investing.com -- Bank of America has released ...,Financials,0.044773,"{'Financials': 0.04477, 'Healthcare': 0.02544,...",3,,"Financials, Healthcare, Energy",0.6,0.65,0.6,Bank of America has unveiled its top 10 U.S. s...
1,1,2026-01-03 09:55:45,Investing.com,Canaccord‚Äôs says 2026 is likely to be ‚Äôa bount...,https://www.investing.com/news/stock-market-ne...,Investing.com --¬†Canaccord Genuity analyst Geo...,Consumer Cyclical,0.049161,"{'Consumer Cyclical': 0.04916, 'Energy': 0.02483}",2,,"Consumer Cyclical, Energy",0.85,0.85,0.85,Canaccord Genuity analyst George Gianarikas pr...
2,1,2026-01-03 09:05:02,Investing.com,Is Reddit the new homepage for the Open Web?,https://www.investing.com/news/stock-market-ne...,Investing.com -- Reddit is increasingly positi...,Technology,0.032455,{'Technology': 0.03246},1,,Technology,0.85,0.83,0.85,Needham analyst Laura Martin suggests that Red...
3,1,2026-01-03 03:24:29,Reuters,"Trump blocks chips deal, cites security, China...",https://www.investing.com/news/stock-market-ne...,"WASHINGTON, Jan 2 (Reuters) - President Donald...",Industrials,0.024836,{'Industrials': 0.02484},1,,Industrials,-0.75,-0.7,-0.7,President Trump has blocked a $3 million chip ...
4,1,2026-01-03 01:12:24,Reuters,"Top hedge funds led by D.E.Shaw, Bridgewater a...",https://www.investing.com/news/stock-market-ne...,(Corrects Point72‚Äôs return figures in second b...,Financials,0.146323,"{'Financials': 0.14632, 'Utilities': 0.03504}",2,,"Financials, Utilities",0.85,0.85,0.9,"Top hedge funds such as D.E. Shaw, Bridgewater..."


## AI Analysis

In [None]:
import pandas as pd
import torch
import json
import re
import gc
from datetime import datetime, timedelta
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# ==========================================
# ‚öôÔ∏è CONFIGURATION & MODEL WEIGHTS
# ==========================================
# Each analysis date looks at the news from this many days before it up to the date itself.
LOOKBACK_DAYS = 7 
ANALYSIS_RANGE = 3  # number of analysis dates: the latest date in the data plus the 2 days before it

# Models to run and their trust weights.
# NOTE(review): the model loop in this cell reads only "name" and "short_name";
# "weight" is presumably consumed by the aggregation step -- confirm.
MODEL_CONFIGS = [
    #{"name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "short_name": "Deepseek", "weight": 0.4},
    {"name": "Qwen/Qwen2.5-14B-Instruct", "short_name": "Qwen", "weight": 0.34},
    {"name": "meta-llama/Meta-Llama-3.1-8B-Instruct", "short_name": "Llama", "weight": 0.33},
    {"name": "google/gemma-3-12b-it", "short_name": "Gemma", "weight": 0.33} 
]

# ==========================================
# 1. üì• LOAD & PREPARE DATA
# ==========================================
# Load the summarized news and build `expanded_df` (one row per news item and
# sector) plus `target_dates` (the dates to analyze, oldest first). On any
# error both are left empty so the following steps simply have nothing to do.
print("üìÇ Loading Data...")
try:
    df = pd.read_csv('csv_checkpoint/news_summary.csv')

    # Without a summary column, fall back to the full article text.
    if 'Short_Ans' not in df.columns: df['Short_Ans'] = df['Content']
    if 'Date' not in df.columns:
        # No dates in the file: synthetic timestamps are assigned, cycling over
        # the last 12 days, so the time-based steps below can still run.
        df['Date'] = [datetime.now() - timedelta(days=x%12) for x in range(len(df))]
    else:
        df['Date'] = pd.to_datetime(df['Date'])

    # Explode sectors: "A, B" becomes one row for A and one for B.
    # Missing values are replaced by '' first; astype(str) alone would turn
    # them into the literal sector name "nan". The empty name is skipped later
    # by the model loop's length check.
    df['Sector_List'] = df['Combined_Sector'].fillna('').astype(str).str.split(',')
    expanded_df = df.explode('Sector_List')
    expanded_df['Target_Sector'] = expanded_df['Sector_List'].str.strip()

    # Build the list of analysis dates, counting back from the newest timestamp.
    # NOTE(review): the newest timestamp keeps its time of day, so every target
    # date is cut off at that same clock time -- confirm this is intended.
    latest_db_date = df['Date'].max()
    target_dates = [latest_db_date - timedelta(days=i) for i in range(ANALYSIS_RANGE)]
    target_dates.reverse()  # order: oldest -> newest

    print(f"‚úÖ Data Ready. Analyzing History: {[d.strftime('%Y-%m-%d') for d in target_dates]}")

except Exception as e:
    print(f"‚ùå Error Loading Data: {e}")
    target_dates = []
    expanded_df = pd.DataFrame()

# ==========================================
# 2. üß† HELPER FUNCTIONS
# ==========================================
def get_sector_context(sector_name, full_df, max_news=None):
    """Prepare the news context and summary figures for one sector.

    Args:
        sector_name: Value to match in the 'Target_Sector' column.
        full_df: DataFrame with 'Target_Sector', 'Date' (datetime),
            'Time_Weight' and 'Weighted_Score' columns; 'Title' and
            'Short_Ans' are used when present.
        max_news: Optional cap on how many of the newest items are written
            into the context string. ``None`` (the default) includes every
            item, which can make the prompt very long for busy sectors.

    Returns:
        Tuple ``(news_context, weighted_avg_score, news_count)``:
        one text line per included item (newest first), the time-weighted
        average score (0 when the total weight is not positive), and the
        number of matching items regardless of ``max_news``.
    """
    sector_df = full_df[full_df['Target_Sector'] == sector_name].sort_values(by='Date', ascending=False)

    # Count every item found in this window, not only those shown in the context.
    news_count = len(sector_df)

    # Simple weighted score calculation
    total_weight = sector_df['Time_Weight'].sum()
    weighted_avg_score = (sector_df['Weighted_Score'].sum() / total_weight) if total_weight > 0 else 0

    context_df = sector_df if max_news is None else sector_df.head(max_news)
    # Build the lines in a list and join once instead of growing a string in a loop.
    context_lines = [
        f"- {row['Date'].strftime('%Y-%m-%d')}: {row.get('Title', 'N/A')} -> {str(row.get('Short_Ans', ''))[:150]}...\n"
        for _, row in context_df.iterrows()
    ]

    return "".join(context_lines), weighted_avg_score, news_count

def parse_llm_response(response_text):
    """Extract (score, analysis, outlook) from a model answer containing JSON.

    The span from the first '{' to the last '}' is parsed as JSON.

    Args:
        response_text: Raw text generated by the model.

    Returns:
        Tuple ``(score, analysis, outlook)``. ``score`` is always a float:
        5.0 is used when the key is missing or its value is not numeric.
        When no JSON object can be read at all, the result is
        ``(5.0, "Error parsing output", "Neutral")``.
    """
    fallback = (5.0, "Error parsing output", "Neutral")
    try:
        match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if not match:
            return fallback
        data = json.loads(match.group())
        analysis = data.get('analysis', 'No analysis')
        outlook = data.get('outlook', 'Neutral')
        raw_score = data.get('score', 5.0)
    except (TypeError, ValueError, AttributeError):
        # Non-string input, invalid JSON, or JSON that is not an object.
        return fallback

    # Keep the analysis and outlook even when only the score is unusable
    # (for example null or "7/10").
    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        score = 5.0
    return score, analysis, outlook

# Results are stored as: date string -> sector -> model short name -> result dict.
# Each sector dict additionally holds a 'news_volume' entry.
history_results = {}

# ==========================================
# 3. üîÑ MODEL LOOP
# ==========================================

for config in MODEL_CONFIGS:
    model_name = config['name']
    short_name = config['short_name']
    
    print(f"\n" + "="*50)
    print(f"ü§ñ Loading Model: {model_name} ({short_name})...")
    print("="*50)
    
    try:
        # Load Model
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name, 
            torch_dtype=torch.bfloat16,
            device_map="cuda:0",
            trust_remote_code=True,
            # token="YOUR_HUGGINGFACE_TOKEN" # ‡πÉ‡∏™‡πà token ‡∏ñ‡πâ‡∏≤‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô
        )
        
        # Loop Dates
        for target_date in tqdm(target_dates, desc=f"üìÖ Processing Days ({short_name})"):
            target_date_str = target_date.strftime('%Y-%m-%d')
            
            # Filter Data
            start_window = target_date - timedelta(days=LOOKBACK_DAYS)
            daily_df = expanded_df[
                (expanded_df['Date'] <= target_date) & 
                (expanded_df['Date'] >= start_window)
            ].copy()
            
            if daily_df.empty: continue

            # Time Weight Calculation
            daily_df['Days_Ago'] = (target_date - daily_df['Date']).dt.days
            daily_df['Time_Weight'] = daily_df['Days_Ago'].apply(lambda d: max(0.1, 1 - (d / (LOOKBACK_DAYS + 1))))
            
            if 'Consensus_Score' not in daily_df.columns:
                 score_cols = [c for c in daily_df.columns if 'Score_' in c]
                 if score_cols: daily_df['Consensus_Score'] = daily_df[score_cols].mean(axis=1)
                 else: daily_df['Consensus_Score'] = 0
            
            daily_df['Weighted_Score'] = daily_df['Consensus_Score'] * daily_df['Time_Weight']
            
            unique_sectors = daily_df['Target_Sector'].dropna().unique()

            # Loop Sectors
            for sector in unique_sectors:
                if len(str(sector)) < 2: continue
                
                # ‚úÖ ‡∏£‡∏±‡∏ö‡∏Ñ‡πà‡∏≤ news_count ‡∏°‡∏≤‡∏î‡πâ‡∏ß‡∏¢
                news_context, q_score, news_count = get_sector_context(sector, daily_df)
                
                # Prompt
                prompt = f"""
Role: Senior Financial Analyst.
Task: Analyze the market sentiment for '{sector}' with a focus on REAL-TIME MOMENTUM.

Quantitative Signal:
- Time-Weighted Sentiment Score: {q_score:.2f} (Scale: -1.0 to +1.0)
 (This score prioritizes recent news over older news)

News Feed (Sorted by Recency - Newest First):
{news_context}

Instructions:
1. **Recency Bias:** Give significantly more weight to news from the last 2-3 days (Top of the list). Old news (7-10 days ago) should be treated as "Context" but not drivers.
2. **Outlook:** Determine 'Bullish', 'Bearish', or 'Neutral'.
3. **Score:** Score: Assign a precise sentiment score (0.0 - 10.0), e.g., 7.5 or 4.2.
4. **Analysis:** Write a short executive summary (Max 3 sentences). Explicitly mention if the sentiment has shifted recently (e.g., "Started week strong but ended weak").

Output strictly in JSON format:
{{
  "outlook": "Bearish" or "Bullish" or "Neutral",
  "score": <float 0-10>,
  "analysis": "<Max 3 sentences>"
}}
"""
                # Generate
                try:
                    messages = [{"role": "user", "content": prompt}]
                    try:
                        text_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                    except:
                        text_input = f"User: {prompt}\n\nAssistant:"
                    
                    inputs = tokenizer([text_input], return_tensors="pt").to(model.device)
                    
                    with torch.no_grad():
                        outputs = model.generate(**inputs, max_new_tokens=300, temperature=0.35)
                        
                    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
                    score, analysis, outlook = parse_llm_response(response)
                    
                    # Store Results
                    if target_date_str not in history_results: history_results[target_date_str] = {}
                    if sector not in history_results[target_date_str]: history_results[target_date_str][sector] = {}
                    
                    # Record this model's result
                    history_results[target_date_str][sector][short_name] = {
                        "score": float(score),
                        "analysis": analysis,
                        "outlook": outlook
                    }
                    
                    # Record the news count (overwriting is fine: the count is the same for every model on the same day)
                    history_results[target_date_str][sector]['news_volume'] = news_count
                    
                except Exception as e:
                    pass

        # Cleanup
        del model
        del tokenizer
        torch.cuda.empty_cache()
        gc.collect()
        print(f"üßπ Unloaded {short_name} to free VRAM.")

    except Exception as e:
        print(f"‚ö†Ô∏è Failed to run {model_name}: {e}")

# ==========================================
# 4. AGGREGATION & EXPORT
# ==========================================
# Flattens `history_results` (report date -> sector -> per-model results plus a
# 'news_volume' entry) into one row per (report date, sector), blends the
# per-model scores into a single weighted score using the weights in
# MODEL_CONFIGS, prints a preview, and writes the full table to CSV.
import os  # local import so this cell does not depend on another cell's imports

HISTORY_CSV_PATH = 'csv_checkpoint/sector_daily_history_7days.csv'
NEUTRAL_SCORE = 5.0        # placeholder / fallback value on the 0-10 score scale
BULLISH_THRESHOLD = 6.5    # final score at or above this is labelled 'Bullish'
BEARISH_THRESHOLD = 3.5    # final score at or below this is labelled 'Bearish'
KEY_COLUMNS = ['Report_Date', 'Sector', 'News_Volume']

print("\nüßÆ Aggregating Daily History...")

final_rows = []

for date_str, sectors_data in history_results.items():
    for sector, models_data in sectors_data.items():
        row_data = {
            'Report_Date': date_str,
            'Sector': sector,
            # 'news_volume' is stored next to the per-model entries; 0 if absent.
            'News_Volume': models_data.get('news_volume', 0),
        }

        total_weighted_score = 0
        total_model_weight = 0

        for config in MODEL_CONFIGS:
            s_name = config['short_name']
            m_weight = config['weight']

            res = models_data.get(s_name)
            if res is None:
                # This model produced nothing for this date/sector. Keep the
                # placeholder columns so the CSV schema stays stable, but do
                # NOT let the placeholder vote: counting it at full weight
                # would drag the ensemble score toward neutral.
                row_data[f'Score_{s_name}'] = NEUTRAL_SCORE
                row_data[f'Reason_{s_name}'] = "N/A"
                continue

            row_data[f'Score_{s_name}'] = res['score']
            row_data[f'Reason_{s_name}'] = res['analysis']

            total_weighted_score += res['score'] * m_weight
            total_model_weight += m_weight

        # Weighted mean over the models that actually reported; fall back to
        # neutral when nothing contributed (no results, or all weights zero).
        if total_model_weight > 0:
            final_score = total_weighted_score / total_model_weight
        else:
            final_score = NEUTRAL_SCORE
        row_data['Final_Daily_Score'] = round(final_score, 2)

        # The label is derived from the unrounded score.
        if final_score >= BULLISH_THRESHOLD:
            row_data['Final_Outlook'] = 'Bullish'
        elif final_score <= BEARISH_THRESHOLD:
            row_data['Final_Outlook'] = 'Bearish'
        else:
            row_data['Final_Outlook'] = 'Neutral'

        final_rows.append(row_data)

df_history = pd.DataFrame(final_rows)

if not df_history.empty:
    # Put the identifying columns first, then everything else in insertion order.
    cols = KEY_COLUMNS + [c for c in df_history.columns if c not in KEY_COLUMNS]
    df_history = df_history[cols]

    # Oldest date first; within a date, highest final score first.
    df_history = df_history.sort_values(by=['Report_Date', 'Final_Daily_Score'], ascending=[True, False])

    print("\n" + "="*80)
    print(" üèÜ FINAL 7-DAY HISTORY REPORT")
    print("="*80)
    print(df_history[['Report_Date', 'Sector', 'News_Volume', 'Final_Daily_Score', 'Final_Outlook']].head(10))

    # Make sure the checkpoint directory exists so the results of the run are
    # not lost to a missing-directory error at the very last step.
    output_dir = os.path.dirname(HISTORY_CSV_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_history.to_csv(HISTORY_CSV_PATH, index=False)
    print(f"\n‚úÖ Saved history to '{HISTORY_CSV_PATH}'")
else:
    print("‚ùå No history generated.")

üìÇ Loading Data...
‚úÖ Data Ready. Analyzing History: ['2026-01-08', '2026-01-09', '2026-01-10']

ü§ñ Loading Model: Qwen/Qwen2.5-14B-Instruct (Qwen)...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 579/579 [00:47<00:00, 12.27it/s, Materializing param=model.norm.weight]                               
üìÖ Processing Days (Qwen):   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# Reload the exported sector history from its CSV checkpoint and display it.
import pandas as pd

df_history = pd.read_csv(
    filepath_or_buffer='csv_checkpoint/sector_daily_history_7days.csv',
)
df_history

Unnamed: 0,Report_Date,Sector,News_Volume,Score_Qwen,Reason_Qwen,Score_Llama,Reason_Llama,Score_Gemma,Reason_Gemma,Final_Daily_Score,Final_Outlook
0,2026-01-08,Technology,261,8.3,The technology sector has seen a strong moment...,8.2,"The recent surge in AI-related news, particula...",7.8,The Technology sector currently exhibits a str...,8.1,Bullish
1,2026-01-08,Defense,1,5.0,The most recent news indicates a significant m...,7.5,The recent news of Lockheed Martin securing a ...,7.8,The market sentiment for Defense is currently ...,6.75,Bullish
2,2026-01-08,Healthcare,170,5.3,"The healthcare sector shows mixed signals, wit...",3.8,The recent news has been dominated by negative...,7.2,Market sentiment for Healthcare is currently b...,5.43,Neutral
3,2026-01-08,Financials,341,5.3,The market sentiment for financials remains la...,4.8,The recent news on US oil companies seeking gu...,5.8,Current market sentiment for Financials appear...,5.3,Neutral
4,2026-01-08,Other,117,2.8,The market sentiment for 'Other' has shifted t...,6.8,The market sentiment for 'Other' is bearish du...,4.2,Current market sentiment for 'Other' is decide...,4.58,Neutral
5,2026-01-08,Consumer Cyclical,165,5.2,The recent news feed indicates a mixed sentime...,3.8,The recent news suggests a bearish trend in th...,4.2,Recent news paints a mixed but ultimately bear...,4.41,Neutral
6,2026-01-08,Industrials,161,3.5,The Industrials sector has shown a recent shif...,4.8,The Industrials sector is experiencing a beari...,4.2,Recent news indicates a slightly bearish senti...,4.16,Neutral
7,2026-01-08,Basic Materials,115,3.5,The market sentiment for Basic Materials has s...,4.8,The market sentiment for Basic Materials is be...,4.2,Recent market sentiment for Basic Materials le...,4.16,Neutral
8,2026-01-08,Consumer Defensive,69,3.8,The Consumer Defensive sector experienced a sh...,4.2,"The recent news on Conagra, General Mills, and...",4.2,Recent market sentiment for Consumer Defensive...,4.06,Neutral
9,2026-01-08,Communication Services,83,3.5,The recent news indicates a shift towards a be...,4.2,The Communication Services sector has taken a ...,4.2,The Communication Services sector currently ex...,3.96,Neutral
