In [1]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import math
import json
import sys
import os
sys.path.append(os.path.abspath(''))
from src.LLM.llm import get_sentiment, get_batch_sentiment

In [2]:
# Locations of the accumulated sentiment results and of the pre-processed news feed.
sentiment_file_path = '../../data/sentiment/news_sentiment_results.csv'
processed_news_file_path = '../../data/news/nasdaq_news_data_processed.csv'

# Resume point: the highest UID already present in the results file.
# It stays at 0 when the file does not exist yet or contains no rows, so a
# first run starts cleanly instead of failing on the read.
last_processed_uid = 0
if os.path.exists(sentiment_file_path):
    existing_sentiment_df = pd.read_csv(sentiment_file_path)
    max_existing_uid = existing_sentiment_df['UID'].max()
    # max() of an empty column is NaN; keep the default of 0 in that case.
    if pd.notna(max_existing_uid):
        last_processed_uid = int(max_existing_uid)
    print(f"Initialized last_processed_uid from existing file: {last_processed_uid}")
else:
    existing_sentiment_df = pd.DataFrame(columns=['UID', 'Date', 'Ticker', 'Sentiment', 'Reason'])
    print(f"No existing sentiment file at {sentiment_file_path}; starting from UID {last_processed_uid}.")


print(f"Loading processed news data from {processed_news_file_path}...")
all_processed_news_df = pd.read_csv(processed_news_file_path, parse_dates=['Date'])
print(f"Loaded {len(all_processed_news_df)} news items.")


Initialized last_processed_uid from existing file: 10000
Loading processed news data from data/news/nasdaq_news_data_processed.csv...
Loaded 209125 news items.


In [3]:
def parse_news_for_llm(df_chunk: pd.DataFrame):
    """Convert a chunk of news rows into the list-of-dicts payload sent to the LLM.

    Only the 'Ticker', 'Title' and 'Summary' columns are kept; every other
    column of ``df_chunk`` is dropped. Row order is preserved.

    Args:
        df_chunk: News rows; must contain 'Ticker', 'Title' and 'Summary'.

    Returns:
        One ``{'Ticker': ..., 'Title': ..., 'Summary': ...}`` dict per row.

    Raises:
        KeyError: If any of the three columns is missing.
    """
    llm_columns = ['Ticker', 'Title', 'Summary']
    return df_chunk.loc[:, llm_columns].to_dict(orient='records')


def analyze_sentiment_wrapper(df_chunk: pd.DataFrame, mode: str, model_name="grok-3-beta") -> pd.DataFrame:
    """Run LLM sentiment analysis on a chunk of news and attach the results to its rows.

    Args:
        df_chunk: News rows containing at least 'UID', 'Date', 'Ticker',
            'Title' and 'Summary'.
        mode: 'single' sends one request per news item via ``get_sentiment``;
            'batch' sends the whole chunk in one ``get_batch_sentiment`` call.
        model_name: Model identifier forwarded unchanged to the LLM helpers.

    Returns:
        A frame with columns ['UID', 'Date', 'Ticker', 'Sentiment', 'Reason'],
        one row per input row, on a fresh 0..n-1 index. When the number of
        sentiments does not match the number of input rows (or no sentiment
        was produced), 'Sentiment' and 'Reason' are filled with ``pd.NA``.

    Raises:
        ValueError: If ``mode`` is neither 'single' nor 'batch'. Previously an
            unknown mode silently produced an all-NA frame, which callers
            would then persist as if the rows had been analysed.
    """
    if mode not in ('single', 'batch'):
        raise ValueError(f"Unknown mode {mode!r}; expected 'single' or 'batch'.")

    # Identifying columns, re-indexed 0..n-1 so they line up positionally with the results.
    original_data = df_chunk[['UID', 'Date', 'Ticker']].reset_index(drop=True)

    parsed_news = parse_news_for_llm(df_chunk)

    # Each result object is expected to expose `.sentiment` and `.reason`.
    results = []

    if mode == 'single':
        print(f"Processing {len(parsed_news)} items in single mode using {model_name}...")
        with tqdm(total=len(parsed_news), desc="Single Processing") as pbar:
            for news_item in parsed_news:
                results.append(get_sentiment(news_item, model_name))
                pbar.update(1)
    else:
        print(f"Processing {len(parsed_news)} items in batch mode using {model_name}...")
        batch_response = get_batch_sentiment(parsed_news, model_name)
        sentiment_results = batch_response.sentiments

        if len(sentiment_results) != len(parsed_news):
            print(f"Warning: Batch result count ({len(sentiment_results)}) doesn't match input count ({len(parsed_news)}). Alignment might be incorrect.")
        # Surplus results are discarded; a short response is handled by the NA fallback below.
        results.extend(sentiment_results[:len(parsed_news)])

    output_data = [{'Sentiment': r.sentiment, 'Reason': r.reason} for r in results]
    sentiment_df = pd.DataFrame(output_data)

    if sentiment_df.empty or len(sentiment_df) != len(original_data):
        print(f"Warning: Sentiment analysis resulted in {len(sentiment_df)} items, expected {len(original_data)}. Returning original data with NA sentiment.")
        final_df = original_data.copy()
        final_df['Sentiment'] = pd.NA
        final_df['Reason'] = pd.NA
    else:
        # Both frames share a 0..n-1 index, so the join is a positional alignment.
        final_df = original_data.join(sentiment_df)

    return final_df[['UID', 'Date', 'Ticker', 'Sentiment', 'Reason']]


In [4]:
def _save_current_progress(existing_df, new_results_list, output_path, overwrite_flag):
    if not new_results_list:
        print("No new results generated yet, nothing to save.")
        return

    print("Attempting to save intermediate progress...")
    try:
        temp_new_results_df = pd.concat(new_results_list, ignore_index=True)
        temp_combined_df = pd.concat([existing_df, temp_new_results_df], ignore_index=True)

        keep_strategy = 'last' if overwrite_flag else 'first'
        initial_count = len(temp_combined_df)
        temp_combined_df.drop_duplicates(subset='UID', keep=keep_strategy, inplace=True)
        if len(temp_combined_df) < initial_count:
                print(f"(Intermediate save) Removed {initial_count - len(temp_combined_df)} duplicate UIDs (keeping '{keep_strategy}').")


        temp_combined_df.sort_values(by='UID', inplace=True)
        if 'UID' in temp_combined_df.columns:
            temp_combined_df['UID'] = temp_combined_df['UID'].astype(int)

        temp_combined_df.to_csv(output_path, index=False, date_format='%Y-%m-%d')
        print(f"Intermediate progress saved successfully to {output_path}")
    except Exception as e:
        print(f"Error saving intermediate progress: {e}")


In [5]:
def process_sentiment_by_uid(
    processed_news_df: pd.DataFrame,
    batch_size: int,
    start_uid: int,
    end_uid: int,
    overwrite: bool = False,
    model_name: str = "grok-3-beta",
    sentiment_output_path: str = '../../data/sentiment/news_sentiment_results.csv'
) -> None:
    """Run batch sentiment analysis for news whose UID lies in [start_uid, end_uid].

    The rows are sent to ``analyze_sentiment_wrapper`` in chunks of
    ``batch_size``. A chunk that raises is retried up to two more times with
    a 60 second pause (progress so far is checkpointed before each pause);
    after that the chunk is skipped. New results are merged with whatever is
    already stored at ``sentiment_output_path``, deduplicated on UID, sorted
    and written back. On a successful save the module-level
    ``last_processed_uid`` is raised to the highest UID produced in this run.

    Args:
        processed_news_df: Full news table; must contain a 'UID' column plus
            the columns ``analyze_sentiment_wrapper`` needs.
        batch_size: Number of news rows per LLM request; must be positive.
        start_uid: First UID to process (inclusive).
        end_uid: Last UID to process (inclusive).
        overwrite: When true, every UID in the range is re-processed and the
            stored results for that range are replaced. When false, UIDs that
            already have a stored result are skipped.
        model_name: Model identifier forwarded to the LLM wrapper.
        sentiment_output_path: CSV holding the accumulated results. It is
            created by this run if it does not exist yet.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    global last_processed_uid

    # A zero batch size would divide by zero below; a negative one would silently do nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("\n--- Starting Sentiment Processing ---")
    print(f"Range: UID {start_uid} to {end_uid}")
    print(f"Batch Size: {batch_size}")
    print(f"Overwrite: {overwrite}")
    print(f"Model: {model_name}")
    print(f"Output Path: {sentiment_output_path}")

    # A first run has no results file yet; start from an empty table instead of failing.
    if os.path.exists(sentiment_output_path):
        existing_sentiment_df = pd.read_csv(sentiment_output_path, parse_dates=['Date'])
    else:
        print(f"No existing sentiment file at {sentiment_output_path}; starting from an empty result set.")
        existing_sentiment_df = pd.DataFrame(columns=['UID', 'Date', 'Ticker', 'Sentiment', 'Reason'])

    # --- 1. Filter News Data Based on UID Range and Overwrite Flag ---
    target_news_df = processed_news_df[
        (processed_news_df['UID'] >= start_uid) & (processed_news_df['UID'] <= end_uid)
    ].copy()

    if target_news_df.empty:
        print(f"No news data found in the specified UID range ({start_uid}-{end_uid}). Nothing to process.")
        return

    if overwrite:
        print("Overwrite mode enabled. Processing all UIDs in the specified range.")
        uids_to_process_df = target_news_df
        # Drop the stored results inside the range; they are about to be replaced.
        if not existing_sentiment_df.empty:
            existing_sentiment_df = existing_sentiment_df[
                (existing_sentiment_df['UID'] < start_uid) | (existing_sentiment_df['UID'] > end_uid)
            ]
            print(f"Removed existing sentiments within UID range {start_uid}-{end_uid} due to overwrite.")
    else:
        existing_uids = set(existing_sentiment_df['UID'].unique())
        uids_to_process_df = target_news_df[~target_news_df['UID'].isin(existing_uids)]
        print(f"Found {len(target_news_df) - len(uids_to_process_df)} UIDs in range already processed. Skipping them.")

    if uids_to_process_df.empty:
        print("No new UIDs to process in the specified range")
        return

    total_to_process = len(uids_to_process_df)
    print(f"Total UIDs to process in this run: {total_to_process}")

    # --- 2. Process in Batches ---
    max_retries = 2
    wait_time = 60  # seconds to pause before retrying a failed batch
    all_new_results = []
    skipped_batches = []
    num_batches = math.ceil(total_to_process / batch_size)

    for i in tqdm(range(num_batches), desc="Processing Batches"):
        start_index = i * batch_size
        end_index = min((i + 1) * batch_size, total_to_process)
        current_chunk_df = uids_to_process_df.iloc[start_index:end_index]

        print(f"\nProcessing batch {i+1}/{num_batches} (UIDs {current_chunk_df['UID'].min()} - {current_chunk_df['UID'].max()})...")

        # One initial attempt plus `max_retries` retries.
        for attempt in range(1, max_retries + 2):
            try:
                batch_results_df = analyze_sentiment_wrapper(current_chunk_df, mode='batch', model_name=model_name)
            except Exception as e:
                print(f"\nexception encountered on batch {i+1} (Attempt {attempt}/{max_retries + 1}). Error: {e}")
                if attempt > max_retries:
                    print(f"Max retries ({max_retries}) exceeded for batch {i+1}. Skipping this batch.")
                    skipped_batches.append(i + 1)
                    break
                # Checkpoint what we have before sleeping, in case the run is interrupted.
                _save_current_progress(existing_sentiment_df, all_new_results, sentiment_output_path, overwrite)
                print(f"Waiting for {wait_time} seconds before retrying...")
                time.sleep(wait_time)
                print(f"Retrying batch {i+1}...")
            else:
                all_new_results.append(batch_results_df)
                break

    if skipped_batches:
        print(f"Skipped {len(skipped_batches)} batch(es) after repeated failures: {skipped_batches}")

    # --- 3. Combine, Deduplicate, and Save ---
    if not all_new_results:
        print("No new sentiment results were generated.")
        return

    print("Combining new results with existing data...")
    new_results_df = pd.concat(all_new_results, ignore_index=True)

    # Leave an empty existing table out of the concat so it cannot affect the resulting dtypes.
    if existing_sentiment_df.empty:
        frames_to_combine = [new_results_df]
    else:
        frames_to_combine = [existing_sentiment_df, new_results_df]
    final_sentiment_df = pd.concat(frames_to_combine, ignore_index=True)

    # Same duplicate policy as the intermediate checkpoint: new rows win only when overwriting.
    keep_strategy = 'last' if overwrite else 'first'
    rows_before_dedup = len(final_sentiment_df)
    final_sentiment_df = final_sentiment_df.drop_duplicates(subset='UID', keep=keep_strategy)
    if len(final_sentiment_df) < rows_before_dedup:
        print(f"Removed {rows_before_dedup - len(final_sentiment_df)} duplicate UIDs (keeping '{keep_strategy}').")

    final_sentiment_df = final_sentiment_df.sort_values(by='UID')

    # --- 4. Save Results ---
    try:
        print(f"Saving {len(final_sentiment_df)} sentiment results to {sentiment_output_path}...")
        final_sentiment_df.to_csv(sentiment_output_path, index=False, date_format='%Y-%m-%d')
    except Exception as e:
        print(f"Error saving results to {sentiment_output_path}: {e}")
    else:
        print("Save successful.")
        # Only advance the resume point once the results are safely on disk.
        max_processed_in_run = new_results_df['UID'].max()
        if max_processed_in_run > last_processed_uid:
            last_processed_uid = max_processed_in_run
            print(f"Updated global last_processed_uid to: {last_processed_uid}")
        else:
            print(f"Global last_processed_uid remains: {last_processed_uid}")

    print("--- Sentiment Processing Finished ---")


In [6]:
# Score the news items with UIDs 10001-20000 in batches of 30, skipping any
# UID that already has a stored result. The keyword must be
# `sentiment_output_path` (the function has no `output_path` parameter), and
# the path is taken from the shared `sentiment_file_path` setting.
process_sentiment_by_uid(
    processed_news_df=all_processed_news_df,
    batch_size=30,
    start_uid=10001,
    end_uid=20000,
    overwrite=False,
    model_name="grok-3-fast-beta",
    sentiment_output_path=sentiment_file_path,
)
print(f"\nCurrent last processed UID after run: {last_processed_uid}")


--- Starting Sentiment Processing ---
Range: UID 10001 to 20000
Batch Size: 30
Overwrite: False
Model: grok-3-fast-beta
Output Path: data/sentiment/news_sentiment_results.csv
Found 0 UIDs in range already processed. Skipping them.
Total UIDs to process in this run: 10000


Processing Batches:   0%|          | 0/334 [00:00<?, ?it/s]


Processing batch 1/334 (UIDs 10001 - 10030)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 2/334 (UIDs 10031 - 10060)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 3/334 (UIDs 10061 - 10090)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 4/334 (UIDs 10091 - 10120)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 5/334 (UIDs 10121 - 10150)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 6/334 (UIDs 10151 - 10180)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 7/334 (UIDs 10181 - 10210)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 8/334 (UIDs 10211 - 10240)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 9/334 (UIDs 10241 - 10270)...
Processing 30 items in batch mode using grok-3-fast-beta...

Processing batch 10/334 (UI

In [9]:
# Reload the saved sentiment results from disk and display the last 10 rows.
sentiment_df = pd.read_csv(sentiment_file_path, parse_dates=['Date'])
sentiment_df.tail(n=10)

Unnamed: 0,UID,Date,Ticker,Sentiment,Reason
19990,19991,2016-11-21,BIDU,Slightly Bullish,"The summary mentions Baidu, Inc. is up about 1..."
19991,19992,2016-11-21,TSCO,Bullish,Tractor Supply Company raised its share buybac...
19992,19993,2016-11-22,TXN,Bullish,Texas Instruments Inc. has a Zacks Rank #2 (Bu...
19993,19994,2016-11-22,ROST,Neutral,"The news item mentions Ross Stores, Inc. in pa..."
19994,19995,2016-11-22,COST,Neutral,The summary discusses Costco's Black Friday de...
19995,19996,2016-11-22,ADI,Neutral,The summary focuses on historical earnings rea...
19996,19997,2016-11-22,ADI,Slightly Bullish,"Analog Devices, Inc. reported earnings that co..."
19997,19998,2016-11-22,ADI,Bullish,"Analog Devices, Inc. is trading up 6.0% on the..."
19998,19999,2016-11-22,ADI,Strongly Bullish,"Analog Devices, Inc. reported Q4 earnings of $..."
19999,20000,2016-11-22,ADI,Strongly Bullish,"Analog Devices, Inc. reported better-than-expe..."
