# Constants

## LLM API

In [226]:
# Retry policy for Gemini API calls.
MAX_API_RETRIES = 3
API_INITIAL_DELAY = 5  # seconds

# Request quota, kept for reference or manual throttling logic.
REQUESTS_PER_MINUTE_LIMIT = 15

# Gemini model identifier (alternative that was tried: 'gemini-1.5-pro').
GEMINI_MODEL_NAME = 'gemini-2.0-flash'

## Dataset

In [227]:
# Number of leading dataset rows kept for the test run.
SAMPLE_SIZE_FOR_TESTING = 10

# Imports and Downloads

## Install dependencies

In [228]:
!pip install -r requirements.txt -q

## Import libraries

In [229]:
import html
import os
import re
import time

import google.generativeai as genai
import nltk
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Download files and load env

In [None]:
# NLTK data needed by the preprocessing cells. Recent NLTK releases load the
# tokenizer models from 'punkt_tab', older ones from 'punkt', so fetch both.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

dataset = load_dataset("ashraq/financial-news")

# Slice the split *before* building the DataFrame: only the sample rows are
# converted, instead of materialising every row and discarding all but the
# first SAMPLE_SIZE_FOR_TESTING.
df = pd.DataFrame(dataset['train'][:SAMPLE_SIZE_FOR_TESTING])

# Load variables from a local .env file into the process environment
# (GOOGLE_API_KEY is read from the environment further down).
load_dotenv()

Repo card metadata block was not found. Setting CardData to empty.


1845559


True

# Preprocessing

## General overview of the dataset

In [231]:
# Headline figures for the sampled frame; each statistic is skipped with a
# message when its column is absent.
article_count = len(df)
print(f"Total number of news articles: {article_count}")

if 'stock' not in df.columns:
    print("'stock' column not found for statistics.")
else:
    unique_stocks = df['stock'].nunique()
    print(f"Number of unique stocks: {unique_stocks}")

if 'date' not in df.columns:
    print("'date' column not found for date range statistics.")
elif df['date'].dropna().empty:
    print("Date range: 'date' column is empty or all NaT after conversion.")
else:
    earliest, latest = df['date'].min(), df['date'].max()
    print(f"Date range: {earliest} to {latest}")

if 'publisher' not in df.columns:
    print("'publisher' column not found for statistics.")
else:
    unique_publishers = df['publisher'].nunique()
    print(f"Number of unique publishers: {unique_publishers}")

Total number of news articles: 10
Number of unique stocks: 1
Date range: 2020-05-05 00:00:00 to 2020-06-01 00:00:00
Number of unique publishers: 2


## Data type conversion for date

In [232]:
# Parse the 'date' column into pandas datetimes and report the resulting dtype.
if 'date' not in df.columns:
    print("'date' column not found.")
else:
    df['date'] = pd.to_datetime(df['date'])
    date_dtype = df['date'].dtype
    print(f"Data type of 'date' column after conversion: {date_dtype}")

Data type of 'date' column after conversion: datetime64[ns]


## Text Cleaning

### Declarations

In [233]:
# Aggressive cleaner: keeps letters only (used for tokenization / stopword removal).
def clean_text_original(text):
    """Lowercase ``text`` and strip everything except ASCII letters.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        str: Lowercase words separated by single spaces. Missing values
        (``None`` or a float NaN) yield an empty string instead of the
        literal words "none" / "nan".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    # After lowercasing only a-z can remain as letters; drop digits,
    # punctuation and every other symbol.
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    # Collapse runs of whitespace and trim both ends.
    return ' '.join(letters_only.split())

# Lighter cleaner for LLM input: keeps numbers and essential punctuation,
# removes HTML tags and decodes HTML entities.
def clean_text_for_llm(text):
    """Normalise a headline while keeping the detail an LLM needs.

    Steps, in order: remove HTML tags, decode HTML entities, lowercase,
    drop every character outside ``a-z 0-9 whitespace . , ' - $ % ! ?``,
    then collapse whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        str: The cleaned text. Missing values (``None`` or a float NaN) yield
        an empty string instead of the literal words "none" / "nan".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # Strip tags before decoding entities, so an escaped "&lt;b&gt;" is
    # treated as text rather than markup.
    text = re.sub(r'<[^>]+>', '', text)

    # Decode all named and numeric entities (&amp;, &#39;, &quot;, &nbsp;, ...),
    # not only &amp; / &lt; / &gt;.
    text = html.unescape(text)

    text = text.lower()

    # Adjust the punctuation list inside the character class as needed.
    text = re.sub(r"[^a-z0-9\s.,'\-$%!?]", '', text)

    # Collapse runs of whitespace and trim both ends.
    return ' '.join(text.split())

### Apply the cleaning functions to the headlines

In [234]:
# Derive both cleaned variants from the raw 'headline' column.
if 'headline' not in df.columns:
    print("'headline' column not found.")
else:
    raw_headlines = df['headline']

    # Letters-only version, used by the tokenization and stopword cells.
    df['clean_headline_original'] = raw_headlines.apply(clean_text_original)

    # Version that keeps digits and basic punctuation, intended for the LLM.
    df['llm_ready_headline'] = raw_headlines.apply(clean_text_for_llm)

### Tokenizing headlines

In [235]:
# Word-level tokens of the letters-only headlines.
if 'clean_headline_original' not in df.columns:
    print("'clean_headline_original' column not found for tokenization.")
else:
    # The LLM is normally fed the 'llm_ready_headline' string directly;
    # tokenization is kept here as in the original script.
    cleaned_headlines = df['clean_headline_original']
    df['tokens_original'] = cleaned_headlines.apply(word_tokenize)

### Removing stopwords

In [236]:
# Filter English stopwords out of every token list.
if 'tokens_original' not in df.columns:
    print("'tokens_original' column not found for stopword removal.")
else:
    stop_words = set(stopwords.words('english'))

    def _without_stopwords(token_list):
        """Return the tokens of ``token_list`` that are not English stopwords."""
        return [word for word in token_list if word not in stop_words]

    df['tokens_no_stopwords'] = df['tokens_original'].apply(_without_stopwords)

## Dropping unnecessary columns

In [237]:
columns_to_drop = ['url']

# Check every requested column, not just the first one, so extending the list
# never raises KeyError and re-running the cell is harmless.
columns_present = [col for col in columns_to_drop if col in df.columns]
columns_missing = [col for col in columns_to_drop if col not in df.columns]

if columns_present:
    df = df.drop(columns=columns_present)

for col in columns_present:
    print(f"'{col}' column dropped successfully.")
for col in columns_missing:
    print(f"'{col}' column not found or already dropped.")

requested_columns = ', '.join(f"'{col}'" for col in columns_to_drop)
print(f"\nDataFrame columns after attempting to drop {requested_columns}:")
print(df.columns.tolist())

'url' column dropped successfully.

DataFrame columns after attempting to drop 'url':
['headline', 'publisher', 'date', 'stock', 'clean_headline_original', 'llm_ready_headline', 'tokens_original', 'tokens_no_stopwords']


In [238]:
# Preview the first five rows of the preprocessed frame.
df.head(5)

Unnamed: 0,headline,publisher,date,stock,clean_headline_original,llm_ready_headline,tokens_original,tokens_no_stopwords
0,Agilent Technologies Announces Pricing of $5……...,GuruFocus,2020-06-01,A,agilent technologies announces pricing of mill...,agilent technologies announces pricing of $5 m...,"[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi..."
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,Zacks,2020-05-18,A,agilent a gears up for q earnings whats in the...,agilent a gears up for q2 earnings what's in t...,"[agilent, a, gears, up, for, q, earnings, what...","[agilent, gears, q, earnings, whats, cards]"
2,J.P. Morgan Asset Management Announces Liquida...,GuruFocus,2020-05-15,A,jp morgan asset management announces liquidati...,j.p. morgan asset management announces liquida...,"[jp, morgan, asset, management, announces, liq...","[jp, morgan, asset, management, announces, liq..."
3,"Pershing Square Capital Management, L.P. Buys ...",GuruFocus,2020-05-15,A,pershing square capital management lp buys agi...,"pershing square capital management, l.p. buys ...","[pershing, square, capital, management, lp, bu...","[pershing, square, capital, management, lp, bu..."
4,Agilent Awards Trilogy Sciences with a Golden ...,GuruFocus,2020-05-12,A,agilent awards trilogy sciences with a golden ...,agilent awards trilogy sciences with a golden ...,"[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t..."


# Sentiment Analysis

## Configure API Key

In [239]:
# Read the key from the environment (populated from .env by load_dotenv()).
api_key = os.getenv("GOOGLE_API_KEY")

# Stays None unless both the API configuration and the model creation succeed;
# later cells test `model` before calling the API.
model = None

if not api_key:
    print("API Key not found. Please set it.")
else:
    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        print(f"Error configuring Gemini API: {e}")
    else:
        print("Gemini API configured.")

        # --- Initialize the Model ---
        # Only attempted once the API key has been configured successfully.
        try:
            model = genai.GenerativeModel(GEMINI_MODEL_NAME)
            print(f"Using Gemini model: {model.model_name}")
        except Exception as e:
            print(f"Error initializing Gemini model: {e}")
            print("Please ensure your API key is valid, properly configured, and the model name is correct.")

Gemini API configured.
Using Gemini model: models/gemini-2.0-flash


## Define Sentiment Analysis functions

In [240]:
def _is_rate_limit_error(error):
    """Return True when the exception text looks like a quota / HTTP 429 failure."""
    message = str(error)
    lowered = message.lower()
    return (
        "429" in message
        or "exceeded your current quota" in lowered
        or "resource has been exhausted" in lowered
    )


def _parse_score_text(score_text):
    """Convert the model's reply to a float; raises ValueError if it is not a number.

    The prompt shows the expected format as ``<a number between -1 and 1>``,
    so simple wrappers around the number (angle brackets, backticks,
    asterisks, quotes, whitespace) are tolerated, e.g. "<0.5>".
    """
    return float(score_text.strip("<>`*\"' \t\r\n"))


def _score_from_response(response, headline_text):
    """Extract a validated score from a Gemini response, or None (with a printed warning)."""
    if not (response.candidates and response.candidates[0].content.parts):
        # Handle blocked responses or other issues
        if response.prompt_feedback and response.prompt_feedback.block_reason:
            print(f"🟡 Warning: Prompt blocked for headline '{headline_text}'. Reason: {response.prompt_feedback.block_reason.name}. No retry.")
            return None
        print(f"🟡 Warning: No valid candidate or content part in API response for headline: '{headline_text}'.")
        return None

    score_text = response.candidates[0].content.parts[0].text.strip()
    try:
        score = _parse_score_text(score_text)
    except ValueError:
        print(f"🟡 Warning: Could not convert API response '{score_text}' to a float score for headline: '{headline_text}'. Returning None.")
        return None

    if -1.0 <= score <= 1.0:
        return score
    # Also reached for NaN, which fails both comparisons.
    print(f"🟡 Warning: Score '{score}' from API is out of range [-1.0, 1.0] for headline: '{headline_text}'. Returning None.")
    return None


def get_sentiment_score_gemini(headline_text, stock_symbol, max_retries=3, initial_delay=5):
    """
    Analyzes the sentiment of a given headline using the Gemini API
    and returns a numerical score between -1.0 and 1.0.

    Uses the module-level ``model``. Rate-limit errors are retried with
    exponential backoff (the delay doubles after every attempt); any other
    error is reported and not retried. All failures are printed and reported
    to the caller as ``None`` rather than raised.

    Args:
        headline_text (str): The news headline to analyze.
        stock_symbol: Stock symbol the sentiment is judged against; it is
            interpolated into the prompt.
        max_retries (int): Maximum number of retries for rate-limited calls.
        initial_delay (int): Delay in seconds before the first retry.

    Returns:
        float or None: The sentiment score, or None if analysis fails or score is invalid.
    """
    # `model` is only read here, so no `global` statement is needed.
    if not model:
        print("🔴 Error: Gemini model not initialized. Cannot perform sentiment analysis.")
        return None
    if not isinstance(headline_text, str) or not headline_text.strip():
        print(f"🟡 Invalid Input: Headline is empty or not a string. Received: {headline_text}")
        return None

    # Crafting a prompt for a numerical sentiment score
    prompt = f"""You are a financial sentiment analysis assistant.

Given a news headline about a stock, analyze its sentiment with respect to the mentioned stock symbol and return a sentiment score between -1 and 1:

1 means extremely positive and bullish.

-1 means extremely negative and bearish.

Values in between indicate mixed or neutral sentiment, with 0 being completely neutral.

Format your response exactly like this:
<a number between -1 and 1>

Example Input:
Agilent (A) Gears Up for Q2 Earnings: What's in the Cards?

stock symbol: {stock_symbol}
Now analyze this headline:
{headline_text}
"""

    attempt = 0
    delay = initial_delay
    while attempt <= max_retries:
        try:
            generation_config = genai.types.GenerationConfig(
                candidate_count=1,
                temperature=0.1  # Lower temperature for more deterministic, less "creative" scoring
            )
            response = model.generate_content(prompt, generation_config=generation_config)
            # Parsing stays inside the try so that a malformed response is
            # reported below instead of raising out of the function.
            return _score_from_response(response, headline_text)

        except Exception as e:  # Broad on purpose: failures are reported, never raised
            if not _is_rate_limit_error(e):
                # For other types of errors, don't retry
                print(f"🔴 An unexpected API error occurred for headline '{headline_text}': {e}")
                return None
            if attempt >= max_retries:
                print(f"🔴 Error: Max retries ({max_retries}) exceeded for rate limit on headline '{headline_text}'.")
                return None
            print(f"🟡 Rate limit hit for headline '{headline_text}'. Retrying in {delay}s... (Attempt {attempt+1}/{max_retries})")
            time.sleep(delay)
            attempt += 1
            delay *= 2  # Exponential backoff

    # Only reached when the loop body never runs (negative max_retries).
    print(f"🔴 Error: Failed to get sentiment score for '{headline_text}' after all retries or due to an earlier non-retryable error.")
    return None


## Apply the sentiment analysis function to the dataset

In [241]:
if 'llm_ready_headline' in df.columns and model:  # Proceed only if df and model are ready
    total_headlines = len(df)
    print(f"Processing {total_headlines} headlines...")

    # Pause between requests, derived from the configured per-minute quota
    # instead of a hard-coded number of seconds.
    request_interval = 60 / REQUESTS_PER_MINUTE_LIMIT

    sentiments = []
    headline_rows = zip(df['llm_ready_headline'], df['stock'])
    # Count by position, not by index label, so the progress line stays
    # correct for any DataFrame index.
    for position, (headline, stock_symbol) in enumerate(headline_rows, start=1):
        print(f"Processing headline ({position}/{total_headlines}): {headline[:70]}...")
        sentiment = get_sentiment_score_gemini(
            headline,
            stock_symbol,
            max_retries=MAX_API_RETRIES,
            initial_delay=API_INITIAL_DELAY,
        )
        sentiments.append(sentiment)
        # Throttle to stay within the rate limit; nothing follows the last
        # request, so no wait is needed after it.
        if position < total_headlines:
            time.sleep(request_interval)

    df['gemini_sentiment'] = sentiments

    print("\nDataFrame with Sentiment Analysis results:")
    print(df[['llm_ready_headline', 'gemini_sentiment']].head())
elif not model:
    print("Cannot apply sentiment analysis because the Gemini model is not initialized.")
else:
    print("DataFrame 'df' or 'llm_ready_headline' column not found.")

Processing 10 headlines...
Processing headline (1/10): agilent technologies announces pricing of $5 million of senior notes...
Processing headline (2/10): agilent a gears up for q2 earnings what's in the cards?...
Processing headline (3/10): j.p. morgan asset management announces liquidation of six exchange-tra...
Processing headline (4/10): pershing square capital management, l.p. buys agilent technologies inc...
Processing headline (5/10): agilent awards trilogy sciences with a golden ticket at labcentral...
Processing headline (6/10): agilent technologies inc a ceo and president michael r. mcmullen sold ...
Processing headline (7/10): ' stocks growing their earnings fast...
Processing headline (8/10): cypress asset management inc buys verizon communications inc, united p...
Processing headline (9/10): hendley co inc buys american electric power co inc, agilent technologi...
Processing headline (10/10): teacher retirement system of texas buys hologic inc, vanguard total st...

DataFr