# **Create the New CSV with Sentiment Label Column**

In [None]:
!pip install datasets transformers torch

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
from datasets import load_dataset
from transformers import pipeline
import pandas as pd
import re

# Open the FNSPID dataset in streaming mode and take an iterator over its "train" split
dataset = load_dataset(path="Zihan1004/FNSPID", streaming=True)
iterable_dataset = iter(dataset["train"])

# Build a text-classification pipeline backed by the ProsusAI/finbert checkpoint
sentiment_pipeline = pipeline(task="text-classification", model="ProsusAI/finbert")

# Function to classify a financial news headline
def classify_headline(title):
    """Label a headline as "bullish", "bearish" or "neutral".

    Args:
        title: Headline text. Anything that is not a non-blank string
            (None, numbers, "", whitespace) is treated as having no signal.

    Returns:
        "bullish" for a "positive" model label, "bearish" for "negative",
        and "neutral" for every other label or for an unusable title.
    """
    # A missing or blank title cannot be classified as text; answer "neutral"
    # here rather than letting one bad record abort the whole run.
    if not isinstance(title, str) or not title.strip():
        return "neutral"

    # truncation=True asks the tokenizer to clip over-long text to the model's
    # maximum input length.
    result = sentiment_pipeline(title, truncation=True)[0]  # First (only) prediction

    # Translate model labels into financial sentiment terminology
    label_to_sentiment = {"positive": "bullish", "negative": "bearish"}
    return label_to_sentiment.get(result["label"], "neutral")

# Function to extract year from the article date
def extract_year(date_string):
    """Return the first standalone 2010-2029 year in `date_string` as an int.

    Returns None when `date_string` is empty/None or contains no such year.
    """
    if not date_string:
        return None

    # \b on both sides keeps the four digits from matching inside a longer number
    found = re.search(r"\b(20[1-2][0-9])\b", date_string)
    return int(found.group(1)) if found else None

# Running tally of how many headlines received each sentiment label
sentiment_counts = {"bullish": 0, "bearish": 0, "neutral": 0}

# One dict per classified headline; these become the rows of the output CSV
classified_headlines = []

# Stop once this many headlines have passed the year filter
max_samples = 5000  # Limit to 5,000 articles
processed_samples = 0  # Headlines classified so far

print("Processing dataset...")

while processed_samples < max_samples:
    # Guard only the fetch: a StopIteration escaping from classification or
    # bookkeeping should surface as an error, not be reported as "dataset ended".
    try:
        sample = next(iterable_dataset)  # Get next entry
    except StopIteration:
        print("Finished processing the dataset early!")
        break  # Stop if dataset ends before reaching the limit

    # dict.get only applies its default when the key is absent; a key that is
    # present with a None/empty value (presumably how missing fields arrive from
    # the dataset -- TODO confirm) would pass straight through, so use `or`.
    title = sample.get("Article_title") or "No Title Available"
    date = sample.get("Date") or ""

    # Keep only articles dated 2010-2025; rows with no recognisable year are skipped
    year = extract_year(date)
    if year is None or not (2010 <= year <= 2025):
        continue

    # Classify the headline and record the result
    sentiment = classify_headline(title)
    classified_headlines.append({"date": date, "title": title, "sentiment": sentiment})
    sentiment_counts[sentiment] += 1
    processed_samples += 1

    # Print progress every 100 samples
    if processed_samples % 100 == 0:
        print(f"Processed {processed_samples} filtered headlines...")

# Save results to CSV
df = pd.DataFrame(classified_headlines)
df.to_csv("filtered_classified_headlines_5000.csv", index=False)
print("\nResults saved to filtered_classified_headlines_5000.csv")

# Print final sentiment summary
print("\nSentiment Summary (2010-2025, Limited to 5,000 Articles):")
print(sentiment_counts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


Processing dataset...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 100 filtered headlines...
Processed 200 filtered headlines...
Processed 300 filtered headlines...
Processed 400 filtered headlines...
Processed 500 filtered headlines...
Processed 600 filtered headlines...
Processed 700 filtered headlines...
Processed 800 filtered headlines...
Processed 900 filtered headlines...
Processed 1000 filtered headlines...
Processed 1100 filtered headlines...
Processed 1200 filtered headlines...
Processed 1300 filtered headlines...
Processed 1400 filtered headlines...
Processed 1500 filtered headlines...
Processed 1600 filtered headlines...
Processed 1700 filtered headlines...
Processed 1800 filtered headlines...
Processed 1900 filtered headlines...
Processed 2000 filtered headlines...
Processed 2100 filtered headlines...
Processed 2200 filtered headlines...
Processed 2300 filtered headlines...
Processed 2400 filtered headlines...
Processed 2500 filtered headlines...
Processed 2600 filtered headlines...
Processed 2700 filtered headlines...
Processed 

In [None]:
# 1) Mount Google Drive at /content/drive; the checkpoint and final CSVs
#    written later in this cell are saved under /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")

from datasets import load_dataset
from transformers import pipeline
import pandas as pd
import re
import time
import threading

# 2) Open the FNSPID dataset in streaming mode and take an iterator over its "train" split
dataset = load_dataset(path="Zihan1004/FNSPID", streaming=True)
iterable_dataset = iter(dataset["train"])

# 3) Build a text-classification pipeline backed by the ProsusAI/finbert checkpoint
sentiment_pipeline = pipeline(task="text-classification", model="ProsusAI/finbert")

# 4) Function to classify a financial news headline
def classify_headline(title):
    """Label a headline as "bullish", "bearish" or "neutral".

    Args:
        title: Headline text. Anything that is not a non-blank string
            (None, numbers, "", whitespace) is treated as having no signal.

    Returns:
        "bullish" for a "positive" model label, "bearish" for "negative",
        and "neutral" for every other label or for an unusable title.
    """
    # A missing or blank title cannot be classified as text; answer "neutral"
    # here rather than letting one bad record abort a multi-hour run.
    if not isinstance(title, str) or not title.strip():
        return "neutral"

    # truncation=True asks the tokenizer to clip over-long text to the model's
    # maximum input length.
    result = sentiment_pipeline(title, truncation=True)[0]  # First (only) prediction

    # Translate model labels into financial sentiment terminology
    label_to_sentiment = {"positive": "bullish", "negative": "bearish"}
    return label_to_sentiment.get(result["label"], "neutral")

# 5) Function to extract year from the article date
def extract_year(date_string):
    """Return the first standalone four-digit year (1900-2099) in a date value.

    Args:
        date_string: The article's date. Usually a string; other truthy values
            (e.g. an int or a date object) are converted with str() first.

    Returns:
        The year as an int, or None when the value is empty/None or holds no
        19xx/20xx year.
    """
    if not date_string:
        return None

    # The processing loop keeps articles from all years, so 19xx must be
    # recognised too; matching only 20xx left pre-2000 rows with an empty year.
    # \b on both sides keeps the digits from matching inside a longer number.
    match = re.search(r"\b((?:19|20)[0-9]{2})\b", str(date_string))
    return int(match.group(1)) if match else None

# 6) Accumulators for the labeled dataset
labeled_data = []  # One dict per classified headline
max_headlines = 2_000_000  # Process up to 2 million headlines
counter = 0  # Headlines classified so far

print("Processing dataset...")

# 7) Heartbeat thread, intended to keep the Colab session from timing out.
# Setting this event ends the heartbeat. Without a stop signal the daemon thread
# would keep printing every 10 minutes for as long as the kernel lives, long
# after this cell has finished.
keep_alive_stop = threading.Event()

def keep_alive(interval_seconds=600):
    """Print a heartbeat message every `interval_seconds` until stopped.

    Args:
        interval_seconds: Pause between messages; defaults to 600 (10 minutes).

    Returns:
        None, once `keep_alive_stop` has been set.
    """
    # Event.wait() returns False on timeout and True once the event is set, so
    # the loop prints on each timeout and exits promptly when asked to stop.
    while not keep_alive_stop.wait(interval_seconds):
        print("Keeping session alive...")

# Start keep-alive thread
keep_alive_thread = threading.Thread(target=keep_alive, daemon=True)
keep_alive_thread.start()

# 8) Main processing loop
try:
    while counter < max_headlines:
        # Guard only the fetch: a StopIteration escaping from classification or
        # bookkeeping should surface as an error, not be reported as "dataset ended".
        try:
            sample = next(iterable_dataset)  # Get next entry
        except StopIteration:
            print("Finished processing the entire dataset!")
            break

        # dict.get only applies its default when the key is absent; a key that is
        # present with a None/empty value (presumably how missing fields arrive
        # from the dataset -- TODO confirm) would pass straight through, so use `or`.
        title = sample.get("Article_title") or "No Title Available"
        date = sample.get("Date") or ""

        # Extract year (rows are kept whatever the year; it may be None)
        year = extract_year(date)

        # Classify the headline
        sentiment = classify_headline(title)

        # Store classification result
        labeled_data.append({
            "date": date,
            "year": year,
            "title": title,
            "sentiment": sentiment,
            "publisher": sample.get("Publisher") or "Unknown",
            "stock_symbol": sample.get("Stock_symbol") or "N/A",
            "url": sample.get("Url") or "No URL",
        })

        counter += 1

        # Print progress every 10,000 samples
        if counter % 10_000 == 0:
            print(f"Processed {counter} headlines...")

        # Save a checkpoint every 100,000 headlines (into Google Drive).
        # Each checkpoint file holds everything labeled so far, not just the
        # latest 100,000 rows.
        if counter % 100_000 == 0:
            print(f"Checkpoint reached: {counter} headlines. Saving CSV to Google Drive...")
            checkpoint_df = pd.DataFrame(labeled_data)
            checkpoint_path = f"/content/drive/MyDrive/checkpoint_{counter}.csv"
            checkpoint_df.to_csv(checkpoint_path, index=False)
            print(f"Checkpoint saved as {checkpoint_path}")
finally:
    # Stop the heartbeat whether the loop finished, hit the end of the dataset,
    # or raised; labeled_data stays in memory either way.
    keep_alive_stop.set()

# 9) Convert to DataFrame and save final CSV to Google Drive
df = pd.DataFrame(labeled_data)
final_path = "/content/drive/MyDrive/labeled_financial_news_2M.csv"
df.to_csv(final_path, index=False)
print(f"\nResults saved to {final_path}")

# 10) Print final sentiment summary
sentiment_counts = df["sentiment"].value_counts().to_dict()
print("\nSentiment Summary (Up to 2,000,000 Headlines):")
print(sentiment_counts)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Repo card metadata block was not found. Setting CardData to empty.
Device set to use cuda:0


Processing dataset...
Keeping session alive...
Processed 10000 headlines...
Processed 20000 headlines...
Keeping session alive...
Processed 30000 headlines...
Processed 40000 headlines...
Processed 50000 headlines...
Keeping session alive...
Keeping session alive...
Processed 60000 headlines...
Keeping session alive...
Keeping session alive...
Processed 70000 headlines...
Processed 80000 headlines...
Keeping session alive...
Processed 90000 headlines...
Processed 100000 headlines...
Checkpoint reached: 100000 headlines. Saving CSV to Google Drive...
Checkpoint saved as /content/drive/MyDrive/checkpoint_100000.csv
Processed 110000 headlines...
Keeping session alive...
Processed 120000 headlines...
Keeping session alive...
Keeping session alive...
Keeping session alive...
Processed 130000 headlines...
Processed 140000 headlines...
Processed 150000 headlines...
Keeping session alive...
Processed 160000 headlines...
Processed 170000 headlines...
Processed 180000 headlines...
Keeping sessio