In [None]:
# FNSPID Dataset Download and Setup
# Dataset: 15.7M financial news articles with sentiment labels (1999-2023)
# Source: https://huggingface.co/datasets/Zihan1004/FNSPID

import pandas as pd
import requests
from huggingface_hub import hf_hub_download
import zipfile
import os


  from .autonotebook import tqdm as notebook_tqdm


In [None]:


# Step 1: Download FNSPID Financial News Dataset
# These prints only announce the download; no data is fetched here. The actual
# transfer happens in download_fnspid_news(), defined below in this cell.
print("Downloading FNSPID Financial News Dataset...")
print("This is a large dataset (~30GB), ensure you have sufficient storage")

# Download methods (choose one):

# Method 1: Direct download using wget (Linux/Mac)
# The triple-quoted text below is a bare string literal: Python evaluates and
# discards it, so it only serves as documentation of the shell commands.
"""
# Run these commands in terminal:
wget https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_news/nasdaq_exteral_data.csv
wget https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_news/sentiment_scored_news.zip
"""

# Method 2: Python download
def download_fnspid_news(timeout=60):
    """Download the FNSPID news data files into the current working directory.

    Each file is streamed in 8 KiB chunks to ``<filename>.part`` and renamed to
    its final name only after the transfer completes, so an interrupted
    download never leaves a truncated file under the real name.

    Args:
        timeout: Seconds to wait for the server to connect / send data before
            ``requests`` raises a timeout error. Passed straight to
            ``requests.get``. Without it a stalled connection blocks forever.

    Raises:
        requests.RequestException: On connection errors or timeouts (not
            swallowed, so the caller sees why the download stopped).
    """

    base_url = "https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_news/"
    files_to_download = [
        "nasdaq_exteral_data.csv",
        "sentiment_scored_news.zip"
    ]

    for filename in files_to_download:
        print(f"Downloading {filename}...")
        url = base_url + filename

        # The context manager closes the streamed response (and releases the
        # underlying connection) even if writing fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"❌ Failed to download {filename}")
                continue

            part_path = filename + ".part"
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        # Publish the file under its final name only once it is complete.
        os.replace(part_path, filename)
        print(f"✅ Downloaded {filename}")

# Step 2: Load and explore the dataset
def load_fnspid_data(path="nasdaq_exteral_data.csv", nrows=None):
    """Load the FNSPID news CSV and print a short exploration summary.

    Args:
        path: Location of the CSV file. Defaults to the file name written by
            the download step, so existing zero-argument calls are unchanged.
        nrows: Optional cap on the number of rows to read (forwarded to
            ``pd.read_csv``). ``None`` reads the whole file; a small value is
            useful for a quick look at a very large file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """

    # Load main news data
    print("Loading FNSPID news data...")
    df_news = pd.read_csv(path, nrows=nrows)

    print(f"Dataset shape: {df_news.shape}")
    print(f"Columns: {df_news.columns.tolist()}")

    # Guard the date summary the same way the sentiment summary is guarded:
    # a file without a 'date' column should still load and be inspectable.
    if 'date' in df_news.columns:
        dates = df_news['date'].dropna()
        print(f"Date range: {dates.min()} to {dates.max()}")
    else:
        print("Date range: unavailable (no 'date' column found)")

    # Show sample data
    print("\nSample data:")
    print(df_news.head())

    # Check sentiment distribution
    if 'sentiment' in df_news.columns:
        print(f"\nSentiment distribution:")
        print(df_news['sentiment'].value_counts())

    return df_news

# Step 3: Filter for recent data (2022-2024)
def filter_recent_data(df, start_year=2022, end_year=2024):
    """Return the rows of ``df`` whose ``date`` falls within the given years.

    The input frame is left untouched: the parsed dates and the derived
    ``year`` column are added to a new frame, and the result is an independent
    copy so later column assignments on it do not trigger pandas'
    SettingWithCopyWarning.

    Args:
        df: DataFrame with a ``date`` column parseable by ``pd.to_datetime``.
        start_year: First year to keep (inclusive).
        end_year: Last year to keep (inclusive).

    Returns:
        A new DataFrame with ``date`` converted to datetime, an added ``year``
        column, and only the rows inside ``[start_year, end_year]``.
    """

    parsed_dates = pd.to_datetime(df['date'])
    # assign() builds a new frame instead of writing into the caller's object.
    df_dated = df.assign(date=parsed_dates, year=parsed_dates.dt.year)

    # Filter for your target years (both bounds inclusive; unparseable/missing
    # dates yield a missing year and are excluded).
    in_range = df_dated['year'].between(start_year, end_year)
    df_filtered = df_dated[in_range].copy()

    print(f"Filtered data ({start_year}-{end_year}):")
    print(f"Shape: {df_filtered.shape}")
    print(f"Articles per year:")
    print(df_filtered['year'].value_counts().sort_index())

    return df_filtered

# Step 4: Prepare data for DistilBERT training
def prepare_training_data(df):
    """Prepare FNSPID data for DistilBERT fine-tuning.

    Builds a ``combined_text`` column, derives a ``sentiment_label`` column,
    and keeps only rows whose text is 100-2000 characters long and whose label
    is present. The input frame is not modified.

    Args:
        df: DataFrame with a ``date`` column, a text source (``title`` +
            ``text``, or ``news_text``, or any column whose name contains
            "text"/"content"), and a label source (``sentiment_score`` or
            ``sentiment``).

    Returns:
        A DataFrame with the columns ``combined_text``, ``sentiment_label``
        and ``date``.

    Raises:
        KeyError: If no text column or no sentiment column can be found, or if
            ``date`` is missing.
    """

    # Work on a copy so the caller's frame (often a filtered slice) is neither
    # mutated nor a source of SettingWithCopyWarning.
    data = df.copy()

    # Create text column combining title and content
    if 'title' in data.columns and 'text' in data.columns:
        data['combined_text'] = data['title'].fillna('') + ". " + data['text'].fillna('')
    elif 'news_text' in data.columns:
        data['combined_text'] = data['news_text'].fillna('')
    else:
        # Adapt based on actual column names
        text_cols = [col for col in data.columns if 'text' in col.lower() or 'content' in col.lower()]
        if not text_cols:
            raise KeyError(
                "No text column found: expected 'title' and 'text', 'news_text', "
                "or a column whose name contains 'text' or 'content'"
            )
        data['combined_text'] = data[text_cols[0]].fillna('')

    # Clean sentiment labels (adapt based on actual format)
    # FNSPID uses various sentiment scoring methods - normalize to positive/neutral/negative
    if 'sentiment_score' in data.columns:
        scores = data['sentiment_score']
        # Convert sentiment scores to labels
        labels = scores.apply(lambda x:
            'positive' if x > 0.1 else ('negative' if x < -0.1 else 'neutral'))
        # A missing score fails both comparisons above and would otherwise be
        # labelled 'neutral'; keep it missing so the filter below drops it.
        data['sentiment_label'] = labels.where(scores.notna())
    elif 'sentiment' in data.columns:
        data['sentiment_label'] = data['sentiment']
    else:
        raise KeyError(
            "No sentiment column found: expected 'sentiment_score' or 'sentiment'"
        )

    # Filter out articles that are too short or too long
    data['text_length'] = data['combined_text'].str.len()
    df_clean = data[
        (data['text_length'] >= 100) &  # Minimum 100 characters
        (data['text_length'] <= 2000) & # Maximum 2000 characters
        (data['combined_text'].notna()) &
        (data['sentiment_label'].notna())
    ].copy()

    print(f"Cleaned dataset shape: {df_clean.shape}")
    print(f"Text length stats:")
    print(df_clean['text_length'].describe())
    print(f"\nSentiment distribution:")
    print(df_clean['sentiment_label'].value_counts())

    return df_clean[['combined_text', 'sentiment_label', 'date']]

# Step 5: Sample balanced training data
def create_balanced_sample(df, samples_per_class=5000):
    """Create a class-capped, shuffled sample for training.

    For every distinct value of ``sentiment_label`` at most
    ``samples_per_class`` rows are drawn (seeded, so the draw is reproducible).
    Classes with fewer rows than the cap contribute all of their rows, so the
    result is only perfectly balanced when every class reaches the cap.

    Args:
        df: DataFrame containing a ``sentiment_label`` column.
        samples_per_class: Maximum number of rows to keep per label.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index. If ``df`` has no rows,
        an empty DataFrame with the same columns is returned.
    """

    balanced_dfs = []

    for sentiment in df['sentiment_label'].unique():
        sentiment_df = df[df['sentiment_label'] == sentiment]
        if len(sentiment_df) >= samples_per_class:
            sampled = sentiment_df.sample(n=samples_per_class, random_state=42)
        else:
            sampled = sentiment_df  # Use all available if less than target
        balanced_dfs.append(sampled)

    if balanced_dfs:
        balanced_df = pd.concat(balanced_dfs, ignore_index=True)
        balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle
    else:
        # pd.concat([]) raises "No objects to concatenate"; an empty input
        # simply yields an empty result with the same columns.
        balanced_df = df.iloc[:0].copy()

    print(f"Balanced training set shape: {balanced_df.shape}")
    print(f"Sentiment distribution:")
    print(balanced_df['sentiment_label'].value_counts())

    return balanced_df

# Main execution
# Driver for the pipeline defined above in this cell: load -> filter by year ->
# prepare text/labels -> class-capped sample -> write CSV. The intermediate
# frames (df_news, df_recent, df_prepared, df_balanced) are assigned at module
# level, so they stay available in the notebook namespace afterwards.
if __name__ == "__main__":
    print("="*50)
    print("FNSPID DATASET SETUP FOR DISTILBERT TRAINING")
    print("="*50)

    # Step 1: Download (uncomment to download)
    # Left disabled, so load_fnspid_data() below expects the CSV to already be
    # present in the working directory.
    # download_fnspid_news()

    # Step 2: Load data
    df_news = load_fnspid_data()

    # Step 3: Filter for recent years
    # NOTE(review): the header comment of this notebook describes the dataset
    # as covering 1999-2023, so the 2024 upper bound may match no rows - confirm.
    df_recent = filter_recent_data(df_news, 2022, 2024)

    # Step 4: Prepare for training
    df_prepared = prepare_training_data(df_recent)

    # Step 5: Create balanced sample
    # Overrides the function's default of 5000 rows per class with 3000.
    df_balanced = create_balanced_sample(df_prepared, samples_per_class=3000)

    # Save training data
    # Written relative to the current working directory, without the index.
    df_balanced.to_csv('fnspid_training_data_2022_2024.csv', index=False)
    print(f"\n✅ Training data saved to: fnspid_training_data_2022_2024.csv")

    print("\n" + "="*50)
    print("NEXT STEPS:")
    print("="*50)
    print("1. Review the training data quality")
    print("2. Use this balanced dataset to train DistilBERT")
    print("3. Apply trained model to your Cleantech dataset")
    print("4. This should solve your domain mismatch problem!")

Downloading FNSPID Financial News Dataset...
This is a large dataset (~30GB), ensure you have sufficient storage


In [1]:

# Step 4: Prepare data for DistilBERT training
def prepare_training_data(df):
    """Prepare FNSPID data for DistilBERT fine-tuning.

    Builds a ``combined_text`` column, derives a ``sentiment_label`` column,
    and keeps only rows whose text is 100-2000 characters long and whose label
    is present. The input frame is not modified.

    Args:
        df: DataFrame with a ``date`` column, a text source (``title`` +
            ``text``, or ``news_text``, or any column whose name contains
            "text"/"content"), and a label source (``sentiment_score`` or
            ``sentiment``).

    Returns:
        A DataFrame with the columns ``combined_text``, ``sentiment_label``
        and ``date``.

    Raises:
        KeyError: If no text column or no sentiment column can be found, or if
            ``date`` is missing.
    """

    # Work on a copy so the caller's frame (often a filtered slice) is neither
    # mutated nor a source of SettingWithCopyWarning.
    data = df.copy()

    # Create text column combining title and content
    if 'title' in data.columns and 'text' in data.columns:
        data['combined_text'] = data['title'].fillna('') + ". " + data['text'].fillna('')
    elif 'news_text' in data.columns:
        data['combined_text'] = data['news_text'].fillna('')
    else:
        # Adapt based on actual column names
        text_cols = [col for col in data.columns if 'text' in col.lower() or 'content' in col.lower()]
        if not text_cols:
            raise KeyError(
                "No text column found: expected 'title' and 'text', 'news_text', "
                "or a column whose name contains 'text' or 'content'"
            )
        data['combined_text'] = data[text_cols[0]].fillna('')

    # Clean sentiment labels (adapt based on actual format)
    # FNSPID uses various sentiment scoring methods - normalize to positive/neutral/negative
    if 'sentiment_score' in data.columns:
        scores = data['sentiment_score']
        # Convert sentiment scores to labels
        labels = scores.apply(lambda x:
            'positive' if x > 0.1 else ('negative' if x < -0.1 else 'neutral'))
        # A missing score fails both comparisons above and would otherwise be
        # labelled 'neutral'; keep it missing so the filter below drops it.
        data['sentiment_label'] = labels.where(scores.notna())
    elif 'sentiment' in data.columns:
        data['sentiment_label'] = data['sentiment']
    else:
        raise KeyError(
            "No sentiment column found: expected 'sentiment_score' or 'sentiment'"
        )

    # Filter out articles that are too short or too long
    data['text_length'] = data['combined_text'].str.len()
    df_clean = data[
        (data['text_length'] >= 100) &  # Minimum 100 characters
        (data['text_length'] <= 2000) & # Maximum 2000 characters
        (data['combined_text'].notna()) &
        (data['sentiment_label'].notna())
    ].copy()

    print(f"Cleaned dataset shape: {df_clean.shape}")
    print(f"Text length stats:")
    print(df_clean['text_length'].describe())
    print(f"\nSentiment distribution:")
    print(df_clean['sentiment_label'].value_counts())

    return df_clean[['combined_text', 'sentiment_label', 'date']]

# Step 5: Sample balanced training data
def create_balanced_sample(df, samples_per_class=5000):
    """Create a class-capped, shuffled sample for training.

    For every distinct value of ``sentiment_label`` at most
    ``samples_per_class`` rows are drawn (seeded, so the draw is reproducible).
    Classes with fewer rows than the cap contribute all of their rows, so the
    result is only perfectly balanced when every class reaches the cap.

    Args:
        df: DataFrame containing a ``sentiment_label`` column.
        samples_per_class: Maximum number of rows to keep per label.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index. If ``df`` has no rows,
        an empty DataFrame with the same columns is returned.
    """

    balanced_dfs = []

    for sentiment in df['sentiment_label'].unique():
        sentiment_df = df[df['sentiment_label'] == sentiment]
        if len(sentiment_df) >= samples_per_class:
            sampled = sentiment_df.sample(n=samples_per_class, random_state=42)
        else:
            sampled = sentiment_df  # Use all available if less than target
        balanced_dfs.append(sampled)

    if balanced_dfs:
        balanced_df = pd.concat(balanced_dfs, ignore_index=True)
        balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle
    else:
        # pd.concat([]) raises "No objects to concatenate"; an empty input
        # simply yields an empty result with the same columns.
        balanced_df = df.iloc[:0].copy()

    print(f"Balanced training set shape: {balanced_df.shape}")
    print(f"Sentiment distribution:")
    print(balanced_df['sentiment_label'].value_counts())

    return balanced_df

# Main execution
# Driver for the pipeline: load -> filter by year -> prepare text/labels ->
# class-capped sample -> write CSV.
# NOTE(review): this cell defines only prepare_training_data and
# create_balanced_sample. load_fnspid_data and filter_recent_data are defined
# in the earlier cell, which must be executed first in the same kernel session;
# otherwise the first call below fails with
# "NameError: name 'load_fnspid_data' is not defined" (the output recorded for
# this cell).
if __name__ == "__main__":
    print("="*50)
    print("FNSPID DATASET SETUP FOR DISTILBERT TRAINING")
    print("="*50)

    # Step 1: Download (uncomment to download)
    # Left disabled, so load_fnspid_data() below expects the CSV to already be
    # present in the working directory.
    # download_fnspid_news()

    # Step 2: Load data
    df_news = load_fnspid_data()

    # Step 3: Filter for recent years
    df_recent = filter_recent_data(df_news, 2022, 2024)

    # Step 4: Prepare for training
    df_prepared = prepare_training_data(df_recent)

    # Step 5: Create balanced sample
    # Overrides the function's default of 5000 rows per class with 3000.
    df_balanced = create_balanced_sample(df_prepared, samples_per_class=3000)

    # Save training data
    # Written relative to the current working directory, without the index.
    df_balanced.to_csv('fnspid_training_data_2022_2024.csv', index=False)
    print(f"\n✅ Training data saved to: fnspid_training_data_2022_2024.csv")

    print("\n" + "="*50)
    print("NEXT STEPS:")
    print("="*50)
    print("1. Review the training data quality")
    print("2. Use this balanced dataset to train DistilBERT")
    print("3. Apply trained model to your Cleantech dataset")
    print("4. This should solve your domain mismatch problem!")

FNSPID DATASET SETUP FOR DISTILBERT TRAINING


NameError: name 'load_fnspid_data' is not defined