### Process News Sentiment

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from transformers import pipeline
import torch
import sys

# Quiet the transformers library: set_verbosity_error() raises its log
# threshold to ERROR, so warnings and info messages are no longer printed.
# NOTE: from here on the name `logging` refers to transformers' logging
# utilities, not to the standard-library `logging` module.
from transformers import logging
logging.set_verbosity_error()

def initialize_finbert():
    """
    Build the FinBERT sentiment-analysis pipeline and hand it back.

    CUDA is used when torch reports it as available, otherwise the model
    runs on the CPU. If the pipeline cannot be created for any reason, an
    explanatory message is printed and the process exits with status 1.
    """
    use_gpu = torch.cuda.is_available()
    device_label = 'cuda' if use_gpu else 'cpu'
    # transformers' pipeline() takes -1 for CPU and a GPU ordinal otherwise.
    device_index = 0 if use_gpu else -1

    print(f"Initializing FinBERT pipeline... (Using device: {device_label})")

    try:
        finbert = pipeline(
            "sentiment-analysis",
            model="ProsusAI/finbert",
            device=device_index,
        )
        print("FinBERT pipeline loaded successfully.")
    except Exception as e:
        print(f"[Fatal Error] Could not load FinBERT model: {e}")
        print("Make sure you are connected to the internet and have 'transformers' and 'torch' installed.")
        # Nothing downstream can run without the model, so stop the script.
        sys.exit(1)

    return finbert

def process_news_file(news_file, sentiment_pipeline):
    """
    Reads a raw news file, scores every article with the sentiment pipeline,
    and returns a DataFrame of daily average sentiment scores.

    Args:
        news_file: Path (or file-like object) of a CSV that has 'date',
            'title' and 'description' columns.
        sentiment_pipeline: Callable that accepts a list of texts plus the
            keyword arguments `truncation` and `batch_size`, and returns one
            mapping with 'label' and 'score' keys per text.

    Returns:
        A DataFrame indexed by calendar day ('date') with the columns
        'avg_polarity' and 'avg_confidence'. It has one row for every day
        between the first and last scored article; days without any scored
        article hold 0 in both columns. An empty DataFrame with the same
        layout is returned when the file contains no usable text. None is
        returned when the file cannot be read, lacks a required column, or
        the sentiment analysis itself fails.
    """
    text_columns = ('title', 'description')

    # --- 1. Read and Clean News Data ---
    try:
        news_df = pd.read_csv(news_file)
        news_df['date'] = pd.to_datetime(news_df['date'])
    except Exception as e:
        print(f"  [Error] Could not read or parse news file {news_file}: {e}")
        return None

    # Report a malformed file as a per-file failure instead of letting a
    # KeyError abort the whole batch run.
    missing_columns = [col for col in text_columns if col not in news_df.columns]
    if missing_columns:
        print(f"  [Error] News file {news_file} is missing required column(s): {missing_columns}")
        return None

    # Blank out missing values BEFORE casting to str; otherwise NaN turns
    # into the literal text "nan", which would be scored as if it were a
    # real headline and would also defeat the empty-text filter below.
    for col in text_columns:
        news_df[col] = news_df[col].fillna('').astype(str)
    news_df['full_text'] = news_df['title'] + ' ' + news_df['description']

    # Keep only rows that have some text and a date: an article without a
    # date cannot be assigned to any day.
    has_text = news_df['full_text'].str.strip() != ''
    has_date = news_df['date'].notna()
    valid_texts_df = news_df[has_text & has_date].copy()

    if valid_texts_df.empty:
        print("  > No valid news text found. Returning empty DataFrame.")
        return pd.DataFrame(columns=['date', 'avg_polarity', 'avg_confidence']).set_index('date')

    # --- 2. Run FinBERT Analysis ---
    print(f"  > Analyzing {len(valid_texts_df)} individual news articles...")
    texts_to_analyze = valid_texts_df['full_text'].tolist()
    try:
        results = sentiment_pipeline(texts_to_analyze, truncation=True, batch_size=32)
    except Exception as e:
        print(f"  [Error] FinBERT analysis failed: {e}")
        return None

    # Results come back in input order, so assign them positionally
    # (.values) rather than by index label.
    results_df = pd.DataFrame(results)
    valid_texts_df['label'] = results_df['label'].values
    valid_texts_df['confidence'] = results_df['score'].values

    # --- 3. Calculate Polarity Score ---
    # positive -> +confidence, negative -> -confidence, anything else -> 0
    labels = valid_texts_df['label']
    confidence = valid_texts_df['confidence']
    valid_texts_df['polarity'] = np.select(
        [labels == 'positive', labels == 'negative'],
        [confidence, -confidence],
        default=0.0,
    )

    # --- 4. Aggregate to get DAILY AVERAGE ---
    print("  > Aggregating sentiment scores by day...")
    daily_avg_sentiment = valid_texts_df.set_index('date').resample('D').agg(
        avg_polarity=('polarity', 'mean'),
        avg_confidence=('confidence', 'mean')
    )

    # resample('D') emits a row for every calendar day in the covered range;
    # days on which no article was scored have NaN means, stored here as 0.
    return daily_avg_sentiment.fillna(0)

def main():
    """
    Score every '<ticker>_news.csv' file in the news folder and write one
    '<ticker>_sentiment.csv' file of daily averages per ticker.

    The FinBERT pipeline is created once and shared across all files. A
    summary of processed and failed tickers is printed at the end.
    """
    # --- Configuration ---
    news_dir = "Stock News"
    out_dir = "daily_sentiment_data"  # Folder receiving this module's output

    os.makedirs(out_dir, exist_ok=True)

    # Model loading is expensive, so it happens a single time up front.
    finbert = initialize_finbert()

    news_files = glob.glob(os.path.join(news_dir, "*_news.csv"))
    if len(news_files) == 0:
        print(f"No *_news.csv files found in '{news_dir}' folder.")
        return

    print(f"\nFound {len(news_files)} stocks. Starting sentiment processing...")

    processed = 0
    failed = 0

    for path in news_files:
        # The ticker is whatever precedes the '_news.csv' suffix.
        ticker = os.path.basename(path).split('_news.csv')[0]

        print(f"\n--- Processing: {ticker} ---")

        daily = process_news_file(path, finbert)

        if daily is None:
            print(f"  [Failed] Could not process {ticker}.")
            failed += 1
            continue

        # Turn the 'date' index back into an ordinary column for the CSV.
        daily = daily.reset_index()

        target = os.path.join(out_dir, f"{ticker}_sentiment.csv")
        daily.to_csv(target, index=False)

        print(f"  [Success] Saved to: {target}")
        print(f"  > Total days with sentiment: {len(daily)}")
        processed += 1

    print(f"\n--- Sentiment Processing Complete ---")
    print(f"Successfully processed: {processed} stocks")
    print(f"Skipped/Failed: {failed} stocks")

# Run the sentiment-processing step only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

### Creating the Final Sentiment Features

In [None]:
import pandas as pd
import numpy as np
import os
import glob
import sys

def process_features(sentiment_file, news_file):
    """
    Combines daily sentiment with daily news counts and engineers new
    features like change, momentum, and count.

    Args:
        sentiment_file: Path (or file-like object) of a CSV with a 'date'
            column and an 'avg_polarity' column.
        news_file: Path (or file-like object) of the raw news CSV with a
            'date' column; one row per article.

    Returns:
        A DataFrame indexed by 'date' (sorted ascending) holding the columns
        of the sentiment file plus:
          - 'news_article_count': number of news rows on that day (int, 0
            when the news file has no row for the day),
          - 'sentiment_change': day-over-day difference of 'avg_polarity'
            (0 for the first row),
          - 'momentum_3d' / 'momentum_5d': 3- and 5-row rolling means of
            'avg_polarity' (0 until the window is full).
        None is returned when either file cannot be read or processed, or
        when the sentiment file has no 'avg_polarity' column.
    """

    # --- 1. Load Daily Sentiment Data ---
    try:
        sentiment_df = pd.read_csv(sentiment_file, parse_dates=['date']).set_index('date')
    except Exception as e:
        print(f"  [Error] Could not read sentiment file {sentiment_file}: {e}")
        return None

    # Every engineered feature derives from 'avg_polarity'; fail this file
    # cleanly rather than raising KeyError halfway through.
    if 'avg_polarity' not in sentiment_df.columns:
        print(f"  [Error] Could not read sentiment file {sentiment_file}: missing 'avg_polarity' column")
        return None

    # --- 2. Load and Aggregate News Count ---
    try:
        news_df = pd.read_csv(news_file, parse_dates=['date']).set_index('date')
        # Resample by day ('D') and get the size (count) of each group
        count_df = news_df.resample('D').size().to_frame('news_article_count')
    except Exception as e:
        print(f"  [Error] Could not read or process news file {news_file}: {e}")
        return None

    # --- 3. Merge Sentiment and Counts ---
    # Left join: keep exactly the days present in the sentiment data.
    print("  > Merging sentiment scores and article counts...")
    merged_df = sentiment_df.join(count_df, how='left')

    # Days absent from the news counts get 0 articles. Assign the result
    # back instead of using a chained `fillna(..., inplace=True)`: the
    # chained form operates on a temporary under pandas Copy-on-Write and
    # would leave NaN in place, making the int cast below fail.
    merged_df['news_article_count'] = merged_df['news_article_count'].fillna(0).astype(int)

    # diff() and rolling() are positional, so rows must be in date order.
    merged_df = merged_df.sort_index()

    # --- 4. Engineer New Features ---
    print("  > Engineering features: change, momentum, count...")
    polarity = merged_df['avg_polarity']

    # sentiment_change: difference from the previous row (first row -> 0)
    merged_df['sentiment_change'] = polarity.diff().fillna(0)

    # sentiment momentum: rolling means; rows before the window is full
    # have no value and are stored as 0.
    merged_df['momentum_3d'] = polarity.rolling(window=3).mean().fillna(0)
    merged_df['momentum_5d'] = polarity.rolling(window=5).mean().fillna(0)

    return merged_df

def main():
    """
    Turn every '<ticker>_sentiment.csv' file into a '<ticker>_features.csv'
    file by pairing it with the ticker's raw news file.

    Tickers whose raw news file is missing are skipped and counted together
    with failures. A summary is printed at the end.
    """
    # --- Configuration ---
    sentiment_dir = "daily_sentiment_data"
    news_dir = "Stock News"
    features_dir = "final_sentiment_features"  # Folder receiving this module's output

    os.makedirs(features_dir, exist_ok=True)

    # Collect the per-ticker sentiment files to be processed.
    sentiment_files = glob.glob(os.path.join(sentiment_dir, "*_sentiment.csv"))

    if len(sentiment_files) == 0:
        print(f"No *_sentiment.csv files found in '{sentiment_dir}' folder.")
        print("Please run 'process_news_sentiment.py' first.")
        return

    print(f"\nFound {len(sentiment_files)} sentiment files. Starting feature engineering...")

    processed = 0
    not_processed = 0

    for path in sentiment_files:
        # The ticker is whatever precedes the '_sentiment.csv' suffix.
        ticker = os.path.basename(path).split('_sentiment.csv')[0]

        print(f"\n--- Processing: {ticker} ---")

        # The article counts come from the ticker's original news file.
        raw_news_path = os.path.join(news_dir, f"{ticker}_news.csv")

        if not os.path.exists(raw_news_path):
            print(f"  [Skipped] No matching raw news file found at: {raw_news_path}")
            not_processed += 1
            continue

        features = process_features(path, raw_news_path)

        if features is None:
            print(f"  [Failed] Could not process {ticker}.")
            not_processed += 1
            continue

        # Turn the 'date' index back into an ordinary column for the CSV.
        features = features.reset_index()

        target = os.path.join(features_dir, f"{ticker}_features.csv")
        features.to_csv(target, index=False)

        print(f"  [Success] Saved to: {target}")
        processed += 1

    print(f"\n--- Feature Engineering Complete ---")
    print(f"Successfully processed: {processed} stocks")
    print(f"Skipped/Failed: {not_processed} stocks")

# Run the feature-engineering step only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()