In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import numpy as np
from typing import List, Dict, Union
import logging
from datetime import datetime
import torch

class FinancialSentimentAnalyzer:
    """Advanced financial sentiment analysis with aggregation.

    Wraps a Hugging Face ``sentiment-analysis`` pipeline (FinBERT by default)
    and adds helpers to score a single text, score a list of chunks,
    aggregate chunk-level results, and build a per-document DataFrame.
    """

    # Character cap applied to the input before it reaches the pipeline.
    MAX_TEXT_CHARS = 512
    # Token cap enforced by the tokenizer. The character cap alone does not
    # guarantee this: token-dense text can still exceed it once special
    # tokens are added.
    MAX_TOKENS = 512

    def __init__(self, model_name: str = "ProsusAI/finbert"):
        """Build the pipeline for *model_name* (GPU 0 if CUDA is available, else CPU)."""
        self.model_name = model_name
        self.pipeline = pipeline("sentiment-analysis",
                                 model=model_name,
                                 tokenizer=model_name,
                                 device=0 if torch.cuda.is_available() else -1)
        self.logger = logging.getLogger(__name__)

        # Sign applied to the model confidence to obtain a score in [-1, 1].
        self.sentiment_mapping = {
            'positive': 1,
            'negative': -1,
            'neutral': 0
        }

    @staticmethod
    def _neutral_result(text_length: int = 0) -> Dict:
        """Return the zero-confidence neutral result used for empty or failed inputs."""
        return {
            'sentiment': 'neutral',
            'confidence': 0.0,
            'score': 0.0,
            'text_length': text_length
        }

    def analyze_text(self, text: str) -> Dict:
        """Analyze sentiment of a single text.

        Args:
            text: Text to classify. ``None``, non-string values (e.g. a NaN
                from a DataFrame column) and blank strings are treated as
                missing text.

        Returns:
            Dict with ``sentiment`` (lower-cased label), ``confidence`` (model
            score), ``score`` (confidence signed by ``sentiment_mapping``) and
            ``text_length`` (characters actually analyzed). Missing text and
            pipeline failures yield a neutral result with zero confidence;
            failures are logged rather than raised.
        """
        if not isinstance(text, str) or not text.strip():
            return self._neutral_result()

        # Cheap character-level cap; keeps results identical to earlier runs.
        text = text[:self.MAX_TEXT_CHARS]

        try:
            # truncation/max_length are forwarded to the tokenizer so that
            # token-dense input cannot overflow the model's position limit.
            result = self.pipeline(
                text, truncation=True, max_length=self.MAX_TOKENS
            )[0]
            sentiment = result['label'].lower()
            confidence = result['score']
        except Exception as e:
            # Best-effort by design: one bad text must not abort a batch.
            self.logger.error("Sentiment analysis error: %s", e, exc_info=True)
            return self._neutral_result(len(text))

        # Unknown labels map to 0 so they behave like neutral.
        score = self.sentiment_mapping.get(sentiment, 0) * confidence

        return {
            'sentiment': sentiment,
            'confidence': confidence,
            'score': score,
            'text_length': len(text)
        }

    def analyze_chunks(self, chunks: List[Dict]) -> List[Dict]:
        """Analyze sentiment for multiple document chunks.

        Args:
            chunks: Dicts that each provide ``'text'`` and ``'chunk_id'``.

        Returns:
            One dict per chunk with ``chunk_id``, ``sentiment``,
            ``confidence``, ``score`` and ``text_preview`` (the first 100
            characters, followed by ``"..."`` when the text is longer).
        """
        results = []

        for chunk in chunks:
            text = chunk['text']
            sentiment_result = self.analyze_text(text)

            # analyze_text tolerates missing text; keep the preview equally tolerant.
            preview_source = text if isinstance(text, str) else ''
            if len(preview_source) > 100:
                preview = preview_source[:100] + "..."
            else:
                preview = preview_source

            results.append({
                'chunk_id': chunk['chunk_id'],
                'sentiment': sentiment_result['sentiment'],
                'confidence': sentiment_result['confidence'],
                'score': sentiment_result['score'],
                'text_preview': preview
            })

        return results

    def aggregate_sentiment(self, sentiment_results: List[Dict],
                            method: str = 'weighted_average') -> Dict:
        """Aggregate sentiment scores across chunks.

        Args:
            sentiment_results: Dicts with ``score``, ``confidence`` and
                ``sentiment`` keys, as produced by ``analyze_chunks``.
            method: ``'weighted_average'`` weights scores by confidence; any
                other value uses the plain mean.

        Returns:
            Dict with ``overall_sentiment``, ``overall_score``, ``confidence``
            (mean confidence), ``sentiment_distribution`` (fraction of chunks
            per label), ``total_chunks`` and ``score_std``.
        """
        if not sentiment_results:
            return {
                'overall_sentiment': 'neutral',
                'overall_score': 0.0,
                'confidence': 0.0,
                'sentiment_distribution': {'positive': 0, 'negative': 0, 'neutral': 0},
                'total_chunks': 0,
                'score_std': 0.0
            }

        scores = [r['score'] for r in sentiment_results]
        confidences = [r['confidence'] for r in sentiment_results]
        sentiments = [r['sentiment'] for r in sentiment_results]

        # np.average raises ZeroDivisionError when the weights sum to zero,
        # which happens whenever every chunk was empty or failed (confidence
        # 0.0). Fall back to the plain mean in that case.
        if method == 'weighted_average' and sum(confidences) > 0:
            overall_score = np.average(scores, weights=confidences)
        else:
            overall_score = np.mean(scores)

        # Scores within +/-0.1 of zero are reported as neutral.
        if overall_score > 0.1:
            overall_sentiment = 'positive'
        elif overall_score < -0.1:
            overall_sentiment = 'negative'
        else:
            overall_sentiment = 'neutral'

        total_chunks = len(sentiment_results)
        sentiment_distribution = {
            label: sentiments.count(label) / total_chunks
            for label in ('positive', 'negative', 'neutral')
        }

        return {
            'overall_sentiment': overall_sentiment,
            'overall_score': overall_score,
            'confidence': np.mean(confidences),
            'sentiment_distribution': sentiment_distribution,
            'total_chunks': total_chunks,
            'score_std': np.std(scores)
        }

    def temporal_sentiment_analysis(self, documents: List[Dict]) -> pd.DataFrame:
        """Analyze sentiment trends over time.

        Args:
            documents: Dicts read via ``'date'``, ``'ticker'`` and either
                ``'chunks'`` (list of chunk dicts) or ``'text'``. When
                ``'chunks'`` is absent or ``None`` the whole ``'text'`` is
                analyzed as a single chunk with id 0.

        Returns:
            DataFrame with one row per document: ``date``, ``ticker``,
            ``sentiment``, ``score``, ``confidence`` and the positive /
            negative / neutral chunk ratios.
        """
        results = []

        for doc in documents:
            chunks = doc.get('chunks')
            if chunks is None:
                chunks = [{'text': doc.get('text', ''), 'chunk_id': 0}]

            aggregated = self.aggregate_sentiment(self.analyze_chunks(chunks))
            distribution = aggregated['sentiment_distribution']

            results.append({
                'date': doc.get('date'),
                'ticker': doc.get('ticker'),
                'sentiment': aggregated['overall_sentiment'],
                'score': aggregated['overall_score'],
                'confidence': aggregated['confidence'],
                'positive_ratio': distribution['positive'],
                'negative_ratio': distribution['negative'],
                'neutral_ratio': distribution['neutral']
            })

        return pd.DataFrame(results)


class StreamlitSentiment:
    """Adapter that flattens analyzer output into the dict shape a Streamlit UI consumes."""

    def __init__(self):
        """Create the underlying FinancialSentimentAnalyzer with its default model."""
        self.sentiment_analyzer = FinancialSentimentAnalyzer()

    def analyze_for_streamlit(self, text):
        """Score *text* and return a flat, display-ready result dict.

        On success the dict carries the capitalized label, the signed score
        (under both ``compound`` and ``score``), the confidence, and one
        entry per label that holds the confidence for the predicted label and
        0.0 for the others, with ``status`` set to ``"success"``.

        Any exception is converted into a dict with ``status`` ``"error"``,
        ``sentiment`` ``"Error"`` and the exception text under ``error``.
        """
        try:
            raw = self.sentiment_analyzer.analyze_text(text)

            # Missing keys fall back to a neutral, zero-valued reading.
            label = raw.get("sentiment", "neutral")
            signed_score = raw.get("score", 0.0)
            conf = raw.get("confidence", 0.0)

            payload = {
                "sentiment": label.capitalize(),
                "compound": signed_score,  # same value as "score", kept for compatibility
                "confidence": conf,
                "score": signed_score,
            }
            # Only the predicted label receives the confidence mass.
            for name in ("positive", "negative", "neutral"):
                payload[name] = conf if label == name else 0.0
            payload["status"] = "success"
            return payload
        except Exception as exc:
            zeroed = dict.fromkeys(
                ("compound", "confidence", "score", "positive", "negative"), 0.0
            )
            return {
                "sentiment": "Error",
                **zeroed,
                "neutral": 1.0,
                "status": "error",
                "error": str(exc),
            }

# Initialize the sentiment analyzer.
# Constructing StreamlitSentiment builds a FinancialSentimentAnalyzer with its
# default model ("ProsusAI/finbert"), which creates the transformers pipeline
# inside the constructor. The two prints bracket that step so its start and
# completion are visible in the cell output.
# NOTE(review): the pipeline presumably loads or downloads model weights here,
# so this line can be slow on first run - confirm.
print("🤖 Loading FinBERT sentiment model...")
# Module-level instance that later code is expected to use via this name.
streamlit_sentiment = StreamlitSentiment()
print("✅ Financial Sentiment Analysis initialized for Streamlit")


🤖 Loading FinBERT sentiment model...


Device set to use cpu


✅ Financial Sentiment Analysis initialized for Streamlit
