In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. Called with no arguments,
# so the token is entered interactively rather than being stored in the notebook.
# NOTE(review): presumably required because the default model used below
# ("meta-llama/Llama-2-7b-chat-hf") is a gated repository -- confirm.
login() # enter hugging face token

In [4]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, click, nltk
Successfully installed click-8.1.8 joblib-1.4.2 nltk-3.9.1


In [None]:
!python -m nltk.downloader all

^C


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ARIJIT\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ARIJIT\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\ARIJIT\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\ARIJIT\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\ARIJIT\AppData\Roaming\nltk_data

In [None]:
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import List, Dict, Optional
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (run once; already-present packages are skipped)
nltk.download('punkt')
# NLTK >= 3.8.2 makes word_tokenize load the 'punkt_tab' resource instead of the
# pickled 'punkt' models; without it preprocess_article raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

class NewsSentimentAnalyzer:
    """Classify the sentiment of news articles with a 4-bit quantized causal LM.

    The language model and tokenizer are loaded lazily on the first call to
    ``analyze_sentiment``; constructing the object only prepares the
    quantization config and the NLTK preprocessing tools.
    """

    # Every label the output parser recognises. The "strongly_" variants come
    # first so they are matched as a whole word when present.
    _LABEL_ALTERNATION = r'(strongly_positive|positive|neutral|negative|strongly_negative)'

    # Generation budget. The prompt asks for a JSON object; 20 tokens is often
    # not enough to get past a short preamble to the "sentiment" field.
    MAX_NEW_TOKENS = 64

    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf"):
        """Store the model name and set up quantization and preprocessing tools.

        Args:
            model_name: Hugging Face model id passed to ``from_pretrained``
                when the model is first needed.
        """
        # Configure quantization to reduce memory usage
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

        # Initialize preprocessing tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

        # Load model later when needed (lazy loading)
        self.model_name = model_name
        self.tokenizer = None
        self.model = None

    def _initialize_model(self):
        """Load tokenizer and model on first use; later calls are no-ops."""
        if self.model is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=self.quantization_config,
                device_map="auto",
                torch_dtype=torch.float16
            )
            # Fall back to EOS as the padding token when the tokenizer defines none
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

    def preprocess_article(self, text: str, remove_stopwords: bool = True, 
                         lemmatize: bool = True, min_length: int = 100) -> str:
        """
        Preprocess news article text:
        1. Remove every character that is not an ASCII letter or whitespace
        2. Lowercase all text
        3. Collapse runs of whitespace
        4. (Optional) Remove English stopwords
        5. (Optional) Lemmatize words
        6. Ensure minimum length

        Args:
            text: Raw article text.
            remove_stopwords: Drop tokens found in the NLTK English stopword list.
            lemmatize: Lemmatize each remaining token with WordNet.
            min_length: Minimum number of tokens the processed text must have.

        Returns:
            The processed tokens joined by single spaces. If fewer than
            ``min_length`` tokens remain, the cleaned text from steps 1-3
            (stopwords kept, not lemmatized) is returned instead, cut to
            ``min_length * 6`` characters.
        """
        # Basic cleaning
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special chars and numbers
        text = text.lower().strip()  # Lowercase and trim
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace

        words = word_tokenize(text)

        if remove_stopwords:
            words = [w for w in words if w not in self.stop_words]

        if lemmatize:
            words = [self.lemmatizer.lemmatize(w) for w in words]

        processed_text = ' '.join(words)

        # Too little text left after filtering: keep the merely-cleaned text
        if len(processed_text.split()) < min_length:
            return text[:min_length*6] 
        return processed_text

    @staticmethod
    def _extract_sentiment(generated_text: str) -> str:
        """Pull the sentiment label out of the model's generated answer.

        An explicit ``"sentiment": "<label>"`` / ``Sentiment: <label>`` field is
        preferred; otherwise the first stand-alone label word is used. Returns
        "neutral" when no label is found.
        """
        lowered = generated_text.lower()
        label = NewsSentimentAnalyzer._LABEL_ALTERNATION

        field_match = re.search(r'sentiment"?\s*:\s*"?' + label + r'\b', lowered)
        if field_match:
            return field_match.group(1)

        word_match = re.search(r'\b' + label + r'\b', lowered)
        return word_match.group(1) if word_match else "neutral"

    def analyze_sentiment(self, text: str, confidence: bool = False) -> Dict[str, str]:
        """
        Analyze sentiment of news article text.

        Args:
            text: Article text; only the first 3000 characters are put in the prompt.
            confidence: When True, add a heuristic "confidence" float to the result.

        Returns:
            Dictionary with "sentiment" (label string, "neutral" if none could be
            parsed), "processed_text" (``preprocess_article(text)``) and, when
            requested, "confidence".
        """
        self._initialize_model()

        prompt = f"""
            [INST] <<SYS>>
            You are an expert financial news sentiment analyst. Your task is to:
            1. Carefully read the news article
            2. Analyze both explicit statements and implied tone
            3. Consider potential market impact
            4. Classify the sentiment into one of three categories:
            - "positive" (generally favorable, likely to boost asset prices)
            - "neutral" (factual reporting without clear bias)
            - "negative" (generally unfavorable, likely to depress asset prices)
            5. Provide your output in this exact JSON format:
            {{
            "sentiment": "your_sentiment_classification",
            "confidence": 0.00-1.00,
            "key_phrases": ["list", "of", "3-5", "phrases"],
            "reasoning": "1-2 sentence explanation"
            }}

            Guidelines:
            - Be strict with classifications (only 3 options)
            - Confidence should reflect certainty (0.5 = unsure, 0.9+ = very certain)
            - Key phrases should be most sentiment-indicative snippets
            - Reasoning should cite specific article content
            <</SYS>>

            News Article: {text[:3000]}... [truncated if necessary]

            Analysis:
            [/INST]"""

        # NOTE(review): if the prompt exceeds max_length, truncation presumably
        # removes tokens from the end, i.e. the closing [/INST] -- confirm that
        # 3000 characters of article plus the instructions stay under 1024 tokens.
        inputs = self.tokenizer(
            prompt, 
            return_tensors="pt", 
            truncation=True, 
            max_length=1024  # Increased for news context
        ).to(self.model.device)  # follow the model's device instead of assuming "cuda"

        # generate() returns the prompt tokens followed by the new tokens;
        # remember where the prompt ends so only the answer is decoded.
        prompt_token_count = inputs["input_ids"].shape[1]

        # Generate output with constrained parameters
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.MAX_NEW_TOKENS,
                temperature=0.3,  # Lower for more deterministic results
                do_sample=True,
                top_p=0.9,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Decode the answer only. Decoding the full sequence would include the
        # instructions, whose first label word ("positive") would always win.
        generated_text = self.tokenizer.decode(
            outputs[0][prompt_token_count:], 
            skip_special_tokens=True
        )

        result = {
            "sentiment": self._extract_sentiment(generated_text),
            "processed_text": self.preprocess_article(text)
        }

        if confidence:
            result["confidence"] = self._estimate_confidence(generated_text)

        return result

    def _estimate_confidence(self, model_output: str) -> float:
        """
        Heuristic confidence score derived from the text of the model's answer.

        Starts at 0.7, adds 0.15 when a "strongly_" label appears and 0.1 when
        the answer is at most five whitespace-separated words; the result is
        clamped to [0.5, 0.99].
        """
        clean_output = model_output.lower()
        confidence = 0.7  # Base confidence

        # Increase confidence if output is clear and concise
        if "strongly_" in clean_output:
            confidence += 0.15
        if len(clean_output.split()) <= 5:  # Very concise answer
            confidence += 0.1

        return min(0.99, max(0.5, confidence))  # Keep within reasonable bounds

    def batch_analyze(self, articles: List[str], show_progress: bool = True) -> List[Dict[str, str]]:
        """
        Analyze multiple articles with optional progress display.

        Args:
            articles: Article texts, analyzed one at a time in order.
            show_progress: Wrap the iteration in a tqdm progress bar.

        Returns:
            One result dict per article, in input order. An article that
            triggers a CUDA out-of-memory error yields
            ``{"sentiment": "error", "processed_text": <first 100 chars>}``;
            any other exception propagates.
        """
        results = []
        if show_progress:
            from tqdm import tqdm
            iterator = tqdm(articles, desc="Analyzing articles")
        else:
            iterator = articles

        for article in iterator:
            try:
                results.append(self.analyze_sentiment(article))
            except torch.cuda.OutOfMemoryError:
                # Release cached blocks so the next article has a chance to fit
                torch.cuda.empty_cache()
                results.append({"sentiment": "error", "processed_text": article[:100]})
        return results

# Example usage: run the analyzer over three hard-coded sample articles
if __name__ == "__main__":
    analyzer = NewsSentimentAnalyzer()

    # One optimistic, one pessimistic and one matter-of-fact sample
    news_articles = [
        "The stock market reached record highs today as the Federal Reserve announced it would maintain current interest rates. Analysts are optimistic about continued growth in the tech sector.",
        "Company XYZ shares plummeted 20% after disappointing earnings report. CEO admits to strategic missteps and warns of potential layoffs in coming quarters.",
        "The European Central Bank met today to discuss inflation trends. No policy changes were announced, with officials taking a wait-and-see approach."
    ]

    # Clean each article, then classify the cleaned text
    processed_articles = list(map(analyzer.preprocess_article, news_articles))
    results = analyzer.batch_analyze(processed_articles)

    # Report lengths, predicted label and a preview of the cleaned text
    for original, processed, result in zip(news_articles, processed_articles, results):
        report_lines = (
            f"\nOriginal Length: {len(original)} chars",
            f"Processed Length: {len(processed)} chars",
            f"Sentiment: {result['sentiment']}",
            f"Sample: {processed[:200]}...\n",
        )
        print(*report_lines, sep="\n")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards:   0%|          | 0/2 [08:26<?, ?it/s]


KeyboardInterrupt: 

In [9]:
import pandas as pd
import numpy as np

In [None]:
# Run the analyzer over the "News" column of the collected dataset
if __name__ == "__main__":
    analyzer = NewsSentimentAnalyzer()

    dataframe = pd.read_csv(
        filepath_or_buffer="../Data Collection/data.csv"
    )

    # Articles to analyze. Rows with a missing "News" value are dropped and the
    # rest coerced to str: pandas reads empty cells as float NaN, which would
    # make the regex cleaning in preprocess_article raise TypeError.
    news_articles = dataframe["News"].dropna().astype(str)

    # Preprocess and analyze
    processed_articles = [analyzer.preprocess_article(art) for art in news_articles]
    results = analyzer.batch_analyze(processed_articles)

    # Display results
    for original, processed, result in zip(news_articles, processed_articles, results):
        print(f"\nOriginal Length: {len(original)} chars")
        print(f"Processed Length: {len(processed)} chars")
        print(f"Sentiment: {result['sentiment']}")
        print(f"Sample: {processed[:200]}...\n")