In [None]:
from huggingface_hub import login
import os
from dotenv import load_dotenv
load_dotenv(
    dotenv_path="../.env"
)

token = os.getenv(
    key = "FINGPT_ACCESS_TOKEN"
)
# # print(token)
login(
    token=token) # enter hugging face token

In [2]:
!pip install nltk



In [3]:
!python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

In [4]:
!pip install bitsandbytes
!pip install transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [5]:
pip uninstall  bitsandbytes

Found existing installation: bitsandbytes 0.45.5
Uninstalling bitsandbytes-0.45.5:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/bitsandbytes-0.45.5.dist-info/*
    /usr/local/lib/python3.11/dist-packages/bitsandbytes/*
Proceed (Y/n)? y
  Successfully uninstalled bitsandbytes-0.45.5


In [6]:
!pip install -U bitsandbytes



Collecting bitsandbytes
  Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [7]:
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import Tuple, Dict

class SentimentAnalyzer:
    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf"):
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.labels = ["negative", "neutral", "positive"]

    def _initialize_model(self):
        if self.model is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=self.quantization_config,
                device_map="auto",
                torch_dtype=torch.float16
            )
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

    def analyze_sentiment(self, text: str) -> Tuple[str, Dict[str, float]]:
        """
        Analyze sentiment and return both label and softmax probabilities

        Returns:
            tuple: (predicted_label, {"negative": prob, "neutral": prob, "positive": prob})
        """
        self._initialize_model()

        # More explicit prompt with examples
        prompt = f"""Analyze the sentiment of this financial news text and provide ONLY a JSON response with confidence scores that sum to 1.0:

Examples:
Good news: {{"negative": 0.1, "neutral": 0.2, "positive": 0.7}}
Bad news: {{"negative": 0.8, "neutral": 0.1, "positive": 0.1}}
Neutral news: {{"negative": 0.2, "neutral": 0.6, "positive": 0.2}}

Text: "{text[:2000]}"

JSON response:"""

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        try:
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    temperature=0.3,  # Lower temperature for more focused answers
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            decoded_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract JSON portion
            try:
                start = decoded_output.find('{')
                end = decoded_output.rfind('}') + 1
                scores = json.loads(decoded_output[start:end])

                # Validate scores
                if not all(k in scores for k in self.labels):
                    raise ValueError("Missing sentiment classes")

                # Convert to probabilities (softmax)
                score_values = torch.tensor([scores["negative"], scores["neutral"], scores["positive"]])
                probabilities = torch.nn.functional.softmax(score_values, dim=0)

                # Get predicted label
                predicted_idx = torch.argmax(probabilities).item()
                predicted_label = self.labels[predicted_idx]

                return predicted_label, {
                    "negative": round(probabilities[0].item(), 4),
                    "neutral": round(probabilities[1].item(), 4),
                    "positive": round(probabilities[2].item(), 4)
                }

            except (json.JSONDecodeError, KeyError, ValueError) as e:
                print(f"JSON parsing failed: {e}")
                print(f"Model output: {decoded_output}")
                # Return neutral as fallback
                return "neutral", {"negative": 0.2, "neutral": 0.6, "positive": 0.2}

        except Exception as e:
            print(f"Analysis failed: {e}")
            return "error", {"negative": 0.0, "neutral": 0.0, "positive": 0.0}

if __name__ == "__main__":
    analyzer = SentimentAnalyzer()

    test_texts = [
        "Company XYZ shares plummeted 20% after disappointing earnings report. CEO admits to strategic missteps and warns of potential layoffs in coming quarters.",
        "The European Central Bank met today to discuss inflation trends. No policy changes were announced, with officials taking a wait-and-see approach.",
        "Breaking: Major merger announced between leading tech giants. The $50 billion deal is expected to create the largest player in the cloud computing space."
    ]

    for text in test_texts:
        label, scores = analyzer.analyze_sentiment(text)
        print("\n" + "="*50)
        print(f"Text: {text[:80]}...")
        print(f"Predicted: {label}")
        print("Confidence Scores:")
        for k, v in scores.items():
            print(f"  {k}: {v:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

JSON parsing failed: Extra data: line 2 column 1 (char 51)
Model output: Analyze the sentiment of this financial news text and provide ONLY a JSON response with confidence scores that sum to 1.0:

Examples:
Good news: {"negative": 0.1, "neutral": 0.2, "positive": 0.7}
Bad news: {"negative": 0.8, "neutral": 0.1, "positive": 0.1}
Neutral news: {"negative": 0.2, "neutral": 0.6, "positive": 0.2}

Text: "Company XYZ shares plummeted 20% after disappointing earnings report. CEO admits to strategic missteps and warns of potential layoffs in coming quarters."

JSON response:
{
"negative": 0.8,
"neutral": 0.2,
"positive": 0.0
}

Text: Company XYZ shares plummeted 20% after disappointing earnings report. CEO admits...
Predicted: neutral
Confidence Scores:
  negative: 0.2000
  neutral: 0.6000
  positive: 0.2000
JSON parsing failed: Extra data: line 2 column 1 (char 51)
Model output: Analyze the sentiment of this financial news text and provide ONLY a JSON response with confidence scores that sum 

In [None]:
import bitsandbytes as bnb
import torch

bnb.__version__

In [None]:
import pandas as pd
import numpy as np

In [None]:
if __name__ == "__main__":
    analyzer = NewsSentimentAnalyzer()

    dataframe = pd.read_csv(
        filepath_or_buffer="/content/data.csv"
    )

    news_articles = dataframe["News"]

    processed_articles = [analyzer.preprocess_article(art) for art in news_articles]
    results = analyzer.batch_analyze(processed_articles)

    prin

    for original, processed, result in zip(news_articles, processed_articles, results):
        print(f"\nOriginal Length: {len(original)} chars")
        print(f"Processed Length: {len(processed)} chars")
        print(f"Sentiment: {result['sentiment']}")
        print(f"Sample: {processed[:200]}...\n")

In [3]:
import torch
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import Tuple, Dict, List

class SentimentAnalyzer:
    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf"):
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.labels = ["negative", "neutral", "positive"]

    def _initialize_model(self):
        if self.model is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=self.quantization_config,
                device_map="auto",
                torch_dtype=torch.float16
            )
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

    def analyze_sentiment(self, text: str) -> Tuple[str, Dict[str, float]]:
        """Analyze sentiment with improved prompt engineering"""
        self._initialize_model()

        prompt = f"""Analyze this financial news sentiment and provide:
1. Primary sentiment (negative/neutral/positive)
2. Confidence scores (must sum to 1.0)

Guidelines:
- Negative: Bad news, declining metrics, warnings
- Neutral: Factual reporting, mixed indicators
- Positive: Good news, growth, positive outlook

Example Output:
{{
    "sentiment": "negative",
    "scores": {{
        "negative": 0.85,
        "neutral": 0.10,
        "positive": 0.05
    }},
    "reason": "Article discusses significant stock drop and layoffs"
}}

Text: "{text[:2000]}"

Analysis Output:"""

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        try:
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            decoded_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            try:
                # Improved JSON extraction
                json_str = decoded_output.split("Analysis Output:")[-1].strip()
                result = json.loads(json_str)

                # Validate structure
                if not all(k in result for k in ["sentiment", "scores"]):
                    raise ValueError("Invalid output format")

                sentiment = result["sentiment"].lower()
                if sentiment not in self.labels:
                    sentiment = "neutral"

                scores = result["scores"]
                if not all(k in scores for k in self.labels):
                    raise ValueError("Missing sentiment scores")

                # Normalize scores to sum to 1
                total = sum(scores.values())
                normalized_scores = {k: round(v/total, 4) for k, v in scores.items()}

                return sentiment, normalized_scores

            except (json.JSONDecodeError, KeyError, ValueError) as e:
                print(f"Output parsing error: {e}")
                print(f"Raw output: {decoded_output}")
                return "neutral", {k: 0.3333 for k in self.labels}

        except Exception as e:
            print(f"Analysis error: {e}")
            return "error", {k: 0.0 for k in self.labels}

def analyze_and_save_to_csv(texts: List[str], output_file: str = "/content/sentiment_results.csv"):
    analyzer = SentimentAnalyzer()
    results = []

    for text in texts:
        label, scores = analyzer.analyze_sentiment(text)
        results.append({
            "News": text[:100] + "..." if len(text) > 100 else text,
            "Sentiment": label,
            "Negative_Score": scores["negative"],
            "Neutral_Score": scores["neutral"],
            "Positive_Score": scores["positive"],
            "Confidence": scores[label]  # Highest score
        })

    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    return df

if __name__ == "__main__":
    test_texts = [
        "Company XYZ shares plummeted 20% after disappointing earnings report. CEO admits to strategic missteps and warns of potential layoffs in coming quarters.",
        "The European Central Bank met today to discuss inflation trends. No policy changes were announced, with officials taking a wait-and-see approach.",
        "Breaking: Major merger announced between leading tech giants. The $50 billion deal is expected to create the largest player in the cloud computing space.",
        "Apple reports record quarterly profits driven by strong iPhone sales and services growth, beating analyst expectations.",
        "Automaker recalls 500,000 vehicles due to potential brake system failure, sending shares down 8% in pre-market trading."
    ]

    results_df = analyze_and_save_to_csv(test_texts)
    print("\nResults:")
    print(results_df.to_string())

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Results saved to /content/sentiment_results.csv

Results:
                                                                                                      News Sentiment  Negative_Score  Neutral_Score  Positive_Score  Confidence
0  Company XYZ shares plummeted 20% after disappointing earnings report. CEO admits to strategic misste...  negative            0.85           0.05            0.10        0.85
1  The European Central Bank met today to discuss inflation trends. No policy changes were announced, w...   neutral            0.00           0.60            0.40        0.60
2  Breaking: Major merger announced between leading tech giants. The $50 billion deal is expected to cr...  positive            0.05           0.90            0.05        0.05
3  Apple reports record quarterly profits driven by strong iPhone sales and services growth, beating an...  positive            0.00           0.60            0.40        0.40
4  Automaker recalls 500,000 vehicles due to potential brake s

In [8]:
import torch
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

class SentimentAnalyzer:
    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf"):
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        self.model_name = model_name
        self.tokenizer = None
        self.model = None

    def _initialize_model(self):
        if self.model is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=self.quantization_config,
                device_map="auto",
                torch_dtype=torch.float16
            )

    def analyze_sentiment(self, text: str) -> dict:
        """Improved sentiment analysis with better prompt engineering"""
        self._initialize_model()

        prompt = """[INST] Analyze the sentiment of this financial news text.
        Consider both content and tone. Return ONLY a JSON object with:
        - "sentiment" (positive/neutral/negative)
        - "confidence" (0.0-1.0)
        - "reason" (brief explanation)

        Examples:
        {
            "sentiment": "positive",
            "confidence": 0.95,
            "reason": "Record profits and growth mentioned"
        }
        {
            "sentiment": "negative",
            "confidence": 0.88,
            "reason": "Significant stock drop and layoffs announced"
        }

        Text: """ + f'"{text[:2000]}"[/INST]'

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        try:
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    temperature=0.3,
                    do_sample=True,
                    top_p=0.9
                )

            decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            try:
                # Extract JSON from output
                json_str = decoded.split('[/INST]')[-1].strip()
                result = json.loads(json_str)

                # Validate response
                if result["sentiment"].lower() not in ["positive", "neutral", "negative"]:
                    raise ValueError("Invalid sentiment")
                if not 0 <= result["confidence"] <= 1:
                    raise ValueError("Invalid confidence")

                return result

            except (json.JSONDecodeError, KeyError, ValueError) as e:
                print(f"Parsing error: {e}")
                return {
                    "sentiment": "neutral",
                    "confidence": 0.5,
                    "reason": "Analysis failed"
                }

        except Exception as e:
            print(f"Analysis error: {e}")
            return {
                "sentiment": "error",
                "confidence": 0.0,
                "reason": str(e)
            }

def analyze_news(news_items: list, output_file="sentiment_results.csv"):
    analyzer = SentimentAnalyzer()
    results = []

    for item in news_items:
        analysis = analyzer.analyze_sentiment(item)
        results.append({
            "News": item[:100] + "..." if len(item) > 100 else item,
            "Sentiment": analysis["sentiment"].capitalize(),
            "Confidence": analysis["confidence"],
            "Reason": analysis["reason"]
        })

    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    return df

# Example usage
news_samples = [
    "Tesla stock surges 12% after reporting record deliveries and new factory openings in Europe and Asia.",
    "Federal Reserve keeps interest rates unchanged, citing stable inflation and moderate economic growth.",
    "Major pharmaceutical company recalls popular medication after reports of severe side effects, shares drop 30%.",
    "Tech giant announces revolutionary AI assistant that outperforms humans in creative tasks, stock hits all-time high.",
    "Automotive industry faces supply chain disruptions as key component shortage leads to production delays worldwide."
]

results = analyze_news(news_samples)
print(results.to_string())

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing error: Expecting value: line 1 column 1 (char 0)
Parsing error: Expecting value: line 1 column 1 (char 0)
Parsing error: Expecting value: line 1 column 1 (char 0)
                                                                                                      News Sentiment  Confidence                                                Reason
0  Tesla stock surges 12% after reporting record deliveries and new factory openings in Europe and Asia...  Positive        0.90  Record deliveries and new factory openings mentioned
1  Federal Reserve keeps interest rates unchanged, citing stable inflation and moderate economic growth...   Neutral        0.50                                       Analysis failed
2  Major pharmaceutical company recalls popular medication after reports of severe side effects, shares...  Negative        0.88      Severe side effects and product recall mentioned
3  Tech giant announces revolutionary AI assistant that outperforms humans in creative tasks, sto

In [None]:
import torch
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import List, Dict
import pandas as pd
import numpy as np
import time
import os

# Global variables
INPUT_FILE_ADDRESS = "../Data Collection/data.csv"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
COMPUTATION_TIME_DATA = "model_prediction_time_data.csv"

class FinancialSentimentAnalyzer:
    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf"):
        # Configure model quantization for efficient loading
        self.quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        self.model_name = model_name
        self.tokenizer = None
        self.model = None

    def initialize_model(self):
        """Lazy initialization of model to save resources"""
        if self.model is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=self.quant_config,
                device_map="auto",
                torch_dtype=torch.float16
            )
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate_analysis_prompt(self, text: str) -> str:
        """Construct a precise prompt for financial sentiment analysis"""
        return f"""<s>[INST] <<SYS>>
You are an expert financial analyst. Analyze the sentiment of this news text and return:
1. Sentiment classification (positive/neutral/negative)
2. Confidence score (0.0-1.0)
3. Brief reasoning (1 sentence)

Return ONLY a JSON object in this exact format:
{{
    "sentiment": "classification",
    "confidence": 0.00,
    "reason": "explanation"
}}
<</SYS>>

News Text:
\"\"\"
{text[:2000]}
\"\"\"

Analysis: [/INST]"""

    def parse_model_output(self, output: str) -> Dict:
        """Robust output parsing with multiple fallback strategies"""
        output = output.strip()

        # Strategy 1: Direct JSON parsing
        try:
            start = output.find('{')
            end = output.rfind('}') + 1
            result = json.loads(output[start:end])
            if all(k in result for k in ["sentiment", "confidence", "reason"]):
                return result
        except (json.JSONDecodeError, KeyError):
            pass

        # Strategy 2: Keyword-based fallback
        sentiment = "neutral"
        confidence = 0.5
        reason = "Automatic analysis"

        if "positive" in output.lower():
            sentiment = "positive"
            confidence = 0.7
        elif "negative" in output.lower():
            sentiment = "negative"
            confidence = 0.7

        return {
            "sentiment": sentiment,
            "confidence": confidence,
            "reason": reason
        }

    def analyze_sentiment(self, text: str) -> Dict:
        """Perform sentiment analysis with robust error handling"""
        self.initialize_model()

        try:
            prompt = self.generate_analysis_prompt(text)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.5,
                    top_p=0.9,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            analysis_part = decoded.split('[/INST]')[-1].strip()

            result = self.parse_model_output(analysis_part)

            # Normalize and validate results
            result["sentiment"] = result["sentiment"].lower()
            if result["sentiment"] not in ["positive", "neutral", "negative"]:
                result["sentiment"] = "neutral"

            result["confidence"] = max(0.0, min(1.0, float(result["confidence"])))

            return result

        except Exception as e:
            print(f"Analysis error: {str(e)}")
            return {
                "sentiment": "error",
                "confidence": 0.0,
                "reason": str(e)
            }

def analyze_news_articles(articles: List[str], output_path: str = "sentiment_analysis_using_llma2_7b.csv") -> pd.DataFrame:
    """Batch analyze news articles and save results to CSV"""
    analyzer = FinancialSentimentAnalyzer()
    results = []

    for article in articles:
        analysis = analyzer.analyze_sentiment(article)
        results.append({
            "News_Snippet": article[:150] + "..." if len(article) > 150 else article,
            "Sentiment": analysis["sentiment"].capitalize(),
            "Confidence_Score": analysis["confidence"],
            "Analysis_Reason": analysis["reason"]
        })

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    return df

# Example usage with clear financial news examples
if __name__ == "__main__":

    dataframe = pd.read_csv(
        filepath_or_buffer=INPUT_FILE_ADDRESS
    )
    start_time = time.time()
    test_articles = dataframe["News"].values()

    print("Running financial sentiment analysis...")
    results_df = analyze_news_articles(test_articles)

    end_time = time.time()
    
    print("\nAnalysis Results:")
    print(results_df.to_string(index=False))
    print(f"\nResults saved to 'sentiment_results.csv'")
        
    computation_time = end_time - start_time
    
    data = list((MODEL_NAME, computation_time)) # model time data
    
    time_dataframe = pd.DataFrame(
        data=data,columns=["Model", "Computation_time"],
    )
    
    time_dataframe.to_csv(
        path_or_buf=COMPUTATION_TIME_DATA,
        mode='a',
        index=False, 
        header=not os.path.exists(COMPUTATION_TIME_DATA), 
        encoding="utf-8"
    )

Running financial sentiment analysis...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Analysis Results:
                                                                                                                           News_Snippet Sentiment  Confidence_Score                                                                                                                                                                                                                                                                                                                                                                                                                                      Analysis_Reason
              Apple shares jump 8% in pre-market trading after announcing record iPhone sales and beating Q3 earnings estimates by 15%.  Positive               0.8                                                                                                                                                                                                                         