In [1]:
pip install transformers torch sentencepiece torchtext torchdata torchmetrics torchsummary

Collecting torchtext
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchtext, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchmetrics-1.8.2 torchtext-0.18.0


In [2]:
# Mount Google Drive so later cells can write artifacts below /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Drive folder that the joblib.dump cells at the end of the notebook write into.
filepath = "/content/drive/My Drive/MLOps/news_alert"

In [4]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from typing import List, Dict
import re

class EventClassifier:
    """Score a news item against a fixed set of financial event types.

    Scoring is primarily keyword based (see ``rule_based_classification``).
    A Hugging Face text-classification pipeline is run on the text as well,
    but only the score of its ``'positive'`` label is used, as a uniform
    offset added to every event type (see ``classify_event``).

    Attributes:
        event_types: Mapping of event-type name to an integer id. Only the
            keys are used by the methods of this class.
        event_keywords: Mapping of event-type name to the keywords that
            count as evidence for it. ``'Other'`` has no keywords.
        tokenizer, model, classifier: The loaded tokenizer, model and the
            pipeline built from them.
    """

    # Checkpoint loaded when no ``model_name`` is given.
    DEFAULT_MODEL_NAME = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

    # Character cap applied by ``preprocess_text``.
    MAX_CHARS = 512

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """Load the tokenizer/model and build the keyword tables.

        Args:
            model_name: Hugging Face model id or local path passed to
                ``from_pretrained``. Defaults to ``DEFAULT_MODEL_NAME``.
        """
        self.event_types = {
            'M&A': 0,
            'Earnings': 1,
            'Leadership_Change': 2,
            'Partnership': 3,
            'Product_Launch': 4,
            'Other': 5
        }

        # Load pre-trained financial sentiment model as base
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # top_k=None makes the pipeline return a score for every label.
        self.classifier = pipeline(
            "text-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            top_k=None
        )

        # Keywords are matched case-insensitively at the start of a word.
        self.event_keywords = {
            'M&A': ['acquire', 'acquisition', 'merge', 'merger', 'takeover', 'buyout', 'purchase', 'acquires', 'merged'],
            'Earnings': ['earnings', 'revenue', 'profit', 'loss', 'quarterly', 'financial results', 'beat estimates'],
            'Leadership_Change': ['CEO', 'appoint', 'resign', 'step down', 'leadership', 'executive', 'hire'],
            'Partnership': ['partner', 'partnership', 'collaborate', 'joint venture', 'alliance'],
            'Product_Launch': ['launch', 'release', 'new product', 'unveil', 'introduce']
        }

    def preprocess_text(self, text: str) -> str:
        """Return ``text`` cut to at most ``MAX_CHARS`` characters.

        This is a character cap, not a token cap; token-level truncation is
        requested separately when the pipeline is called.
        """
        return text[:self.MAX_CHARS]

    def rule_based_classification(self, text: str) -> Dict[str, float]:
        """Score every event type by counting keyword hits in ``text``.

        Each distinct keyword of an event type that occurs in the text adds
        0.2 to that type's score, capped at 0.8. If no type reaches 0.3,
        ``'Other'`` is set to 0.9.

        Matching is case-insensitive and anchored to the start of a word, so
        ``'acquire'`` matches "acquired" but ``'appoint'`` does not match
        "disappoint" and ``'hire'`` does not match "Yorkshire".

        Args:
            text: The text to scan.

        Returns:
            Mapping of every key in ``event_types`` to its score.
        """
        text_lower = text.lower()
        scores = {event: 0.0 for event in self.event_types}

        for event_type, keywords in self.event_keywords.items():
            # Lower-case the keyword too: the text is lower-cased, so an
            # upper-case keyword such as 'CEO' could otherwise never match.
            # The leading \b assumes keywords start with a word character.
            keyword_count = sum(
                1 for keyword in keywords
                if re.search(r"\b" + re.escape(keyword.lower()), text_lower)
            )
            if keyword_count > 0:
                scores[event_type] = min(keyword_count * 0.2, 0.8)  # Normalize score

        # Default to Other if no strong signals
        if max(scores.values()) < 0.3:
            scores['Other'] = 0.9

        return scores

    def classify_event(self, title: str, content: str = "") -> Dict[str, float]:
        """Classify a news item and return a score per event type.

        The title and content are joined as ``"{title}. {content}"``. The
        final score of each event type is
        ``0.7 * rule_score + 0.3 * positive_sentiment_score``. The sentiment
        term is identical for every event type, so it shifts all scores
        equally and does not change their ranking.

        If the pipeline call raises, the error is printed and the raw
        rule-based scores are returned instead (without the 0.7 weighting).

        Args:
            title: Headline of the news item.
            content: Optional body text.

        Returns:
            Mapping of every key in ``event_types`` to its score.
        """
        combined_text = f"{title}. {content}"
        processed_text = self.preprocess_text(combined_text)

        try:
            # truncation=True is forwarded to the tokenizer so that inputs
            # whose token count exceeds the model limit are cut instead of
            # failing; the character cap alone does not guarantee that.
            predictions = self.classifier(processed_text, truncation=True)[0]

            # Map sentiment to event types (simplified approach)
            # In production, you'd fine-tune on financial event data
            sentiment_scores = {pred['label']: pred['score'] for pred in predictions}
            positive_score = sentiment_scores.get('positive', 0)

            # Keyword scores are computed on the full, uncapped text.
            rule_scores = self.rule_based_classification(combined_text)

            # Combine scores (favor rule-based for event type detection)
            return {
                event_type: rule_scores[event_type] * 0.7 + positive_score * 0.3
                for event_type in self.event_types
            }

        except Exception as e:
            # Deliberate best-effort: any pipeline failure degrades to the
            # keyword-only result rather than aborting the caller.
            print(f"Transformer classification failed: {e}")
            # Fallback to rule-based
            return self.rule_based_classification(combined_text)

    def get_primary_event(self, scores: Dict[str, float]) -> str:
        """Return the event type with the highest score.

        On ties the first such key in the dict's iteration order wins.

        Raises:
            ValueError: If ``scores`` is empty.
        """
        return max(scores.items(), key=lambda x: x[1])[0]

    def is_high_confidence_event(self, scores: Dict[str, float], threshold: float = 0.6) -> bool:
        """Return True if any score is strictly greater than ``threshold``."""
        return any(score > threshold for score in scores.values())

# Smoke test: build the classifier once and score a sample headline.
classifier = EventClassifier()

test_title = "Microsoft acquires AI startup Lumier for $500 million"
test_content = "The acquisition will strengthen Microsoft's AI capabilities..."

# Score the sample, then pick the highest-scoring event type.
scores = classifier.classify_event(title=test_title, content=test_content)
primary_event = classifier.get_primary_event(scores=scores)

print(f"Event scores: {scores}")
print(f"Primary event: {primary_event}")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

Device set to use cpu


Event scores: {'M&A': 0.7197781455516816, 'Earnings': 0.29977814555168153, 'Leadership_Change': 0.29977814555168153, 'Partnership': 0.29977814555168153, 'Product_Launch': 0.29977814555168153, 'Other': 0.29977814555168153}
Primary event: M&A


In [5]:
import joblib

# Persist the tokenizer that was actually loaded (vocabulary, merges, config)
# in the Hugging Face format, so it can be restored with
# AutoTokenizer.from_pretrained(f"{filepath}/tokenizer").
classifier.tokenizer.save_pretrained(f"{filepath}/tokenizer")

# NOTE(review): AutoTokenizer is the factory *class*, not the loaded tokenizer.
# Pickling it stores only a reference to the class, no vocabulary. The dump is
# kept so that anything already reading autoTokenizer.pkl keeps working.
joblib.dump(AutoTokenizer, f"{filepath}/autoTokenizer.pkl")


['/content/drive/My Drive/MLOps/news_alert/autoTokenizer.pkl']

In [6]:
# Persist the loaded model (config + weights) in the Hugging Face format, so
# it can be restored with
# AutoModelForSequenceClassification.from_pretrained(f"{filepath}/model").
classifier.model.save_pretrained(f"{filepath}/model")

# NOTE(review): AutoModelForSequenceClassification is the factory *class*;
# pickling it stores only a reference to the class, no weights. The dump is
# kept so that anything already reading this .pkl keeps working.
joblib.dump(AutoModelForSequenceClassification, f"{filepath}/autoModelSeqClassification.pkl")

['/content/drive/My Drive/MLOps/news_alert/autoModelSeqClassification.pkl']

In [7]:
# Pickle the whole EventClassifier instance (tokenizer, model and pipeline included).
# NOTE(review): pickle stores classes by reference, and EventClassifier is
# defined in this notebook, so the loading process must define or import the
# same class before calling joblib.load — confirm the consumer does.
event_classifier_path = f"{filepath}/eventClassifier.pkl"
joblib.dump(classifier, event_classifier_path)

['/content/drive/My Drive/MLOps/news_alert/eventClassifier.pkl']