In [1]:
import pandas as pd
import torch
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class XLMRSubjectivityClassifier:
    def __init__(self, model_name="lighteternal/fact-or-opinion-xlmr-el"):
        """
        Initialize the XLM-RoBERTa-based subjectivity classifier.
        
        Args:
            model_name (str): The name of the pre-trained model to use
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()
        
        # The model is expected to be a binary classifier with:
        # - label 0 for "fact"
        # - label 1 for "opinion"
    
    def classify(self, text):
        """
        Classify the input text as fact or opinion.
        
        Args:
            text (str): Input text to classify
            
        Returns:
            dict: Dictionary containing text, confidence scores, and classification
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            
        # Get probabilities
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        fact_score = probs[0][0].item()
        opinion_score = probs[0][1].item()
        
        # Classify based on highest probability
        classification = "OPINION" if opinion_score > fact_score else "FACT"
        
        return {
            "text": text,
            "fact_score": fact_score,
            "opinion_score": opinion_score,
            "classification": classification
        }

# Example usage
if __name__ == "__main__":
    # Initialize classifier
    classifier = XLMRSubjectivityClassifier()
    
    # Test sentences
    test_sentences = [
        "The Earth revolves around the Sun.",
        "I think this movie is absolutely terrible.",
        "Water boils at 100 degrees Celsius at sea level.",
        "In my opinion, the government should increase funding for education.",
        "Paris is the capital of France.",
        "This is probably the best restaurant in town."
    ]
    
    # Process each sentence
    results = []
    start_time = time.time()
    for sentence in test_sentences:
        result = classifier.classify(sentence)
        results.append(result)
        print(f"Text: {result['text']}")
        print(f"Fact Score: {result['fact_score']:.2f}")
        print(f"Opinion Score: {result['opinion_score']:.2f}")
        print(f"Classification: {result['classification']}")
        print("-" * 50)

    print(f"Total time for {len(test_sentences)} sentences: {time.time() - start_time:.4f} seconds")
    
    # Create a DataFrame for better visualization
    df = pd.DataFrame(results)
    print("\nSummary:")
    print(df[["text", "fact_score", "opinion_score", "classification"]])

  from .autonotebook import tqdm as notebook_tqdm


Text: The Earth revolves around the Sun.
Fact Score: 0.00
Opinion Score: 1.00
Classification: OPINION
--------------------------------------------------
Text: I think this movie is absolutely terrible.
Fact Score: 1.00
Opinion Score: 0.00
Classification: FACT
--------------------------------------------------
Text: Water boils at 100 degrees Celsius at sea level.
Fact Score: 0.00
Opinion Score: 1.00
Classification: OPINION
--------------------------------------------------
Text: In my opinion, the government should increase funding for education.
Fact Score: 0.99
Opinion Score: 0.01
Classification: FACT
--------------------------------------------------
Text: Paris is the capital of France.
Fact Score: 0.00
Opinion Score: 1.00
Classification: OPINION
--------------------------------------------------
Text: This is probably the best restaurant in town.
Fact Score: 0.01
Opinion Score: 0.99
Classification: OPINION
--------------------------------------------------
Total time for 6 sentenc