Esta versão: 
    -> corrige todos os erros (ortográficos e gramaticais)

    -> vê a polarity e subjectivity com o textblob

    -> avalia se a frase é negativa ou afirmativa 
            (utiliza a análise do spaCy para procurar palavras de negação como not, n't, never, ...)

    -> avalia se a frase é factual ou uma opinião 
            (usa um modelo pré-treinado, https://huggingface.co/lighteternal/fact-or-opinion-xlmr-el)

                Label 0: Opinion/Subjective sentence
                Label 1: Fact/Objective sentence

    -> classifica a emoção 
            (usa um modelo pré-treinado, https://huggingface.co/ayoubkirouane/BERT-Emotions-Classifier)
            
            este modelo tem 11 emoções:
                'anger' 
                'anticipation'
                'disgust'
                'fear'
                'joy'
                'love'
                'optimism'
                'pessimism'
                'sadness'
                'surprise'
                'trust'

    -> vê inference times de todas as funções

    -> Compara os resultados de modelos pré-treinados com modelos feitos por nós

In [69]:
import time
import nltk
import spacy
import textblob
from textblob import TextBlob
from spellchecker import SpellChecker
from gramformer import Gramformer
from transformers import pipeline

In [70]:
# Make sure the NLTK 'punkt' and 'wordnet' resources are available locally,
# downloading each one only when the lookup fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model...")
    import subprocess
    import sys

    # Run the download with the interpreter executing this code (a bare
    # "python" may resolve to a different environment), and fail loudly if
    # the download does not succeed instead of hitting a second OSError below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhabid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Ignore warnings

In [71]:
import warnings
# Suppress every warning for the rest of the session (no category, message
# or module restriction is given, so the "ignore" action applies to all).
warnings.filterwarnings("ignore")

## Setup the decorator

In [72]:

from types import MethodType

def add_method(cls):
    """Build a decorator that attaches the decorated function to ``cls``.

    The function is stored on the class under its own ``__name__`` and is
    returned unchanged, so the name it was defined under still refers to the
    plain function.
    """
    def attach(func):
        attribute_name = func.__name__
        setattr(cls, attribute_name, func)
        return func

    return attach

## Initialize pretrained models 

In [73]:
class TextAnalyzer:
    """Text analysis helper built on a spell checker, Gramformer and two
    Hugging Face text-classification pipelines (emotion and fact-vs-opinion).

    Further methods are attached to this class after its definition.
    """

    def __init__(self):
        # Spelling and grammar correction back-ends; Gramformer runs on CPU.
        # NOTE(review): models=1 presumably selects the corrector model -
        # confirm against the Gramformer documentation.
        self.spell_checker = SpellChecker()
        self.gramformer = Gramformer(models=1, use_gpu=False)

        # Emotion classifier: request a score for every label, not only the
        # best one.
        print("Loading emotion classification model...")
        self.emotion_classifier = pipeline(
            "text-classification",
            model="ayoubkirouane/BERT-Emotions-Classifier",
            return_all_scores=True,
        )

        # Dedicated fact-vs-opinion classifier.
        print("Loading specialized fact-opinion classification model...")
        self.fact_opinion_classifier = pipeline(
            "text-classification",
            model="lighteternal/fact-or-opinion-xlmr-el",
        )

        # Maps a step name to the duration (in seconds) of its latest run.
        self.inference_times = {}

## Initialize custom models

In [74]:
@add_method(TextAnalyzer)
def load_custom_models(self):
    """Load the custom-trained models and feature extractors used for comparison.

    Sets ``sentence_encoder``, ``tfidf_vectorizer``, ``custom_type_model``,
    ``custom_fact_model``, ``custom_sentiment_model`` and the three label
    mappings on the instance. Every one of these attributes exists after the
    call: a resource that could not be loaded is left as ``None`` and a
    message is printed. Only a missing file (``FileNotFoundError``) is
    tolerated for the pickled artifacts; other errors propagate.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # the .pkl artifacts below must come from a trusted source.
    import pickle

    print("Loading custom models...")

    # Mappings from predicted class index to a readable label. They do not
    # depend on any file, so define them regardless of what loads below.
    self.type_labels = {0: "affirmative", 1: "negation"}
    self.factuality_labels = {0: "fact", 1: "opinion"}
    self.sentiment_labels = {0: "sadness", 1: "anger", 2: "neutral", 3: "happiness", 4: "euphoria"}

    # Load embedding model. The import lives inside the try block so that a
    # missing package takes the same "no encoder" path as a failed load.
    try:
        from sentence_transformers import SentenceTransformer
        self.sentence_encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    except Exception as e:
        print(f"Warning: Could not load sentence encoder: {e}")
        self.sentence_encoder = None

    # Load TF-IDF vectorizer
    try:
        with open("tfidf_vectorizer.pkl", "rb") as f:
            self.tfidf_vectorizer = pickle.load(f)
        print("Loaded TF-IDF vectorizer")
    except FileNotFoundError:
        print("Warning: TF-IDF vectorizer not found. Will fit on first input.")
        self.tfidf_vectorizer = None

    # Load custom models. Start from None so a missing file never leaves an
    # attribute undefined.
    model_files = (
        ("custom_type_model", "best_model_Type.pkl"),
        ("custom_fact_model", "best_model_Factuality.pkl"),
        ("custom_sentiment_model", "best_model_Sentiment.pkl"),
    )
    for attribute, _ in model_files:
        setattr(self, attribute, None)

    try:
        for attribute, filename in model_files:
            with open(filename, "rb") as f:
                setattr(self, attribute, pickle.load(f))
        print("Custom models loaded successfully")
    except FileNotFoundError as e:
        print(f"Error loading custom models: {e}")

In [75]:
@add_method(TextAnalyzer)
def preprocess_text_for_custom_models(self, text, model_type=None):
    """Turn ``text`` into the feature representation a custom model is fed.

    Args:
        text: The text to featurize.
        model_type: Which model the features are for ('type', 'factuality',
            'sentiment'). 'factuality' always gets TF-IDF features; any other
            value gets sentence embeddings when an encoder is available.

    Returns:
        The output of ``sentence_encoder.encode([text])`` or, for the
        factuality model, when no encoder is loaded, or when encoding raises,
        the output of ``tfidf_vectorizer.transform([text])``.
    """
    # If we're using TF-IDF vectorizer (needed for factuality model)
    if model_type == 'factuality' or self.sentence_encoder is None:
        return _tfidf_features(self, text)

    # Otherwise use sentence embeddings
    try:
        return self.sentence_encoder.encode([text])
    except Exception as e:
        print(f"Error with sentence embeddings: {e}, falling back to TF-IDF")
        return _tfidf_features(self, text)


def _tfidf_features(analyzer, text):
    """Vectorize ``text`` with the analyzer's TF-IDF vectorizer, creating one if absent."""
    if analyzer.tfidf_vectorizer is None:
        # Create and fit a new vectorizer if needed.
        # NOTE(review): a vectorizer fitted on one sentence presumably does
        # not match the feature space a saved model was trained on - confirm
        # whether this path should raise instead.
        from sklearn.feature_extraction.text import TfidfVectorizer
        analyzer.tfidf_vectorizer = TfidfVectorizer(max_features=5000)
        analyzer.tfidf_vectorizer.fit([text])
        print("Note: Fitted vectorizer on first input (this is not optimal for production)")
    return analyzer.tfidf_vectorizer.transform([text])

In [76]:
@add_method(TextAnalyzer)
def custom_predict_type(self, text):
    """Predict the sentence type (affirmative/negation) with the custom model.

    Feature extraction is delegated to ``preprocess_text_for_custom_models``,
    which already prefers sentence embeddings and falls back to TF-IDF when
    the encoder is missing or fails. A failure of the model's ``predict``
    call is raised directly rather than being swallowed and retried on the
    same features.

    Returns:
        A ``(label, elapsed_seconds)`` tuple. The elapsed time is also stored
        in ``self.inference_times['custom_type_detection']``.
    """
    start_time = time.time()

    text_features = self.preprocess_text_for_custom_models(text, model_type='type')
    prediction = self.custom_type_model.predict(text_features)[0]
    sentence_type = self.type_labels[prediction]

    elapsed_time = time.time() - start_time
    self.inference_times['custom_type_detection'] = elapsed_time

    return sentence_type, elapsed_time

In [77]:
@add_method(TextAnalyzer)
def custom_predict_factuality(self, text):
    """Classify ``text`` as fact or opinion with the custom factuality model.

    Returns:
        A ``(label, elapsed_seconds)`` tuple. The elapsed time is also stored
        in ``self.inference_times['custom_factuality_detection']``.
    """
    started = time.time()

    # This model is always fed TF-IDF features (the original author notes it
    # expects 2754 of them), so request that featurization explicitly.
    features = self.preprocess_text_for_custom_models(text, model_type='factuality')
    predicted_class = self.custom_fact_model.predict(features)[0]
    factuality = self.factuality_labels[predicted_class]

    elapsed_time = time.time() - started
    self.inference_times['custom_factuality_detection'] = elapsed_time
    return factuality, elapsed_time

In [78]:
@add_method(TextAnalyzer)
def custom_predict_emotion(self, text):
    """Predict the emotion label of ``text`` with the custom sentiment model.

    Feature extraction is delegated to ``preprocess_text_for_custom_models``,
    which already prefers sentence embeddings and falls back to TF-IDF when
    the encoder is missing or fails. A failure of the model's ``predict``
    call is raised directly rather than being swallowed and retried on the
    same features.

    Returns:
        A ``(label, elapsed_seconds)`` tuple. The elapsed time is also stored
        in ``self.inference_times['custom_emotion_detection']``.
    """
    start_time = time.time()

    text_features = self.preprocess_text_for_custom_models(text, model_type='sentiment')
    prediction = self.custom_sentiment_model.predict(text_features)[0]
    emotion = self.sentiment_labels[prediction]

    elapsed_time = time.time() - start_time
    self.inference_times['custom_emotion_detection'] = elapsed_time

    return emotion, elapsed_time

## Optimized Gramformer for sentence correction

In [79]:
@add_method(TextAnalyzer)
def correct_spelling(self, sentence):
    """Spell-check ``sentence`` one whitespace-separated token at a time.

    Each token is replaced by the spell checker's correction; when the
    checker returns nothing usable the token is kept as is. The tokens are
    joined back with single spaces. The duration is stored in
    ``self.inference_times['spell_correction']``.
    """
    start_time = time.time()

    fixed_tokens = []
    for token in sentence.split():
        suggestion = self.spell_checker.correction(token)
        fixed_tokens.append(suggestion or token)
    result = " ".join(fixed_tokens)

    self.inference_times['spell_correction'] = time.time() - start_time
    return result

@add_method(TextAnalyzer)
def correct_sentence(self, sentence):
    """Run spelling correction followed by Gramformer grammar correction.

    Returns the first candidate Gramformer yields, or the spell-corrected
    sentence when it yields none. Records ``'grammar_correction'`` (grammar
    step only) and ``'total_correction'`` (both steps) in
    ``self.inference_times``.
    """
    overall_start = time.time()
    spell_fixed = self.correct_spelling(sentence)

    # The grammar step is timed on its own as well.
    grammar_start = time.time()
    candidates = self.gramformer.correct(spell_fixed, max_candidates=1)
    result = spell_fixed
    for candidate in candidates:
        result = candidate
        break
    grammar_elapsed_time = time.time() - grammar_start

    total_elapsed_time = time.time() - overall_start
    self.inference_times['grammar_correction'] = grammar_elapsed_time
    self.inference_times['total_correction'] = total_elapsed_time
    return result

## Sentence type (negation) detection with spaCy

In [80]:
@add_method(TextAnalyzer)
def analyze_sentence_type(self, text):
    """Label ``text`` as "negation" or "affirmative".

    The text is parsed with the module-level spaCy pipeline ``nlp``; it is a
    negation as soon as one token carries the ``neg`` dependency label.
    Returns ``{'sentence_type': <label>}`` and stores the duration in
    ``self.inference_times['sentence_type_analysis']``.
    """
    start_time = time.time()

    sentence_type = "affirmative"
    for token in nlp(text):
        if token.dep_ == 'neg':
            sentence_type = "negation"
            break

    self.inference_times['sentence_type_analysis'] = time.time() - start_time
    return {'sentence_type': sentence_type}

## Subjectivity detection with a pre-trained model

In [81]:
@add_method(TextAnalyzer)
def classify_fact_opinion(self, text):
    """Classify ``text`` as "fact" or "opinion" with the pretrained pipeline.

    Returns a dict with ``'classification'`` (the readable label, or the raw
    model label when it is not one of the two known ones) and
    ``'confidence'`` (the model score). The duration is stored in
    ``self.inference_times['fact_opinion_classification']``.
    """
    start_time = time.time()
    top_prediction = self.fact_opinion_classifier(text)[0]

    # The model emits LABEL_0 (opinion) or LABEL_1 (fact); translate those
    # and pass any other label through untouched.
    raw_label = top_prediction['label']
    if raw_label == "LABEL_0":
        classification = "opinion"
    elif raw_label == "LABEL_1":
        classification = "fact"
    else:
        classification = raw_label
    confidence = top_prediction['score']

    self.inference_times['fact_opinion_classification'] = time.time() - start_time
    return {'classification': classification, 'confidence': confidence}

## Emotion classification with BERT-Emotions-Classifier

In [82]:
@add_method(TextAnalyzer)
def detect_emotion(self, text):
    """Score ``text`` with the emotion classifier and report the best label.

    Returns a dict with ``'emotion'`` (highest-scoring label),
    ``'confidence'`` (its score) and ``'all_emotions'`` (label -> score,
    ordered from highest to lowest score). The duration is stored in
    ``self.inference_times['emotion_detection']``.
    """
    start_time = time.time()

    raw_scores = self.emotion_classifier(text)[0]
    ranked = sorted(raw_scores, key=lambda entry: entry['score'], reverse=True)
    best = ranked[0]
    all_emotions = {}
    for entry in ranked:
        all_emotions[entry['label']] = entry['score']

    self.inference_times['emotion_detection'] = time.time() - start_time

    return {
        "emotion": best['label'],
        "confidence": best['score'],
        "all_emotions": all_emotions,
    }

# Analysis of the given text

In [83]:
@add_method(TextAnalyzer)
def analyze_text(self, text):
    """Run the full pretrained-model analysis on ``text``.

    The text is first spelling/grammar corrected; sentence type, emotion and
    fact-vs-opinion are then computed on the corrected text.

    Returns:
        A dict with the original and corrected text, ``'needs_correction'``
        (whether the two differ), the three classifications with their
        confidences, ``'all_emotions'`` and ``'inference_times'``. The timing
        entry is a snapshot taken at the end of this call, so later calls on
        the same analyzer do not alter a result that was already returned.
    """
    # Reset timing data for new analysis
    self.inference_times = {}

    total_start_time = time.time()

    original_text = text
    corrected_text = self.correct_sentence(text)

    sentence_params = self.analyze_sentence_type(corrected_text)
    emotion_data = self.detect_emotion(corrected_text)
    fact_opinion_data = self.classify_fact_opinion(corrected_text)

    total_elapsed_time = time.time() - total_start_time
    self.inference_times['total_analysis'] = total_elapsed_time

    result = {
        'original_text': original_text,
        'corrected_text': corrected_text,
        'needs_correction': original_text != corrected_text,
        'sentence_type': sentence_params['sentence_type'],
        'emotion': emotion_data['emotion'],
        'emotion_confidence': emotion_data['confidence'],
        'all_emotions': emotion_data['all_emotions'],
        'fact_opinion': fact_opinion_data['classification'],
        'fact_opinion_confidence': fact_opinion_data['confidence'],
        # Copy rather than alias: methods that do not reset the timing dict
        # (e.g. compare_models) would otherwise write into this result.
        'inference_times': dict(self.inference_times),
    }

    return result

In [84]:
@add_method(TextAnalyzer)
def compare_models(self, text):
    """Run pretrained and custom models on ``text`` and pair up their results.

    The text is corrected once and every model sees the corrected version.

    Returns:
        A dict with ``'text'``, ``'corrected_text'`` and one sub-dict per
        task (``'type'``, ``'factuality'``, ``'emotion'``) holding both
        predictions, both durations, their difference (pretrained minus
        custom) and their ratio (``inf`` when the custom duration is not
        positive).
    """
    def paired(pretrained_label, custom_label, pretrained_time, custom_time):
        # Assemble one task's comparison entry.
        if custom_time > 0:
            ratio = pretrained_time / custom_time
        else:
            ratio = float('inf')
        return {
            'pretrained': pretrained_label,
            'custom': custom_label,
            'time_pretrained': pretrained_time,
            'time_custom': custom_time,
            'time_difference': pretrained_time - custom_time,
            'time_ratio': ratio,
        }

    # Spelling/grammar correction comes first.
    corrected_text = self.correct_sentence(text)

    # Pretrained models; their durations land in self.inference_times.
    pretrained_type = self.analyze_sentence_type(corrected_text)["sentence_type"]
    pretrained_emotion = self.detect_emotion(corrected_text)["emotion"]
    pretrained_factuality = self.classify_fact_opinion(corrected_text)["classification"]

    # Custom models return their own duration alongside the label.
    custom_type, custom_type_time = self.custom_predict_type(corrected_text)
    custom_emotion, custom_emotion_time = self.custom_predict_emotion(corrected_text)
    custom_factuality, custom_factuality_time = self.custom_predict_factuality(corrected_text)

    timings = self.inference_times
    pretrained_type_time = timings.get('sentence_type_analysis', 0)
    pretrained_emotion_time = timings.get('emotion_detection', 0)
    pretrained_factuality_time = timings.get('fact_opinion_classification', 0)

    return {
        'text': text,
        'corrected_text': corrected_text,
        'type': paired(
            pretrained_type, custom_type,
            pretrained_type_time, custom_type_time,
        ),
        'factuality': paired(
            pretrained_factuality, custom_factuality,
            pretrained_factuality_time, custom_factuality_time,
        ),
        'emotion': paired(
            pretrained_emotion, custom_emotion,
            pretrained_emotion_time, custom_emotion_time,
        ),
    }

## Sentence classification

In [85]:
def display_comparison_results(results):
    """Print a formatted pretrained-vs-custom comparison report.

    Args:
        results: A dict with ``'text'``, ``'corrected_text'`` and the
            ``'type'``, ``'factuality'`` and ``'emotion'`` sub-dicts, each
            holding ``'pretrained'``, ``'custom'``, ``'time_pretrained'``,
            ``'time_custom'`` and ``'time_difference'``.
    """
    print(f"\n{'='*80}")
    print(f"COMPARISON RESULTS FOR: '{results['text']}'")
    print(f"{'='*80}")

    # Only show the corrected text when correction changed something.
    if results['text'] != results['corrected_text']:
        print(f"Corrected text: '{results['corrected_text']}'")

    sections = (
        ("SENTENCE TYPE", 'type'),
        ("FACTUALITY", 'factuality'),
        ("EMOTION", 'emotion'),
    )
    for title, key in sections:
        _print_comparison_section(title, results[key])

    print(f"\n{'='*80}\n")


def _print_comparison_section(title, section):
    """Print the labels, timings and agreement mark for one comparison task."""
    # time_difference is pretrained minus custom: positive means the
    # pretrained model took longer. A zero difference names neither model.
    difference = section['time_difference']
    if difference > 0:
        verdict = "pretrained is slower"
    elif difference < 0:
        verdict = "custom is slower"
    else:
        verdict = "no measurable difference"
    agreement = '✓' if section['pretrained'] == section['custom'] else '✗'

    print(f"\n{'-'*30} {title} {'-'*30}")
    print(f"Pretrained model: {section['pretrained']} ({section['time_pretrained']:.4f}s)")
    print(f"Custom model:     {section['custom']} ({section['time_custom']:.4f}s)")
    print(f"Time difference:  {abs(difference):.4f}s ({verdict})")
    print(f"Agreement:        {agreement}")


def main():
    """Interactive loop comparing pretrained and custom models on typed sentences.

    Builds a ``TextAnalyzer``, loads the custom models, then reads sentences
    until the user types 'quit' or input ends (EOF or interrupt). Surrounding
    whitespace is ignored and blank lines are skipped instead of being
    analyzed.
    """
    analyzer = TextAnalyzer()
    analyzer.load_custom_models()

    print("Model Comparison Tool")
    print("---------------------")
    print("Enter a sentence to analyze with both pre-trained and custom models (or 'quit' to exit):")

    while True:
        try:
            user_input = input("\nYour sentence: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session cleanly.
            break

        sentence = user_input.strip()
        if sentence.lower() == 'quit':
            break
        if not sentence:
            # Nothing to analyze; ask again.
            continue

        comparison_results = analyzer.compare_models(sentence)
        display_comparison_results(comparison_results)


if __name__ == "__main__":
    main()

[Gramformer] Grammar error correct/highlight model loaded..
Loading emotion classification model...


Device set to use cpu


Loading specialized fact-opinion classification model...


Device set to use cpu


Loading custom models...
Loaded TF-IDF vectorizer
Custom models loaded successfully
Model Comparison Tool
---------------------
Enter a sentence to analyze with both pre-trained and custom models (or 'quit' to exit):

COMPARISON RESULTS FOR: 'Nicole rides her bike in the afternoon'
Corrected text: 'Nicole rides her bike in the afternoon.'

------------------------------ SENTENCE TYPE ------------------------------
Pretrained model: affirmative (0.0199s)
Custom model:     affirmative (0.1567s)
Time difference:  0.1368s (custom is slower)
Agreement:        ✓

------------------------------ FACTUALITY ------------------------------
Pretrained model: fact (0.1578s)
Custom model:     opinion (0.0020s)
Time difference:  0.1558s (pretrained is slower)
Agreement:        ✗

------------------------------ EMOTION ------------------------------
Pretrained model: joy (0.2518s)
Custom model:     neutral (0.0795s)
Time difference:  0.1723s (pretrained is slower)
Agreement:        ✗



COMPARISON RES