In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from gramformer import Gramformer
import torch
import spacy
import time

  from .autonotebook import tqdm as notebook_tqdm


## Ignore warnings

In [2]:
import warnings

# Install a blanket "ignore" filter so no warning category is shown
# for the remainder of the session (keeps the demo output readable).
warnings.simplefilter("ignore")

## Optimized Gramformer for sentence correction

In [3]:
class OptimizedT5Corrector:
    """Grammar corrector built directly on the T5 checkpoint
    ``prithivida/grammar_error_correcter_v1``.

    The tokenizer and model are loaded once in ``__init__``; ``correct`` then
    runs generation for a single sentence.
    """

    def __init__(self, debug=True):
        """Load tokenizer and model and prepare the model for inference.

        Args:
            debug: When true, print whether ``torch.compile`` could be applied.
        """
        # Load model and tokenizer directly
        self.model_name = "prithivida/grammar_error_correcter_v1"
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)

        # Inference only: switch off dropout and other training-time behaviour.
        self.model.eval()

        # Enable the key/value cache. This is a speed optimisation for
        # generation, not a memory optimisation. It is set before wrapping the
        # model so it does not rely on attribute forwarding by the wrapper.
        self.model.config.use_cache = True

        # Use torch.compile when available (PyTorch 2.0+).
        # NOTE(review): torch.compile is presumably lazy, so a failure may only
        # surface on the first forward pass rather than here - confirm.
        if hasattr(torch, 'compile'):
            try:
                self.model = torch.compile(self.model)
                if debug:
                    print("Successfully applied torch.compile optimization")
            except Exception as e:
                # Best effort: keep the uncompiled model if wrapping fails.
                if debug:
                    print(f"Could not apply torch.compile: {e}")

    def correct(self, sentence, max_length=128, num_beams=2):
        """Return the grammar-corrected version of ``sentence``.

        Args:
            sentence: Text to correct. ``None`` is treated as an empty string.
            max_length: ``max_length`` forwarded to ``generate``.
            num_beams: Beam width forwarded to ``generate`` (default 2, the
                value previously hard-coded for speed).

        Returns:
            The decoded first generated sequence, or the input text unchanged
            when it is empty or whitespace-only (nothing to correct).
        """
        text = "" if sentence is None else str(sentence)
        if not text.strip():
            # Nothing to correct; skip the model call entirely.
            return text

        with torch.inference_mode():
            # The "gec:" prefix is important for the model.
            input_text = f"gec: {text}"
            input_ids = self.tokenizer(input_text, return_tensors="pt").input_ids

            outputs = self.model.generate(
                input_ids=input_ids,
                max_length=max_length,
                num_beams=num_beams,
                # early_stopping is only requested when beam search is active.
                early_stopping=num_beams > 1,
                use_cache=True,  # Enable KV caching for faster generation
            )

            # Only the first returned sequence is decoded.
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

## Polarity detection with spaCy

In [4]:
def analyze_sentence_type(nlp, text):
    """Label a sentence as "negation" or "affirmation" from its dependency parse.

    Args:
        nlp: Callable that parses ``text`` and returns an iterable of tokens
            exposing a ``dep_`` attribute (e.g. a loaded spaCy pipeline).
        text: The sentence to analyse.

    Returns:
        dict with keys ``'sentence'`` (the input text), ``'sentence_type'``
        ("negation" if any token has dependency label "neg", otherwise
        "affirmation") and ``'inference_time'`` (elapsed seconds as a float).
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; wall-clock time.time() can be coarse or jump.
    started = time.perf_counter()

    # Parse the sentence
    doc = nlp(text)

    # A single token carrying the 'neg' dependency tag marks the sentence as negated.
    has_negation = any(token.dep_ == "neg" for token in doc)
    sentence_type = "negation" if has_negation else "affirmation"

    inference_time = time.perf_counter() - started

    return {
        'sentence': text,
        'sentence_type': sentence_type,
        'inference_time': inference_time
    }

## Subjectivity detection with a pre-trained model

In [5]:
def classify_sentence(model_name, sentence, objective_index=1, subjective_index=0):
    """Classify a sentence as "fact" or "opinion" with a sequence classifier.

    The tokenizer and model are loaded from ``model_name`` on every call, so
    repeated calls pay the loading cost each time; only tokenisation and the
    forward pass are included in the reported time.

    Args:
        model_name: Hugging Face model identifier or path passed to
            ``from_pretrained``.
        sentence: Text to classify.
        objective_index: Logit index read as the objective/fact class.
        subjective_index: Logit index read as the subjective/opinion class.
            The defaults (1 = objective, 0 = subjective) are the indices this
            function has always read.
            NOTE(review): confirm the label order against the model card of
            whichever checkpoint is passed in.

    Returns:
        dict with keys ``"sentence"``, ``"objective_score"``,
        ``"subjective_score"``, ``"classification"`` ("fact" when the objective
        score is strictly greater, otherwise "opinion") and
        ``"inference_time_seconds"``.
    """
    # Set device (GPU if available, otherwise CPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)
    model.eval()

    # Measure inference time with a monotonic, high-resolution clock.
    start_time = time.perf_counter()

    # Tokenize input and move to device
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)

    # Forward pass and softmax without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Only the first row (first/only sentence of the batch) is read.
    obj_score = probs[0][objective_index].item()
    subj_score = probs[0][subjective_index].item()
    classification = "fact" if obj_score > subj_score else "opinion"

    inference_time = time.perf_counter() - start_time

    # Prepare results
    result = {
        "sentence": sentence,
        "objective_score": obj_score,
        "subjective_score": subj_score,
        "classification": classification,
        "inference_time_seconds": inference_time
    }

    return result

## Emotion classification with DistilBERT

In [6]:
class EnhancedEmotionClassifier:
    """Multi-label emotion scorer on top of a sequence-classification checkpoint.

    Scores are the sigmoid of each logit; ``classify`` returns the highest
    scoring labels and additionally reports "neutral" when it is strong enough.
    """

    # Fallback label order, used only when the checkpoint's config does not
    # carry usable label names. NOTE(review): this order was hard-coded in the
    # original code and has not been verified against any checkpoint.
    DEFAULT_LABELS = (
        "admiration", "amusement", "anger", "annoyance", "approval", "caring",
        "confusion", "curiosity", "desire", "disappointment", "disapproval",
        "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
        "joy", "love", "nervousness", "neutral", "optimism", "pride", "realization",
        "relief", "remorse", "sadness", "surprise",
    )

    def __init__(self, model_name="joeddav/distilbert-base-uncased-go-emotions-student"):
        """Load tokenizer and model, and resolve the index -> label mapping.

        Args:
            model_name: Hugging Face model identifier or path.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()

        # Prefer the checkpoint's own label names so logit indices and names
        # cannot drift apart when a different ``model_name`` is used.
        self.labels = self._resolve_labels(
            getattr(self.model.config, "id2label", None)
        )

    @classmethod
    def _resolve_labels(cls, id2label):
        """Return label names ordered by class index.

        Uses ``id2label`` when it contains real names; falls back to
        ``DEFAULT_LABELS`` when it is missing, empty, or only holds generic
        ``LABEL_<n>`` placeholders.
        """
        if id2label:
            names = [str(id2label[key]) for key in sorted(id2label, key=int)]
            if not all(name.startswith("LABEL_") for name in names):
                return names
        return list(cls.DEFAULT_LABELS)

    def classify(self, text, top_k=3, neutral_threshold=0.3):
        """Score ``text`` and return its strongest emotions.

        Args:
            text: Input text. Only the first row of the model output is used.
            top_k: Maximum number of emotions to return; clamped to the number
                of available labels.
            neutral_threshold: If "neutral" is not already among the top
                results, it is appended when its score exceeds this value.

        Returns:
            list of ``{"emotion": str, "score": float}`` dicts, highest score
            first, optionally followed by the "neutral" entry.

        Raises:
            ValueError: If the model emits a different number of scores than
                there are labels (names would otherwise be assigned wrongly).
        """
        # Tokenize and prepare input
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        # Perform inference
        with torch.inference_mode():
            outputs = self.model(**inputs)
            # Sigmoid scores each label independently (multi-label style).
            # NOTE(review): confirm this matches how the checkpoint was trained.
            scores = torch.sigmoid(outputs.logits)[0]

        if scores.numel() != len(self.labels):
            raise ValueError(
                f"Model returned {scores.numel()} scores but {len(self.labels)} "
                "labels are configured"
            )

        # torch.topk raises when k exceeds the number of elements.
        k = min(top_k, scores.numel())
        top_scores, top_indices = torch.topk(scores, k)

        # Format results
        top_emotions = [
            {"emotion": self.labels[idx], "score": score}
            for score, idx in zip(top_scores.tolist(), top_indices.tolist())
        ]

        # Also include neutral if it is detected but did not make the top-k.
        already_listed = any(e["emotion"] == "neutral" for e in top_emotions)
        if "neutral" in self.labels and not already_listed:
            neutral_score = scores[self.labels.index("neutral")].item()
            if neutral_score > neutral_threshold:
                top_emotions.append({"emotion": "neutral", "score": neutral_score})

        return top_emotions

## Sentence classification

In [7]:
# Input containing spelling/grammar errors for the corrector to fix.
test_sentence = "I just finishd read the report. I am glad it contens the information we expected!"

# Step 1: grammar correction. All later steps run on the corrected text.
print("Checking for grammar correction...")
corrector = OptimizedT5Corrector(debug=False)
corrected = corrector.correct(test_sentence)
print(f"Corrected sentence: {corrected}")


# Step 2: negation vs. affirmation via the spaCy dependency parse.
print("\nChecking polarity...")
model_name = "en_core_web_sm"
model = spacy.load(model_name)
polarity = analyze_sentence_type(model, corrected)
print(f"Polarity of the sentence: {polarity['sentence_type']}")
print(f"Inference time: {polarity['inference_time']} s")


# Step 3: fact vs. opinion. `model_name` is re-bound to the classifier checkpoint here.
print("\nChecking subjectivity...")
model_name = "lighteternal/fact-or-opinion-xlmr-el"
subjectivity = classify_sentence(model_name, corrected)
print(f"Sentence subjectivity: {subjectivity['classification']}")
print(f"Inference time: {subjectivity['inference_time_seconds']} s")


# Step 4: emotion scores (top emotions plus "neutral" when strong enough).
print("\nChecking emotions...")
classifier = EnhancedEmotionClassifier()
sentiment = classifier.classify(corrected)
print(f"Classified emotion: {sentiment}")


Checking for grammar correction...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Corrected sentence: I just finished reading the report. I am glad it contains the information we expected!

Checking polarity...
Polarity of the sentence: affirmation
Inference time: 0.012177705764770508 s

Chacking subjectivity...
Sentence subjectivity: opinion
Inference time: 0.17078065872192383 s

Checking emotions...
Classified emotion: [{'emotion': 'gratitude', 'score': 0.9140950441360474}, {'emotion': 'realization', 'score': 0.7886174917221069}, {'emotion': 'caring', 'score': 0.7577161192893982}, {'emotion': 'neutral', 'score': 0.5822216868400574}]
