In [None]:
import re
import string
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    AutoTokenizer, 
    BertTokenizer, 
    GPT2Tokenizer,
    T5Tokenizer
)
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Fetch the NLTK resources this notebook relies on. This is best-effort: a
# resource that cannot be fetched (for example, no network access) is reported
# and the remaining resources are still attempted, instead of every failure
# being silently ignored.
for _resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    try:
        # nltk.download signals most failures by returning False, not by raising.
        if not nltk.download(_resource, quiet=True):
            print(f"Warning: Could not download NLTK resource '{_resource}'")
    except Exception as e:
        print(f"Warning: Could not download NLTK resource '{_resource}': {e}")

In [None]:
class TextPreprocessor:
    def __init__(self):
        self.setup_tokenizers()
        self.setup_nltk_tools()
        
    def setup_tokenizers(self):
        """Initialize various tokenizers"""
        try:
            self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
            self.gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
            self.t5_tokenizer = AutoTokenizer.from_pretrained('t5-small')
            print("✓ Tokenizers loaded successfully")
        except Exception as e:
            print(f"Warning: Could not load some tokenizers: {e}")
            self.bert_tokenizer = None
            self.gpt2_tokenizer = None
            self.t5_tokenizer = None
    
    def setup_nltk_tools(self):
        """Initialize NLTK tools"""
        try:
            self.stop_words = set(stopwords.words('english'))
            self.stemmer = PorterStemmer()
            self.lemmatizer = WordNetLemmatizer()
        except:
            self.stop_words = set()
            self.stemmer = None
            self.lemmatizer = None