<a href="https://colab.research.google.com/github/EdissonMC/SentimentAnalyzerSwitchTokenizers/blob/main/sentimentanalyzer_s_tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers torchtext gradio matplotlib pandas spacy
!python -m spacy download en_core_web_sm



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 148, in _get_module_details
  File "<frozen runpy>", line 112, in _get_module_details
  File "/usr/local/lib/python3.11/dist-packages/spacy/__init__.py", line 6, in <module>
  File "/usr/local/lib/python3.11/dist-packages/spacy/errors.py", line 3, in <module>
    from .compat import Literal
  File "/usr/local/lib/python3.11/dist-packages/spacy/compat.py", line 4, in <module>
    from thinc.util import copy_array
  File "/usr/local/lib/p

In [None]:
!pip install numpy==1.26.4 --force-reinstall

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.4
    Uninstalling numpy-2.2.4:
      Successfully uninstalled numpy-2.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but yo

In [None]:
import torch
import time
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer
import torchtext
from torchtext.data.utils import get_tokenizer as torchtext_tokenizer
import gradio as gr

In [None]:

class SentimentAnalyzer:
    """Binary sentiment classifier that can be driven by interchangeable tokenizers.

    A single sequence-classification model is loaded once; ``analyze`` then
    tokenizes the input with the tokenizer selected by name, runs the model and
    reports class probabilities together with timing statistics.

    Attributes:
        model_name: Hugging Face identifier of the classification checkpoint.
        model: The loaded sequence-classification model.
        tokenizers: Mapping from tokenizer name to a callable tokenizer. Names
            starting with ``"torchtext"`` return a list of string tokens and
            are converted to ids by ``preprocess_torchtext``; the others are
            Hugging Face tokenizers that return tensors directly.
        vocab: Token -> id mapping of the default tokenizer, used to convert
            torchtext tokens to ids.
    """

    # Maximum sequence length, special tokens included, produced by the
    # torchtext preprocessing path.
    MAX_LENGTH = 512

    def __init__(self):
        """Load the model and build the table of available tokenizers."""
        self.model_name = "distilbert-base-uncased-finetuned-sst-2-english"
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

        self.tokenizers = {
            "transformers_default": AutoTokenizer.from_pretrained(self.model_name),
            "transformers_wordpiece": BertTokenizer.from_pretrained("bert-base-uncased"),
            "torchtext_basic": torchtext_tokenizer("basic_english"),
            "torchtext_spacy": torchtext_tokenizer("spacy", language="en_core_web_sm")
        }

        self.vocab = self.tokenizers["transformers_default"].get_vocab()

    def preprocess_torchtext(self, text, tokenizer_name):
        """Tokenize ``text`` with a torchtext tokenizer and map tokens to model ids.

        Args:
            text: Raw input string.
            tokenizer_name: Key into ``self.tokenizers`` whose value returns a
                list of string tokens when called with ``text``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"``, each a tensor of
            shape ``(1, sequence_length)`` with ``sequence_length <= MAX_LENGTH``.
            The sequence always starts with ``[CLS]`` and ends with ``[SEP]``.

        Raises:
            KeyError: If ``tokenizer_name`` is not a registered tokenizer.
        """
        tokenize = self.tokenizers[tokenizer_name]
        unk_id = self.vocab.get("[UNK]", 100)
        cls_id = self.vocab.get("[CLS]", 101)
        sep_id = self.vocab.get("[SEP]", 102)

        token_ids = []
        for token in tokenize(text):
            token_id = self.vocab.get(token)
            if token_id is None:
                # The vocabulary belongs to an uncased checkpoint, while some
                # tokenizers (e.g. spaCy) preserve case: retry in lower case
                # before giving up and using [UNK].
                token_id = self.vocab.get(token.lower(), unk_id)
            token_ids.append(token_id)

        # Truncate the content *before* adding the special tokens so that the
        # closing [SEP] is never cut off on long inputs.
        token_ids = token_ids[: self.MAX_LENGTH - 2]
        input_ids = [cls_id] + token_ids + [sep_id]
        attention_mask = [1] * len(input_ids)
        return {
            "input_ids": torch.tensor([input_ids]),
            "attention_mask": torch.tensor([attention_mask])
        }

    @staticmethod
    def _interpret(positive_score, negative_score):
        """Turn the two class probabilities into a short verbal interpretation."""
        if positive_score > 0.75:
            return "Texto muy positivo"
        if positive_score > 0.5:
            return "Texto ligeramente positivo"
        if negative_score > 0.75:
            return "Texto muy negativo"
        return "Texto ligeramente negativo"

    def analyze(self, text, tokenizer_name="transformers_default"):
        """Classify ``text`` and measure tokenization / prediction time.

        Args:
            text: Raw input string.
            tokenizer_name: Key into ``self.tokenizers`` selecting the tokenizer.

        Returns:
            A ``(result, interpretation, performance)`` tuple where ``result``
            maps ``"Positivo"`` / ``"Negativo"`` to probabilities,
            ``interpretation`` is a short verbal summary and ``performance``
            holds the tokenizer name, timings in milliseconds and token count.

        Raises:
            KeyError: If ``tokenizer_name`` is not a registered tokenizer.
        """
        tokenizer = self.tokenizers[tokenizer_name]

        # perf_counter is monotonic and high-resolution, which makes it the
        # appropriate clock for measuring short intervals.
        start_time = time.perf_counter()
        if tokenizer_name.startswith("torchtext"):
            inputs = self.preprocess_torchtext(text, tokenizer_name)
        else:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        tokenization_time = time.perf_counter() - start_time

        # Forward only the tensors the model understands. BERT-style tokenizers
        # also emit "token_type_ids", which DistilBERT's forward() does not
        # accept; passing the raw encoding would raise a TypeError.
        model_inputs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
        }
        num_tokens = model_inputs["input_ids"].shape[1]

        with torch.no_grad():
            prediction_start = time.perf_counter()
            outputs = self.model(**model_inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            prediction_time = time.perf_counter() - prediction_start

        # Index 1 is read as the positive class, index 0 as the negative class.
        positive_score = predictions[0][1].item()
        negative_score = predictions[0][0].item()

        result = {
            "Positivo": positive_score,
            "Negativo": negative_score
        }

        interpretation = self._interpret(positive_score, negative_score)

        performance = {
            "Tokenizador": tokenizer_name,
            "Tiempo de tokenización (ms)": round(tokenization_time * 1000, 2),
            "Tiempo de predicción (ms)": round(prediction_time * 1000, 2),
            "Tiempo total (ms)": round((tokenization_time + prediction_time) * 1000, 2),
            "Número de tokens": num_tokens
        }

        return result, interpretation, performance


In [None]:
# Smoke test: classify one clearly positive sentence with the default tokenizer
# and print the scores, the verbal interpretation and the timing statistics.
analyzer = SentimentAnalyzer()
text = "I absolutely loved this movie. It was amazing!"
result, interpretation, performance = analyzer.analyze(
    text, tokenizer_name="transformers_default"
)

for label, value in (
    ("Resultado:", result),
    ("Interpretación:", interpretation),
    ("Estadísticas:", performance),
):
    print(label, value)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Resultado: {'Positivo': 0.9998804330825806, 'Negativo': 0.00011954931687796488}
Interpretación: Texto muy positivo
Estadísticas: {'Tokenizador': 'transformers_default', 'Tiempo de tokenización (ms)': 9.59, 'Tiempo de predicción (ms)': 74.82, 'Tiempo total (ms)': 84.41, 'Número de tokens': 12}
