In [1]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [2]:
# Checkpoint id used for sentiment analysis.
# Model card: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Build the pipeline straight from the checkpoint id.
classifier = pipeline(task="sentiment-analysis", model=model_name)

In [17]:
# Score two sentences in one pipeline call; `res` receives one result per sentence.
res = classifier(
    [
        "I love being loved by happy people",
        "Russia's western tension can favour vale3 pressure nicel price according to analysts btg",
    ]
)

In [18]:
# Show every prediction dict on its own line.
for entry in res:
    print(entry)

{'label': 'POSITIVE', 'score': 0.999406099319458}
{'label': 'NEGATIVE', 'score': 0.9888573884963989}


In [3]:
# Single-string call: the pipeline still returns a list, here holding one label/score dict.
classifier("I love being loved by happy people")

[{'label': 'POSITIVE', 'score': 0.999870777130127}]

In [20]:
# Load the tokenizer and the sequence-classification model explicitly from the
# same checkpoint id, so both objects can be used directly further down.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [21]:

# Rebuild the pipeline from the explicitly loaded model and tokenizer objects.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

In [22]:
# Run the rebuilt pipeline on the two example sentences.
res = classifier(
    [
        "I love being loved by happy people",
        "Russia's western tension can favour vale3 pressure nicel price according to analysts btg.",
    ]
)

In [23]:
# Show every prediction dict on its own line.
for entry in res:
    print(entry)

{'label': 'POSITIVE', 'score': 0.999870777130127}
{'label': 'NEGATIVE', 'score': 0.9888573884963989}


In [27]:
# Walk through the tokenizer stages for one sentence.
sentence = "Russia's western tension can favour vale3 pressure nicel price according to analysts btg."

tokens = tokenizer.tokenize(sentence)                # word pieces ("##" marks a continuation piece)
token_ids = tokenizer.convert_tokens_to_ids(tokens)  # vocabulary ids for those pieces
input_ids = tokenizer(sentence)                      # full encoding: 'input_ids' plus 'attention_mask'

# Print the three views, separated by blank lines.
print(tokens, token_ids, input_ids, sep="\n\n")


['russia', "'", 's', 'western', 'tension', 'can', 'favour', 'vale', '##3', 'pressure', 'nice', '##l', 'price', 'according', 'to', 'analysts', 'bt', '##g', '.']

[3607, 1005, 1055, 2530, 6980, 2064, 7927, 10380, 2509, 3778, 3835, 2140, 3976, 2429, 2000, 18288, 18411, 2290, 1012]

{'input_ids': [101, 3607, 1005, 1055, 2530, 6980, 2064, 7927, 10380, 2509, 3778, 3835, 2140, 3976, 2429, 2000, 18288, 18411, 2290, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [30]:
# Two-sentence mini batch fed to the tokenizer in the following cell.
train = [
    "I love being loved by happy people",
    "Russia's western tension can favour vale3 pressure nicel price according to analysts btg.",
]

In [31]:
# Encode both sentences together: pad the shorter one, truncate anything
# beyond 512 tokens, and return PyTorch tensors.
batch = tokenizer(
    train,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(batch)

{'input_ids': tensor([[  101,  1045,  2293,  2108,  3866,  2011,  3407,  2111,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  3607,  1005,  1055,  2530,  6980,  2064,  7927, 10380,  2509,
          3778,  3835,  2140,  3976,  2429,  2000, 18288, 18411,  2290,  1012,
           102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [35]:
# Manual forward pass with gradient tracking disabled.
with torch.no_grad():
    # Supplying `labels` makes the returned output include a loss tensor as well.
    outputs = model(labels=torch.tensor([1, 0]), **batch)
    print(outputs)

    # Normalize the logits across the class dimension.
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)

    # Index of the largest value in each row.
    labels = predictions.argmax(dim=1)
    print(labels)

    # Translate the indices into the label names stored in the model config.
    labels = [model.config.id2label[idx] for idx in labels.tolist()]
    print(labels)

SequenceClassifierOutput(loss=tensor(0.0053), logits=tensor([[-4.2900,  4.6641],
        [ 2.4757, -2.0806]]), hidden_states=None, attentions=None)
tensor([[1.2920e-04, 9.9987e-01],
        [9.8961e-01, 1.0392e-02]])
tensor([1, 0])
['POSITIVE', 'NEGATIVE']


In [None]:
# Bookmark note (presumably a timestamp in the tutorial video): Min 23:30
# Kept as a comment: as bare text it is a SyntaxError and stops the whole cell from running.

# Name of the target directory; save_pretrained creates it if it does not exist yet.
save_directory = "saved"
tokenizer.save_pretrained(save_directory)  # write the tokenizer files into the directory
model.save_pretrained(save_directory)      # write the model files into the directory

# Reload both objects from the local directory instead of the hub checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [1]:
# Rebind `model_name` to a different checkpoint id; the next cell loads the
# model and tokenizer from this value.
model_name = "vidhur2k/mBERT-Portuguese-Mono"

In [None]:
# Replace the previously loaded tokenizer and model with the ones published
# under the current `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Portuguese example sentences to classify.
textos = [
    "Com um resultado não tão bom",
    "As tensões ao ocidente da rússia podem beneficiar a vale por causa do preço do níquel",
    "Ela dirige um carro verde.",
    "Isso não foi muito bom",
    "Não tão ruim quanto esperado",
]

In [None]:
# Encode the Portuguese sentences as one padded batch, truncated at 512 tokens.
# return_tensors="pt" already yields PyTorch tensors, so no manual conversion
# of 'input_ids' is needed afterwards.
batch = tokenizer(
    textos,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [None]:
# Inference on the Portuguese batch with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**batch)

    # `predictions` must be derived from THIS forward pass. Without this line the
    # argmax below would read a `predictions` tensor left over from an earlier
    # cell (or raise NameError on a fresh kernel) and ignore `outputs` entirely.
    predictions = F.softmax(outputs.logits, dim=1)

    # Index of the largest probability in each row.
    labels_ids = torch.argmax(predictions, dim=1)
    print(labels_ids)

    # Translate the indices into the label names stored in the model config.
    labels = [model.config.id2label[label_id] for label_id in labels_ids.tolist()]
    print(labels)

In [None]:
# Reference video: https://www.youtube.com/watch?v=GSt00_-0ncQ&ab_channel=PythonEngineer
#
# Bookmark: min 31:30
    