In [None]:
%pip install transformers
%pip3 install torch torchvision torchaudio

In [1]:
# make sentiment analysis with default pretrained model
from transformers import pipeline

sentiment = pipeline("sentiment-analysis")
print(sentiment("I like icecream"))

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.8968188166618347}]


In [27]:
# make sentiment analysis with choosen model
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# full syntaxis
# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# sentiment = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# short syntaxis
sentiment = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")

print(sentiment("I like icecream"))

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'positive', 'score': 0.8467546701431274}]




In [7]:
# methods of tokenizer

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
text = "What is the colours of icecream?"

tokenized_text = tokenizer(text)
print(f"tokenized_text:{tokenized_text}")

tokens = tokenizer.tokenize(text)
print(f"tokens:{tokens}")

ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"ids:{ids}")
print("Words count:", text.count(" ") + 1)
print("tokens count:", len(ids))

decoded_string = tokenizer.decode(ids)
print(f"decoded_string:{decoded_string}")

tokenized_text:{'input_ids': [0, 2264, 16, 5, 13353, 9, 2480, 34806, 116, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tokens:['What', 'Ġis', 'Ġthe', 'Ġcolours', 'Ġof', 'Ġice', 'cream', '?']
ids:[2264, 16, 5, 13353, 9, 2480, 34806, 116]
Words count: 6
tokens count: 8
decoded_string:What is the colours of icecream?




In [36]:
# using PyTorch with Huggingface model
import torch
import torch.nn.functional as f
from transformers import pipeline

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")


X_train = [
    "Python code is easy-readable",
    "Specialized models have great performance despite small size, but only in spesific field",
    "Python and anaconda belong to different groups of snakes",
    "Anaconda is not a species of boa"
]

# With huggingface's transformers.pipeline only
sentiment = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
result = sentiment(X_train)
print(result)

# Same activity with PyTorch, step by step
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch, end="\n--------------------\n")

with torch.no_grad():
    outputs = model(**batch)
    print(outputs)
    predictions = f.softmax(outputs.logits, dim=1)
    print(predictions)
    labels = torch.argmax(predictions, dim=1)
    print(labels)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'positive', 'score': 0.8435574769973755}, {'label': 'positive', 'score': 0.8998537659645081}, {'label': 'neutral', 'score': 0.5533633232116699}, {'label': 'neutral', 'score': 0.5480014085769653}]
{'input_ids': tensor([[    0, 48659,  3260,    16,  1365,    12, 46753,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    0, 25110,  1538,  3092,    33,   372,   819,  1135,   650,  1836,
             6,    53,   129,    11,  2292,   293,  8685,   882,     2],
        [    0, 48659,     8,    41,  1043, 11192,  9943,     7,   430,  1134,
             9, 24328,     2,     1,     1,     1,     1,     1,     1],
        [    0,  4688,  1043, 11192,    16,    45,    10,  4707,     9,  5276,
           102,     2,     1,     1,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1,

In [7]:
# text generation
generator = pipeline("text-generation", model="distilgpt2")
res = generator("Every morning I", max_length=30, num_return_sequences=2,)
for item in res:
    print(item)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'generated_text': "Every morning I received a letter from a local man who is not a lawyer or even an estate attorney. He said 'I'd rather be a lawyer"}
{'generated_text': 'Every morning I was working on something a little bit, and the people were looking away for me," he said, leaning his head against the wall.'}


In [21]:
# text clasification to choosen categories
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
res = classifier(
    "Author expresses his deepest feelings, every curve of the creature shout about emotional agony",
    candidate_labels=["music", "painting", "sculpture"]
)
print(res['labels'])
print(res['scores'])




['sculpture', 'painting', 'music']
[0.6227449178695679, 0.24915392696857452, 0.1281011700630188]


In [None]:
%pip install protobuf

In [20]:
#summarizing, Ukr language
from transformers import pipeline, AutoTokenizer

text = """
В самому серці густих лісів Карпат ммешкає загадкова й чарівна пташка Ауроліс. Це істота, яку ніколи не бачили на власні очі,
але її присутність відчувається в шелесті листя та ехо її мелодійних пісень. Легенди про Ауроліс живуть у переказах місцевих мешканців,
які поколіннями передають знання про це дивовижне створіння.
Ауроліс це птах із неймовірно яскравим оперенням, яке переливається всіма кольорами веселки. Його крила сягають розмаху до двох метрів,
що робить його неймовірно елегантним у польоті. Найбільш унікальною особливістю Ауролісу є його хвіст, що світиться м'яким золотавим сяйвом
темряві, створюючи враження зоряного сліду в небі. Через цю особливість його ще називають Зоряним Ластівком.
"""

model = "ukr-models/uk-summarizer"
tokenizer = AutoTokenizer.from_pretrained("ukr-models/uk-summarizer")

tokens = tokenizer.tokenize(text)
print(f"tokens:{tokens}")

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
print(summarizer(text, max_length=100, min_length=50))

tokens:['▁В', '▁сам', 'ому', '▁сер', 'ці', '▁', 'густ', 'их', '▁ліс', 'ів', '▁Карп', 'ат', '▁', 'м', 'мешк', 'ає', '▁загад', 'кова', '▁', 'й', '▁чар', 'івна', '▁', 'пта', 'шка', '▁Ау', 'ролі', 'с', '.', '▁Це', '▁істо', 'та', ',', '▁', 'яку', '▁', 'ні', 'коли', '▁не', '▁бач', 'или', '▁на', '▁власн', 'і', '▁', 'очі', ',', '▁але', '▁', 'ї', 'ї', '▁присут', 'ність', '▁від', 'чу', 'вається', '▁в', '▁', 'шел', 'есті', '▁лист', 'я', '▁та', '▁', 'е', 'хо', '▁', 'ї', 'ї', '▁мелод', 'ійних', '▁піс', 'ень', '.', '▁Лег', 'енди', '▁про', '▁Ау', 'ролі', 'с', '▁жив', 'уть', '▁у', '▁пере', 'казах', '▁', 'місц', 'ев', 'их', '▁ме', 'шкан', 'ців', ',', '▁', 'які', '▁покол', 'і', 'ннями', '▁перед', 'ають', '▁зна', 'ння', '▁про', '▁це', '▁див', 'ови', 'жне', '▁створ', 'іння', '.', '▁Ау', 'ролі', 'с', '▁це', '▁п', 'тах', '▁із', '▁не', 'ймов', 'ірно', '▁', 'я', 'скра', 'вим', '▁опер', 'енням', ',', '▁як', 'е', '▁пере', 'ли', 'вається', '▁вс', 'іма', '▁коль', 'орами', '▁весел', 'ки', '.', '▁Й', 'ого', '▁кри',