In [None]:
import torch
!pip install transformers
from transformers import pipeline
import pandas as pd
!pip install datasets transformers[sentencepiece]
from transformers import GPT2Model, GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup,GPT2Config
from transformers import pipeline

# Pipeline: incorporate Tokenization, Forward, Postprocessing

In [3]:
text='I like the actor called Cheng Yi (original name: Fu Shiqi), who performs vividly in the latest drama Immortal Samsara'

## Only load GPT 2 bare architecture
* with random parameters

In [None]:
config=GPT2Config()
model=GPT2LMHeadModel(config)

#then we can use load to use trained parameters
model.load_state_dict(torch.load('path',map_location=torch.device('cpu')))

## For sentiment classification, [doc](https://huggingface.co/docs/transformers/v4.21.0/en/main_classes/pipelines#transformers.TextClassificationPipeline)

In [None]:
pipe=pipeline('text-classification', model=None, tokenizer=None)
outputs=pipe(text)
outputs=pd.DataFrame(outputs)
outputs

## For feature extraction
* Without any top layer
* Return the embeddings [768] of the current word

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
pipe1 = pipeline(task="feature-extraction",model=model,tokenizer=tokenizer)
#There are lots of different tasks other than"feature-extraction", see huggingface description.
#It is not necessary to set model=model or tokenizer=tokenizer, pipeline("feature-extraction") will use default models. 
#The line is the same as the following code.
    # inputs = tokenizer("hello", return_tensors="pt")
    # outputs = model(**inputs)
    # last_hidden_states = outputs.last_hidden_state

In [None]:
inputs = tokenizer("hello", return_tensors="pt")
model(**inputs) #=model(inputs['input_ids'],attention_mask=inputs['attention_mask']) 

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[-37.0707, -36.4855, -40.3520,  ..., -46.5167, -45.4142, -37.9090]]],
       grad_fn=<UnsafeViewBackward>), past_key_values=((tensor([[[[-7.0826e-01,  2.2258e+00,  6.1847e-01, -3.3636e-01,  1.5154e+00,
           -1.7025e-02,  1.2717e-02,  2.5720e-01, -1.5566e+00,  9.3795e-01,
            3.1825e-01,  7.8573e-01,  4.9001e-01,  8.4974e-01, -6.6905e-01,
           -4.8115e-01, -1.5217e-01,  8.0209e-01,  2.1202e+00, -4.7508e-01,
           -6.6709e-01,  3.5872e-01, -4.2612e-01, -1.1006e+00,  6.4983e-02,
           -8.5395e-01, -1.7462e-01, -6.3068e-01,  2.4067e-01, -1.4915e+00,
            2.6309e+00, -2.5473e-01, -7.7622e-01, -6.9709e-02, -4.1723e-02,
            5.0557e-02,  7.7904e-01,  2.3666e-01,  4.8333e-02,  1.1410e+00,
           -3.7447e-01,  3.5144e-01, -1.5779e+00, -1.4066e-01, -3.1211e-02,
           -5.4283e-01,  1.7680e+00, -7.8461e-01, -4.6734e-01,  8.1111e-03,
           -4.7365e-01,  1.4545e-01,  1.6624e+00,  1.

## For language modelling and training
* Return the logits [m,seq,50257], with each ID representing the next token
* model(**inputs,labels),we can use this to train our model, since it returns loss

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
pipe2 = pipeline(task="feature-extraction",model=model,tokenizer=tokenizer)
context="would you mind me smoking"
(pipe2(context)) #[1,5,50257]
#The code above is the same as the following:
#inputs="would you mind me smoking"
#inputs=tokenizer(inputs,return_tensors="pt")
#outputs=model(**inputs)

[[[-39.36349105834961,
   -38.66656494140625,
   -42.61349105834961,
   -41.72972869873047,
   -40.938907623291016,
   -41.88724136352539,
   -38.7708740234375,
   -39.84280014038086,
   -38.23600387573242,
   -41.296714782714844,
   -41.10597229003906,
   -35.584556579589844,
   -36.8923225402832,
   -35.959922790527344,
   -38.27095031738281,
   -41.04011535644531,
   -40.09268569946289,
   -40.15596389770508,
   -40.72275161743164,
   -41.022865295410156,
   -41.38331985473633,
   -41.71014404296875,
   -41.85360336303711,
   -41.672943115234375,
   -42.030433654785156,
   -37.447505950927734,
   -38.9921875,
   -41.88325119018555,
   -40.055789947509766,
   -40.756736755371094,
   -38.5909423828125,
   -41.83613204956055,
   -41.021278381347656,
   -41.96195983886719,
   -41.74087142944336,
   -42.28169631958008,
   -42.117958068847656,
   -42.374027252197266,
   -42.16175079345703,
   -42.27912139892578,
   -39.93644332885742,
   -42.79711151123047,
   -42.528282165527344,
   -42.

In [None]:
inputs = tokenizer("you are the you", return_tensors="pt")
outputs = model(**inputs, labels=torch.tensor([[15,11,11,13]]))
# outputs=model(**inputs,labels=inputs['input_ids']). Note that only the loss from the first token to the second-to-last token are computed. 
loss = outputs.loss

## Pipeline for question answering, [doc](https://huggingface.co/docs/transformers/v4.21.0/en/main_classes/pipelines#transformers.QuestionAnsweringPipeline)

In [None]:
reader=pipeline('question-answering')
question="What is the original name of the person he likes?"
outputs=reader(question=question,context=text)
pd.DataFrame([outputs])

## Pipeline for translation, [doc](https://huggingface.co/docs/transformers/v4.21.0/en/main_classes/pipelines#transformers.TranslationPipeline)

In [None]:
translator=pipeline('translation_en_to_zh',model='liam168/trans-opus-mt-en-zh')
outputs=translator(text,clean_up_tokenization_spaces=True)
outputs[0]['translation_text']

## For predicting masks

In [None]:
pipe=pipeline("fill-mask")
pipe("I <mask> you")

## Pipeline for NER, [doc](https://huggingface.co/docs/transformers/v4.21.0/en/main_classes/pipelines#transformers.TokenClassificationPipeline)

In [None]:
ner_tagger=pipeline("ner", model=None,aggregation_strategy='simple')
outputs=ner_tagger(text)
pd.DataFrame(outputs)

## For summarization

In [None]:
# we can use BART model, which is specialized in text summarization. 
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
text="Recently, the pre-trained language model, BERT (and its robustly optimized version RoBERTa), has attracted a lot of attention in natural language understanding (NLU), and achieved state-of-the-art accuracy in various NLU tasks, such as sentiment classification, natural language inference, semantic textual similarity and question answering. Inspired by the linearization exploration work of Elman [8], we extend BERT to a new model, StructBERT, by incorporating language structures into pre-training. Specifically, we pre-train StructBERT with two auxiliary tasks to make the most of the sequential order of words and sentences, which leverage language structures at the word and sentence levels, respectively. As a result, the new model is adapted to different levels of language understanding required by downstream tasks. The StructBERT with structural pre-training gives surprisingly good empirical results on a variety of downstream tasks, including pushing the state-of-the-art on the GLUE benchmark to 89.0 (outperforming all published models), the F1 score on SQuAD v1.1 question answering to 93.0, the accuracy on SNLI to 91.7. Our Summary The Alibaba research team suggests extending BERT to a new StructBERT language model by leveraging word-level and sentence-level ordering. To capture the linguistic structures during the pre-training procedure, they extend the BERT model with the word structural objective and the sentence structural objective. As a result, the StructBERT model is forced to reconstruct the right order of words and sentences. The experiments demonstrate that the introduced model significantly advances the state-of-the-art results on a variety of natural language understanding tasks, including sentiment analysis and question answering."

inputs = tokenizer([text], max_length=1024, return_tensors='pt')
output_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=False)
for ids in output_ids:
    summary=tokenizer.decode(ids,skip_special_tokens=True,clean_up_tokenization_spaces=False)
    print (summary)

## For text generation

### Model loading
* We can use model.generate()
* We can use pipeline

In [None]:
# Pipeline
from transformers import pipeline
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
generator = pipeline('text-generation', model=model,tokenizer=tokenizer)
outputs=generator("Who is Taylor Swift?", max_length=200,clean_up_tokenization_spaces=True, num_return_sequences=1,do_sample=False)
outputs[0]['generated_text']

In [None]:
# Use model.generate, the same as above
inputs=tokenizer("Who is Taylor Swift?",return_tensors='pt')
outputs=model.generate(**inputs,max_length=200,clean_up_tokenization_spaces=True, num_return_sequences=1,do_sample=False)
outputs=tokenizer.decode(outputs[0])
outputs

### How to generate the next word
* We can use model.generate(), and there are lots of parameters/functions we can set.
    * return_dict_in_generate=True, to return multiple outputs with Tensors!
    * Other parameters: max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, length_penalty, no_repeat_ngram_size, num_return_sequences, max_time, output_scores, return_dict_in_generate , output_attentions, output_hidden_states, return_dict_in_generate. https://huggingface.co/docs/transformers/main_classes/model#transformers.generation_utils.GenerationMixin.generate.no_repeat_ngram_size
* Greedy search
* Beam search
* Sampling
    * Top-k
    * Top-p

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

#### Greedy search
* Without any configuration

In [None]:
inputs=tokenizer('the the the the',return_tensors='pt')
outputs=model.generate(**inputs,max_length=1, do_sample=False,output_scores=True,return_dict_in_generate=True)
print (outputs)
#print (tokenizer.decode(list(outputs[0])))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 4, but ``max_length`` is set to 1. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


GreedySearchDecoderOnlyOutput(sequences=tensor([[1169,  262,  262,  262,  262]]), scores=(tensor([[-64.3933, -63.5729, -67.5316,  ..., -69.8050, -68.0332, -63.0680]]),), attentions=None, hidden_states=None)


In [None]:
tokenizer.convert_ids_to_tokens(50255)
tokenizer("promise")

{'input_ids': [16963, 786], 'attention_mask': [1, 1]}

#### Sampling
* (Greedy search with sampling is better for my risk identification purpose! also beam with do_sample)
* do_sample
* temperature
* top_k
* top_p

In [None]:
inputs=tokenizer('the cyber threats of a company include',return_tensors='pt')

outputs=model.generate(**inputs, min_length=20,max_length=20, do_sample=False, num_return_sequences=1)
#for i in range (10):
#    print (i,tokenizer.decode(outputs[i]))
print (tokenizer.decode(outputs[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


the cyber threats of a company include the theft of personal information, the unauthorized disclosure of personal information,


#### Beam search
* num_beams
* early_stopping
* no_repeat_ngram_size

In [None]:
import time
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
inputs=tokenizer('cybersecurity threats include',return_tensors='pt')
time1=time.time()
outputs=model.generate(**inputs, max_length=30, num_beams=2, early_stopping=True,no_repeat_ngram_size=2,num_return_sequences=1)
print (tokenizer.decode(outputs[0]))
time2=time.time()
time2-time1