In [1]:
import gc
import multiprocessing

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import LEDForConditionalGeneration, LEDTokenizer
from transformers import pipeline
from transformers import set_seed

# Fix the random seed (42) once, before any model is loaded or run.
set_seed(42)

In [2]:
# Select the compute device in priority order: CUDA, then Apple MPS, then CPU.
# `n_gpu` is kept as a float and `device_name` is a human-readable label.
if torch.cuda.is_available():
    DEVICE, n_gpu = torch.device("cuda"), float(torch.cuda.device_count())
    device_name = torch.cuda.get_device_name(DEVICE)
elif torch.backends.mps.is_available():
    DEVICE, device_name, n_gpu = torch.device("mps"), "Apple Silicon Device", 1.0
else:
    DEVICE, device_name, n_gpu = torch.device("cpu"), "CPU", 0.0

n_cores = multiprocessing.cpu_count()

# Report the detected hardware, one line per fact.
print(
    f"Number of GPUs: {n_gpu} / Number of CPU Cores: {n_cores}",
    f"Training on {device_name} ({DEVICE})",
    sep="\n",
)

Number of GPUs: 1.0 / Number of CPU Cores: 8
Training on Apple Silicon Device (mps)


In [3]:
def free_memory(pipeline=None):
    """Run a garbage-collection pass and flush the accelerator's cached memory.

    Args:
        pipeline: Optional object whose *local* reference is dropped before
            collecting. This cannot release anything the caller still
            references: for the memory to actually be reclaimed, the caller
            must `del` its own variable before calling this function.
    """
    if pipeline is not None:
        # Removes only this function's reference; kept so existing
        # `free_memory(obj)` calls continue to work.
        del pipeline

    gc.collect()

    # Compare on the device *type* so an indexed device such as "cuda:0"
    # still triggers the cache flush (torch.device("cuda:0") != torch.device("cuda")).
    if DEVICE.type == "cuda":
        torch.cuda.empty_cache()
    elif DEVICE.type == "mps":
        torch.mps.empty_cache()

In [4]:
# Shared input for the summarization cells below: a multi-section passage about
# attention mechanisms (presumably an excerpt from a research paper — confirm source).
text = """Scaled Dot-Product Attention
We call our particular attention "Scaled Dot-Product Attention". The input consists of
queries and keys of dimension d_k, and values of dimension d_v. We compute the dot products of the
query with all keys, divide each by square root(d_k), and apply a softmax function to obtain the weights on the
values. In practice, we compute the attention function on a set of queries simultaneously, packed together
into a matrix Q. The keys and values are also packed together into matrices K and V .
The two most commonly used attention functions are additive attention, and dot-product (multiplicative) attention. 
Dot-product attention is identical to our algorithm, except for the scaling factor of square root(1/d_k). 
Additive attention computes the compatibility function using a feed-forward network with
a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is
much faster and more space-efficient in practice, since it can be implemented using highly optimized
matrix multiplication code.
While for small values of d_k the two mechanisms perform similarly, additive attention outperforms
dot product attention without scaling for larger values of d_k. We suspect that for large values of
d_k, the dot products grow large in magnitude, pushing the softmax function into regions where it has
extremely small gradients. To counteract this effect, we scale the dot products by square root(1/d_k).

Multi-Head Attention
Instead of performing a single attention function with dmodel-dimensional keys, values and queries,
we found it beneficial to linearly project the queries, keys and values h times with different, learned
linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of
queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional
output values. These are concatenated and once again projected, resulting in the final values.
Multi-head attention allows the model to jointly attend to information from different representation
subspaces at different positions. With a single attention head, averaging inhibits this.

In this work we employ h = 8 parallel attention layers, or heads. For each of these we use
d_k = d_v = d_model/h = 64. Due to the reduced dimension of each head, the total computational cost
is similar to that of single-head attention with full dimensionality.

Applications of Attention in our Model
The Transformer uses multi-head attention in three different ways:
- In "encoder-decoder attention" layers, the queries come from the previous decoder layer,
and the memory keys and values come from the output of the encoder. This allows every
position in the decoder to attend over all positions in the input sequence. This mimics the
typical encoder-decoder attention mechanisms in sequence-to-sequence models.
- The encoder contains self-attention layers. In a self-attention layer all of the keys, values
and queries come from the same place, in this case, the output of the previous layer in the
encoder. Each position in the encoder can attend to all positions in the previous layer of the
encoder.
- Similarly, self-attention layers in the decoder allow each position in the decoder to attend to
all positions in the decoder up to and including that position. We need to prevent leftward
information flow in the decoder to preserve the auto-regressive property. We implement this
inside of scaled dot-product attention by masking out (setting to minus infinity) all values in the input
of the softmax which correspond to illegal connections.

Position-wise Feed-Forward Networks
In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully
connected feed-forward network, which is applied to each position separately and identically. This
consists of two linear transformations with a ReLU activation in between.
While the linear transformations are the same across different positions, they use different parameters
from layer to layer. Another way of describing this is as two convolutions with kernel size 1.
The dimensionality of input and output is d_model = 512, and the inner-layer has dimensionality
d_f_f = 2048.

Embeddings and Softmax
Similarly to other sequence transduction models, we use learned embeddings to convert the input
tokens and output tokens to vectors of dimension dmodel. 
We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. 
In our model, we share the same weight matrix between the two embedding layers and the pre-softmax
linear transformation. In the embedding layers, we multiply those weights by square root(d_dmodel).
"""

In [6]:
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

# Summarize `text` with the BigBird-Pegasus arXiv checkpoint using beam search.
bigbird_checkpoint = "google/bigbird-pegasus-large-arxiv"

tokenizer = AutoTokenizer.from_pretrained(bigbird_checkpoint)

# Encoder attention defaults to `block_sparse` (num_random_blocks=3, block_size=64);
# decoder attention cannot be changed and is always "original_full".
# Optional keyword overrides for `from_pretrained`:
#   attention_type="original_full"       -> full attention in the encoder
#   block_size=16, num_random_blocks=2   -> a different sparse layout
model = BigBirdPegasusForConditionalGeneration.from_pretrained(bigbird_checkpoint)
model = model.to(DEVICE)

inputs = tokenizer(text, return_tensors="pt").to(DEVICE)

# Decoding settings: 4-beam search with early stopping, a repetition penalty,
# and an upper bound of 250 on the generated length.
generation_options = dict(
    num_beams=4,
    early_stopping=True,
    repetition_penalty=1.8,
    max_length=250,
)

# Generate token ids and turn them straight back into strings.
prediction = tokenizer.batch_decode(model.generate(**inputs, **generation_options))
print(prediction)

['<s> in this paper , we present an efficient method for the preparation of multipartite entanglement between two remote parties .<n> our approach is based on the construction of a system of interacting subsystems , which are called multiply - multiply entangled systems ( multiply - multiply eigenvectors ) .<n> we show that such systems can be used to perform distributed quantum computation .<n> additionally , we show that it is possible to use these systems as a quantum computer .<n> finally , we discuss the performance and limitations of our approach . <n> [ [ section ] ] in this paper , we present an efficient method for the preparation of multipartite entanglement between two remote parties .<n> our approach is based on the construction of a system of interacting subsystems , which are called multiply - multiply entangled systems ( multiply - multiply eigenvectors ) .<n> we show that such systems can be used to perform distributed quantum computation .<n> additionally , we show tha

# Playground

In [None]:
# Summarize `text` with facebook/bart-large-cnn, then release the model.
model_name = "facebook/bart-large-cnn"
summarizer = pipeline("summarization", model=model_name, device=DEVICE, min_length=100, max_length=2000)

# truncation=True clips an over-long input to the tokenizer's maximum length
# instead of handing it to the model as-is.
# NOTE(review): `text` looks longer than this model's input window — confirm.
outputs = summarizer(text, truncation=True)
print(outputs[0]['summary_text'])

# Drop the notebook's own reference *before* collecting: while `summarizer`
# is still bound here, gc/empty_cache cannot release the model's memory.
del summarizer
free_memory()

In [None]:
# Summarize `text` with google/pegasus-xsum, then release the model.
model_name = "google/pegasus-xsum"
summarizer = pipeline("summarization", model=model_name, device=DEVICE)

# Run the model the same way the neighbouring cells do; previously the pipeline
# was loaded but never used or released.
# truncation=True clips an over-long input to the tokenizer's maximum length.
# NOTE(review): `text` is presumably longer than this model's input window — confirm.
outputs = summarizer(text, truncation=True)
print(outputs[0]['summary_text'])

# Drop the notebook's own reference before collecting so the memory can be reclaimed.
del summarizer
free_memory()

In [None]:
# Summarize `text` with suriya7/bart-finetuned-text-summarization, then release the model.
model_name = "suriya7/bart-finetuned-text-summarization"
summarizer = pipeline("summarization", model=model_name, device=DEVICE, min_length=100, max_length=2000)

# truncation=True clips an over-long input to the tokenizer's maximum length
# instead of handing it to the model as-is.
# NOTE(review): `text` looks longer than this model's input window — confirm.
outputs = summarizer(text, truncation=True)
print(outputs[0]['summary_text'])

# Drop the notebook's own reference *before* collecting: while `summarizer`
# is still bound here, gc/empty_cache cannot release the model's memory.
del summarizer
free_memory()

In [None]:
# Summarize `text` with Falconsai/text_summarization, then release the model.
model_name = "Falconsai/text_summarization"
summarizer = pipeline("summarization", model=model_name, device=DEVICE)

outputs = summarizer(text)
print(outputs[0]['summary_text'])

# Drop the notebook's own reference *before* collecting: while `summarizer`
# is still bound here, gc/empty_cache cannot release the model's memory.
del summarizer
free_memory()

In [None]:
# Try prompting google/bigbird-pegasus-large-arxiv through the text-generation pipeline.
# NOTE(review): elsewhere in this notebook the same checkpoint is loaded as a
# conditional-generation model — confirm "text-generation" is the intended task.
model_name = "google/bigbird-pegasus-large-arxiv"
summarizer = pipeline("text-generation", model=model_name, device=DEVICE, max_length=2048)

# Build the prompt in its own variable. Rebinding `text` would prepend the
# instruction again on every re-run and leak it into all later cells that read `text`.
prompted_text = "Please give me a short summarization of the following section of a paper: " + text

outputs = summarizer(prompted_text)
print(outputs[0]['generated_text'])

# Drop the notebook's own reference *before* collecting: while `summarizer`
# is still bound here, gc/empty_cache cannot release the model's memory.
del summarizer
free_memory()

In [None]:
# Summarize `text` with LED (arXiv checkpoint) by calling the model directly.
led_checkpoint = "allenai/led-large-16384-arxiv"
tokenizer = LEDTokenizer.from_pretrained(led_checkpoint)

input_ids = tokenizer(text, return_tensors="pt").input_ids.to(DEVICE)

# Global-attention mask: zero everywhere except the first token of each sequence.
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1

model = LEDForConditionalGeneration.from_pretrained(led_checkpoint, return_dict_in_generate=True).to(DEVICE)

# With return_dict_in_generate=True, generate() returns an object whose
# `.sequences` attribute holds the generated token ids.
sequences = model.generate(input_ids, global_attention_mask=global_attention_mask).sequences

# skip_special_tokens=True keeps markers such as <s> / </s> out of the printed summary.
summary = tokenizer.batch_decode(sequences, skip_special_tokens=True)
print(summary)

# Release the model now; otherwise it stays resident while later cells load
# further models (including this same checkpoint via `pipeline`).
del model
free_memory()

In [None]:
# Summarize `text` with allenai/led-large-16384-arxiv via the pipeline API, then release the model.
model_name = "allenai/led-large-16384-arxiv"
summarizer = pipeline("summarization", model=model_name, device=DEVICE)

outputs = summarizer(text)
print(outputs[0]['summary_text'])

# Drop the notebook's own reference *before* collecting: while `summarizer`
# is still bound here, gc/empty_cache cannot release the model's memory.
del summarizer
free_memory()

In [None]:
# Run Callidior/bert2bert-base-arxiv-titlegen on `text` via the pipeline API, then release the model.
model_name = "Callidior/bert2bert-base-arxiv-titlegen"
summarizer = pipeline("summarization", model=model_name, device=DEVICE)

# truncation=True clips an over-long input to the tokenizer's maximum length
# instead of handing it to the model as-is.
# NOTE(review): `text` is presumably longer than this model's input window — confirm.
outputs = summarizer(text, truncation=True)
print(outputs[0]['summary_text'])

# Drop the notebook's own reference *before* collecting: while `summarizer`
# is still bound here, gc/empty_cache cannot release the model's memory.
del summarizer
free_memory()

In [None]:
# Run Callidior/bert2bert-base-arxiv-titlegen on `text` by calling the model directly.
# Tokenizer and model are loaded from the SAME checkpoint: the previous version
# paired this checkpoint's tokenizer with the LED model, so the token ids did
# not belong to the model's vocabulary.
titlegen_checkpoint = "Callidior/bert2bert-base-arxiv-titlegen"

tokenizer = AutoTokenizer.from_pretrained(titlegen_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(titlegen_checkpoint).to(DEVICE)

# Clip the input to the encoder's position limit as declared in the model config.
# NOTE(review): assumes the checkpoint is an encoder-decoder config exposing
# `config.encoder.max_position_embeddings` — confirm.
encoded = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=model.config.encoder.max_position_embeddings,
)
input_ids = encoded.input_ids.to(DEVICE)
attention_mask = encoded.attention_mask.to(DEVICE)

# No global attention mask here: that argument is specific to the LED model.
sequences = model.generate(input_ids, attention_mask=attention_mask)

# skip_special_tokens=True keeps tokenizer control tokens out of the printed result.
summary = tokenizer.batch_decode(sequences, skip_special_tokens=True)
print(summary)

# Release the model once the result is printed.
del model
free_memory()