In [12]:
!pip install -q transformers

In [13]:
import torch
from transformers import XLNetLMHeadModel, XLNetTokenizer,pipeline
import re
import os
import requests
from tqdm.auto import tqdm

In [14]:
# Load the pretrained tokenizer stored under "Rostlab/prot_xlnet";
# lower-casing is switched off so the input letters keep their case.
tokenizer = XLNetTokenizer.from_pretrained(
    "Rostlab/prot_xlnet",
    do_lower_case=False,
)

In [15]:
# Load the pretrained XLNet language-model-head weights stored under the same
# "Rostlab/prot_xlnet" identifier that the tokenizer cell uses.
model = XLNetLMHeadModel.from_pretrained(
    "Rostlab/prot_xlnet"
)

In [17]:
# Build the prompt: space-separated residue letters in which every U, Z, O or B
# is replaced by the "<unk>" placeholder before tokenization.
rare_residue_pattern = re.compile(r"[UZOB]")
sequences_Example = rare_residue_pattern.sub("<unk>", "A E T C Z A O")

In [18]:
# Convert the prepared prompt into token ids without adding special tokens.
ids = tokenizer.encode(
    sequences_Example,
    add_special_tokens=False,
)

In [19]:
# Drop any id that is not below the tokenizer's vocabulary size; if nothing is
# left, fall back to a single unknown-token id so the prompt is never empty.
vocab_limit = tokenizer.vocab_size
filtered_ids = [tid for tid in ids if tid < vocab_limit] or [tokenizer.unk_token_id]

# Wrap the ids in an outer list to get a tensor with a leading dimension of 1.
input_ids = torch.tensor([filtered_ids])

In [20]:
# Settings forwarded to `model.generate` in the generation cell.

# Output size
max_length = 100            # passed as `max_length`
num_return_sequences = 3    # passed as `num_return_sequences`

# Sampling controls
temperature = 1.0           # passed as `temperature`
k = 0                       # passed as `top_k`
p = 0.9                     # passed as `top_p`
repetition_penalty = 1.0    # passed as `repetition_penalty`

In [21]:
# Seed PyTorch's random number generator right before sampling so that
# re-running this cell (or Restart Kernel -> Run All) yields the same sequences.
torch.manual_seed(42)

# Sample `num_return_sequences` continuations of the prompt in `input_ids`,
# using the settings defined in the configuration cell. `do_sample=True`
# selects stochastic sampling, which is why the seed above is needed.
output_ids = model.generate(
    input_ids=input_ids,
    max_length=max_length,
    temperature=temperature,
    top_k=k,
    top_p=p,
    repetition_penalty=repetition_penalty,
    do_sample=True,
    num_return_sequences=num_return_sequences,
)

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (-1). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


In [22]:
# Turn every generated row of token ids back into text, leaving special tokens
# out of the decoded strings.
output_sequences = tokenizer.batch_decode(output_ids, skip_special_tokens=True)


In [23]:
# Show a header line, then each decoded sequence on its own line.
print('Generated Sequences\n')
for generated_text in output_sequences:
    print(generated_text)

Generated Sequences

A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C
A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C A A A P A E T C
A E T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C A A A P E Q T C
