# Sample inference code for SpiderGPT

In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
from tqdm.notebook import tqdm
import pandas as pd
import torch
import random
import numpy as np
import seaborn as sns
from transformers import get_linear_schedule_with_warmup
import time
import datetime
from matplotlib import pyplot as plt
from transformers import Trainer, TrainingArguments,DataCollatorForLanguageModeling
import re
from itertools import chain


trained_model_name='anon-genAI/spiderfiber-anon'
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(trained_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model_name = trained_model_name
 
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    trust_remote_code=True
).to(device)


In [33]:
prompt = "EstimateProperties<AAAGGAGQGGYGGQGAGQGAAAAAAGGAGQGGYGGQGAGQGAGAAAAAAGGAGQGGYGGLGSGQGGYGGQG\
 AGAAAAAAAAGGAGQGGYGGLGSGQGGYGGQGAGAAAAAAGGAGQGGYGGLGGQGAGQGSGAAAAAAGGAGQGGYGGQ \
 GAGQGAGAAAAAAGGAGQGGYGGLGGQGAGQGAAAAAAGGAGQ GGYGGQGAGQGAGAAAAAAGGAGQGGYGGLGSGQGGYGGQ \
 GAGAAAAAAGGAGQGGYGGLGGQGAGAAAAAAGGAGQGG YGGQGAGQGAAAAAAGGAGQGGYGGQGAGQGGYGGQGA GAAAAAAGGAGQGGYGGLGGQGAGQGAGAAA\
  AAAGGAGQGGYGGQGAGQGAGAAAAAAGGAGQGGYGGLGGQGA GAAAAAAGGAGQGGYGGQGAGQGGYGGQGSGAAAAAAAA \
  GGAGQGGYGGLGSQGAGQGAGAAAAAAGGAGQGGYGG QGAGQGAGAAAAAAGGAGQGGYGGQGAGQGAGAAAAAAGGAGQGG \
  YGGQGAGQGAGAAAAAAGGAGQGGYGGLGSGQGGY GGQGAGAAAAAAGGAGQGGYGGQGAGAAAASAAASRLSSPEASSGLS \
  GCDVLVQALLEVVSALIHILGSSSIGPVNYGSASQSTQIVGQSVYQALG>"
generated = torch.tensor(tokenizer.encode(prompt, add_special_tokens = False)).unsqueeze(0).to(device)
# print(generated.shape, generated)

sample_outputs = model.generate(
                                inputs=generated, 
                                eos_token_id =tokenizer.eos_token_id,
                                do_sample=False,   
                                top_k=500, 
                                max_length = 300,
                                top_p=0.9, 
                                num_return_sequences=1,
                                temperature=1,
                                max_new_tokens=400
                                ).to(device)

for i, sample_output in enumerate(sample_outputs):
    print(tokenizer.decode(sample_output, skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


EstimateProperties<AAAGGAGQGGYGGQGAGQGAAAAAAGGAGQGGYGGQGAGQGAGAAAAAAGGAGQGGYGGLGSGQGGYGGQG AGAAAAAAAAGGAGQGGYGGLGSGQGGYGGQGAGAAAAAAGGAGQGGYGGLGGQGAGQGSGAAAAAAGGAGQGGYGGQ  GAGQGAGAAAAAAGGAGQGGYGGLGGQGAGQGAAAAAAGGAGQ GGYGGQGAGQGAGAAAAAAGGAGQGGYGGLGSGQGGYGGQ  GAGAAAAAAGGAGQGGYGGLGGQGAGAAAAAAGGAGQGG YGGQGAGQGAAAAAAGGAGQGGYGGQGAGQGGYGGQGA GAAAAAAGGAGQGGYGGLGGQGAGQGAGAAA  AAAGGAGQGGYGGQGAGQGAGAAAAAAGGAGQGGYGGLGGQGA GAAAAAAGGAGQGGYGGQGAGQGGYGGQGSGAAAAAAAA   GGAGQGGYGGLGSQGAGQGAGAAAAAAGGAGQGGYGG QGAGQGAGAAAAAAGGAGQGGYGGQGAGQGAGAAAAAAGGAGQGG   YGGQGAGQGAGAAAAAAGGAGQGGYGGLGSGQGGY GGQGAGAAAAAAGGAGQGGYGGQGAGAAAASAAASRLSSPEASSGLS   GCDVLVQALLEVVSALIHILGSSSIGPVNYGSASQSTQIVGQSVYQALG> [0.23,0.22,0.36,0.35,0.34,0.38,0.43,0.09]


In [34]:
prompt = "GenerateSequence<0.177,0.222,0.082,0.065,0.225,0.241,0.266,0.515>"
generated = torch.tensor(tokenizer.encode(prompt, add_special_tokens = False)).unsqueeze(0).to(device)
print(generated.shape, generated)

sample_outputs = model.generate(
                                inputs=generated, 
                                eos_token_id =tokenizer.eos_token_id,
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                top_p=0.9, 
                                num_return_sequences=1,
                                temperature=1,
                                max_new_tokens=400
                                ).to(device)

for i, sample_output in enumerate(sample_outputs):
    print(tokenizer.decode(sample_output, skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


torch.Size([1, 57]) tensor([[ 43, 299,  73,  86,  69,  88,  73, 303,  32,  20,  18,  21,  27,  27,
          16,  20,  18,  22,  22,  22,  16,  20,  18,  20,  28,  22,  16,  20,
          18,  20,  26,  25,  16,  20,  18,  22,  22,  25,  16,  20,  18,  22,
          24,  21,  16,  20,  18,  22,  26,  26,  16,  20,  18,  25,  21,  25,
          34]], device='cuda:0')
GenerateSequence<0.177,0.222,0.082,0.065,0.225,0.241,0.266,0.515> [NDISSASSASAVSDGQGGYGQEQSPRAGTGSAGQDQVGYGGQGGVSASASAGVAGGAGTATEGGYGGPGAGSGGAGAPGGYGPAGPGTGSLNNQGGYGPGAGAGAAAVSSASVGAGSQGYGPSGYTSGTGASGPGGASGAAAAAAAATGGYGRAGPGAAAAAAAAGQGGYGQGGQGTGAAAAAAAGGSGGQGQGSGAAAAAAAASGQGGQGGYGQGGQSGQGGQGGYGQGGQGYGQQGAGAGAAAAAAAAAGQGGQGGYGQGGQGGYGQGSSGAAAAAAAAAAGGSGGQGGQGGYGQGGQGGYGQGAAAAAAAAAGGTGGQGGYGQGAGSGQGGQGGYGQGGQGGYGQGAAAAAAASGLSGQGRGAGQGGQGGYGQGGQGGYGQGAAAAAAAGGSGQGGYGQGPQIGQGSGAAAAAAAAAGRGGYGQGAGPGGAGQGGQGGYGQGGQSGQGGQGGYGQGGQGAGAAAAAAAAGGAGGAGRGGYGQGAGPGGAGAAAAAAAAAAGGQGGQGGYGQGGYGQGGIGGYSQRTAGAGSAAATGGQGPGGYGQGSGPRSASVAAAGGGQ

In [42]:
# prompt = "GenerateSilkContent<0.177,0.222,0.082,0.065,0.225,0.241,0.266,0.515>"
prompt = "Sequence<AAAG"
generated = torch.tensor(tokenizer.encode(prompt, add_special_tokens = False)).unsqueeze(0).to(device)
print(generated.shape, generated)

sample_outputs = model.generate(
                                inputs=generated, 
                                eos_token_id =tokenizer.eos_token_id,
                                do_sample=True,  
                                max_length = 200,
                                num_return_sequences=2,
                                max_new_tokens=300
                                ).to(device)

for i, sample_output in enumerate(sample_outputs):
    out_string=tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {}\n\n".format(i, out_string))
    num_tokens = len(out_string)
    if num_tokens < 50:
        print(f"⚠️ Output only contains {num_tokens} tokens — likely incomplete.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


torch.Size([1, 3]) tensor([[ 303,   32, 1988]], device='cuda:0')
0: Sequence<AAAGPGGYGPSQRGPSGPGSAAAAAAGAGPGGYGPGQQGPSGPGSAAAAATAAGGPGGYGPGQQGPGGYGPSGPSGPGGAGPYAAAAAAAAGGPGGYGPGAQGPSGPSNGPGQQGPGGYGPSGPGASAAAAAASGPGGRGPSGPSGPGGAAAAAAAAAGGPGGYGPSQQGPGGYGPSGPGGPGGAAAAAGGPGGYGPGSQGPGGPGASAAAAAASGPGGSGPGGYGPSQQGPGQQGPGGYGPSGPGGASAAAAAAAAAASGPGGNNGYGPGGPGQQGPGGYGPGGSGPGGASAAAAAAGGAGGPGGYGPGGYGPGSQGPSGPGGYGPSSQGPGAAGGAGGPGSQGPYGPGSQGGYGPGGSGPAAAASSSAASGPGGYGPGSQGPSVNAAAAAAGGSGPGGYGPGGYGPGPSGPGGAGAAAAAAAASGPGGYGPGSQGPSGPAGYGPSGLSGPGGAAAAAASGPGGYGPGSQGPSGPRGYSQGLGPGGAASAAAAAAGGPGGYGPANQGPSGSSSGPGGASAAAAAAAGGLGGQGPSGPGSQGPSGPGGYGPGSQGPGGYGPGSQGPGGPGASAAAAAASGPGGYGPGSQGPSGPGSQGPSGPGGYGPGSQGPGGYGPGSQGPSGYGPSGPGGASAAAAAASASGPGGPGSQGPSGPSGPGGYGPGSQGPSGPGGYGPGASAAAAAAASGPGGYGPGSQGPSGPGSQGPSGPGGPGASAAAAAASGPGGYGPGSQGPSGPSGPGGYGPGSQGPSGPGGYGP>


1: Sequence<AAAGAAAAAAAAGGQGGQGGYGSQGAGQGGYGAGQGGAGAAGAAAAAAAAGGAGGSGQGGLLAGGAGQGYGAGLGGQGGDGQGGAGAAASAAAAGGAGGQGGYGGLGSQGAGQGGYGSGGAGAAAAAAAAGGQGGQGGYGSQGAGQG