In [1]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn
import copy
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint that supplies both the tokenizer and the causal LM.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Switch to inference mode, then move the weights to the GPU. `cuda()` returns
# the module, so leaving it as the last expression shows the architecture.
model.eval()
model.cuda()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [4]:
import json

# Seed sentences used as story openers. The generation cell iterates this
# object as a list of {transportation_name: [first_sentence, ...]} mappings.
file_path = './data/transportation_sentences.json'

# Decode explicitly as UTF-8 instead of the platform default encoding, so the
# file loads the same way on every OS (the notebook also writes its JSON
# output as UTF-8).
with open(file_path, 'r', encoding='utf-8') as file:
    sentences = json.load(file)

In [5]:
# Hook the output from decoder15 of qwen
# `features` is shared module-level state: the hook overwrites its
# "decoder15" entry on every forward pass of the hooked layer.
features = {}
def forward_hook(module, input, output):
    """Store the hooked layer's output in ``features["decoder15"]``.

    Args:
        module: The module the hook is attached to (unused).
        input: The positional inputs of the module (unused).
        output: The module output. Either a tensor, or a tuple/list whose
            first element is the tensor to capture.

    Side effects:
        Replaces ``features["decoder15"]`` with the captured tensor converted
        to nested Python lists (same nesting as the tensor's dimensions).
    """
    hidden = output[0] if isinstance(output, (tuple, list)) else output
    # `tolist()` already builds a brand-new Python object, so no deepcopy is
    # needed. Calling it on the tensor directly (instead of via `.numpy()`)
    # also works for dtypes NumPy cannot represent, such as bfloat16.
    features["decoder15"] = hidden.detach().cpu().tolist()

# Re-running this cell would otherwise stack another copy of the hook on the
# same layer, so drop the handle left over from a previous run first.
if "hook_handle" in globals():
    hook_handle.remove()
# layers[14] is the 15th decoder layer (zero-based index).
hook_handle = model.model.layers[14].register_forward_hook(forward_hook)

# Generate a story that begins with a specific sentence
# Return generated tokens and corresponding features
def generate_story(query = "I saw a Truck on the way to school.", max_new_tokens = 150, verbose = True):
    """Greedily decode a short story and record one hooked feature per token.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``features``
    objects; ``features["decoder15"]`` must be refreshed by the forward hook
    on every call to ``model``.

    Args:
        query: Sentence the story has to begin with.
        max_new_tokens: Upper bound on the number of decoding steps.
        verbose: When true, print the decoded prompt plus story at the end.

    Returns:
        A list of ``[token_id, feature]`` pairs, one per generated token (the
        end-of-sequence token is included when it is produced). ``feature`` is
        the last-position entry of ``features["decoder15"]`` from the forward
        pass whose logits selected that token, i.e. it is computed before the
        token is appended to the input.
    """
    prompt = f"Give me a short story that begins with: {query} \n\n"
    prompt += "Try to include more transportation-related words in the story.\n\n"
    prompt += "The story should contain at most 50 words"

    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Follow the model's device instead of assuming CUDA.
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(model.device)
    eos_token_id = tokenizer.eos_token_id

    token_feature = []
    # Inference only: avoid building autograd graphs even when the caller
    # did not wrap the call in `torch.no_grad()` itself.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # The whole sequence is re-fed each step (no KV cache), so the
            # hook always captures features for every position so far.
            out = model(input_ids)

            # Greedy choice at the last position; shape (1, 1) for batch size 1.
            next_token_id = torch.argmax(out.logits[:, -1, :], dim=-1).unsqueeze(0)
            token_id = next_token_id.item()

            input_ids = torch.cat([input_ids, next_token_id], dim=1)
            token_feature.append([token_id, features["decoder15"][0][-1]])

            if token_id == eos_token_id:
                break

    if verbose:
        print(tokenizer.decode(input_ids[0], skip_special_tokens=True))

    return token_feature

In [None]:
# Generate 300 stories for 30 types of transportation
# time-consuming
#
# Structure of `stories`:
# {
#     transportation1: {
#         first_sentence1:  [[token1, feature1], [token2, feature2], ...],
#         first_sentence2:  [[token1, feature1], [token2, feature2], ...],
#         ...
#         first_sentence10: [[token1, feature1], [token2, feature2], ...],
#     },
#     transportation2: { ... },
#     ...
#     transportation30: { ... },
# }

stories = {}
# For a quick smoke test, iterate over a slice such as `sentences[:2]` instead.
torch.cuda.empty_cache()
with torch.no_grad():
    for obj in sentences:
        # The inner list gets its own name: rebinding `sentences` here would
        # clobber the loaded dataset and break a re-run of this cell.
        for k, first_sentences in obj.items():
            for sentence in first_sentences:
                token_feature = generate_story(sentence)
                # Create the per-transportation dict on first use.
                stories.setdefault(k, {})[sentence] = token_feature

with open("./data/dataset.json", 'w', encoding='utf-8') as f:
    json.dump(stories, f, indent=4, ensure_ascii=False)

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Give me a short story that begins with: I saw a Car on the way to school. 

Try to include more transportation-related words in the story.

The story should contain at most 50 words
assistant
I saw a car on the way to school. As I drove, I noticed a familiar red car parked in the driveway. The driver was a young man with a friendly demeanor. I parked my car and walked to the school, where I found a group of students chatting and laughing.
system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Give me a short story that begins with: The Car is parked outside the building. 

Try to include more transportation-related words in the story.

The story should contain at most 50 words
assistant
The car was parked outside the building, and the owner was busy with work. The owner was a retired businessman, and he had been living in the building for years. The owner had a passion for vintage

In [None]:
# a story example
file_path = "./data/dataset.json"

# The dataset is written as UTF-8 with ensure_ascii=False, so read it back
# with the same encoding rather than the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    stories = json.load(file)

# Which stored story to display: stories[category][first sentence].
example_category = 'Car'
example_sentence = 'The Car is parked outside the building.'

print(f"Category: {example_category}")
print(f"First sentence: {example_sentence}")

story = stories[example_category][example_sentence]
# Each entry is a [token_id, feature] pair; only the ids are needed here.
example_token_ids = [token[0] for token in story]
print(f"Story: {tokenizer.decode(example_token_ids, skip_special_tokens=True)}")
print(f"Tokens: {example_token_ids}")

Category: Car
First sentence: The Car is parked outside the building.
Story: The car was parked outside the building, and the owner was busy with work. The owner was a retired businessman, and he had been living in the building for years. The owner had a passion for vintage cars, and he had a collection of antique cars that he had inherited from his father. The owner had a passion for history, and he had a collection of old maps and books that he had collected over the years. The owner had a passion for music, and he had a collection of old records and albums that he had collected over the years. The owner had a passion for nature, and he had a collection of old books and magazines that he had collected over the years. The owner had a passion for photography, and he had a collection
Tokens: [785, 1803, 572, 42235, 4889, 279, 4752, 11, 323, 279, 6372, 572, 13028, 448, 975, 13, 576, 6372, 572, 264, 21583, 46278, 11, 323, 566, 1030, 1012, 5382, 304, 279, 4752, 369, 1635, 13, 576, 6372, 10