# Decider 생성을 위한 데이터셋 생성 과정

1. 데이터셋에서 일부를 추출(100개 예정)
2. 사용할 언어 모델 정하기 (Llama-3.1-8B-Instruct)
3. 해당 언어 모델 토크나이저로 json 출력 분할
4. 각 상태가 변하는 순간을 레이블

레이블해주어야 할 것은 각 상태가 변하는 순간에 어떤 질문이 들어가는지와 그때의 답.

# 데이터셋 추출

In [1]:
import pandas as pd

# Read the preprocessed training CSV into `df`; the next cell samples from it.
train_csv_path = "/workspace/datas/few-nerd/supervised/train.preprocessed.csv"
df = pd.read_csv(filepath_or_buffer=train_csv_path)

In [2]:
# Draw the 100-row subset used to build the Decider dataset. A fixed
# random_state makes the draw repeatable, so re-running the notebook from a
# fresh kernel regenerates the same sample (and the same downstream CSVs).
df_100 = df.sample(n=100, random_state=42)

In [3]:
# Persist the sampled rows (index column omitted) so later cells can reload them.
sampled_csv_path = "/workspace/datas/few-nerd/supervised/train.sampled.csv"
df_100.to_csv(sampled_csv_path, index=False)

# 토크나이저로 json 분할

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer whose token boundaries define each simulated generation step.
tokenizer_checkpoint = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
import pandas as pd

# Reload the saved sample from disk so the following cells work on the persisted rows.
sampled_csv_path = "/workspace/datas/few-nerd/supervised/train.sampled.csv"
df_100 = pd.read_csv(filepath_or_buffer=sampled_csv_path)

In [7]:
# Tokenize the NER string of the first row without adding special tokens.
# Use positional access (.iloc) rather than the label 0: a label lookup raises
# KeyError when df_100 still carries the shuffled index produced by sampling.
token_id_list = tokenizer.encode(df_100["NER"].iloc[0], add_special_tokens=False)

token id list의 앞에서부터 토큰을 하나씩 늘려 가며(prefix 슬라이싱) decoding하면, JSON이 토큰 단위로 생성되는 과정을 시뮬레이션할 수 있다.

In [33]:
# Tokenize every NER string without special tokens; entry i holds the ids for row i.
token_id_list = [
    tokenizer.encode(ner_text, add_special_tokens=False)
    for ner_text in df_100["NER"]
]

In [34]:
# Longest tokenized NER string, measured in tokens.
max_length = max(map(len, token_id_list))

print(f"Max length of tokenized NER: {max_length}")

Max length of tokenized NER: 66


In [35]:
# Simulate token-by-token generation for the first sample: decode each
# successively longer token prefix into the text produced up to that step.
token_list = token_id_list[0]

generation_list = [
    tokenizer.decode(token_list[:prefix_len], skip_special_tokens=True)
    for prefix_len in range(1, len(token_list) + 1)
]

In [9]:
# Show the decoded prefixes built for the first sample, one entry per token.
generation_list

["{'",
 "{'other",
 "{'other':",
 "{'other': ['",
 "{'other': ['Mah",
 "{'other': ['Mahab",
 "{'other': ['Mahabhar",
 "{'other': ['Mahabharata",
 "{'other': ['Mahabharata'],",
 "{'other': ['Mahabharata'], '",
 "{'other': ['Mahabharata'], 'location",
 "{'other': ['Mahabharata'], 'location':",
 "{'other': ['Mahabharata'], 'location': ['",
 "{'other': ['Mahabharata'], 'location': ['K",
 "{'other': ['Mahabharata'], 'location': ['Kich",
 "{'other': ['Mahabharata'], 'location': ['Kichaka",
 "{'other': ['Mahabharata'], 'location': ['Kichaka',",
 "{'other': ['Mahabharata'], 'location': ['Kichaka', '",
 "{'other': ['Mahabharata'], 'location': ['Kichaka', 'Sh",
 "{'other': ['Mahabharata'], 'location': ['Kichaka', 'Shail",
 "{'other': ['Mahabharata'], 'location': ['Kichaka', 'Shailoda",
 "{'other': ['Mahabharata'], 'location': ['Kichaka', 'Shailoda']}"]

In [18]:
# Number of simulated generation steps for the first sample.
len(generation_list)

34

In [19]:
# Number of tokens in the first sample; should equal len(generation_list),
# since one prefix is decoded per token.
len(token_list)

34

In [10]:
# Display the sampled DataFrame (the rendered output truncates the middle rows).
df_100

Unnamed: 0,Sentence,NER
0,The Mahabharata refers to the Kichaka bamboos ...,"{'other': ['Mahabharata'], 'location': ['Kicha..."
1,"In 2007 , he was one of the nominees of BATAS ...",{'organization': ['BATAS']}
2,"In late mitosis , Cdc6 protein joins the bound...","{'other': ['Cdc6', 'ORC', 'Cdt1-Mcm2-7']}"
3,In the times of Khmelnytsky Uprising it was ta...,"{'event': ['Khmelnytsky Uprising'], 'organizat..."
4,"At the end of its run , the film 's final dome...",{'other': ['$']}
...,...,...
95,"During Winter 2015 , Adekugbe was invited to t...","{'person': ['Adekugbe'], 'location': ['English..."
96,Someone who breaks the law should be punished ...,{}
97,Domela Nieuwenhuis was elected to the House of...,"{'person': ['Domela Nieuwenhuis'], 'organizati..."
98,"Under third-year head coach Glenn Martin , the...",{'person': ['Glenn Martin']}


In [36]:
# Process at most 100 samples, but never index past the rows that were
# actually tokenized (avoids IndexError when fewer rows are available).
sample_num = min(100, len(token_id_list))

generation_list = []
sentence_list = []
for idx in range(sample_num):
    token_list = token_id_list[idx]

    # Decode every token prefix of this sample: element k is the text
    # produced after k + 1 tokens.
    sample_generations = [
        tokenizer.decode(token_list[:gen_idx + 1], skip_special_tokens=True)
        for gen_idx in range(len(token_list))
    ]

    # Repeat the final generation once more to represent the state transition
    # from S_value^f to S_e. It is taken from this sample's own steps (empty
    # string if the sample has no tokens) so it can never pick up the previous
    # sample's last generation.
    final_generation = sample_generations[-1] if sample_generations else ""
    generation_list.extend(sample_generations)
    generation_list.append(final_generation)

    # One sentence entry per generation row, including the repeated final row.
    sentence_list.extend(
        [df_100["Sentence"].iloc[idx]] * (len(sample_generations) + 1)
    )

In [37]:
# Pair each generation prefix with its source sentence, one row per step.
labeled_columns = {"sentence": sentence_list, "generation": generation_list}
df_100_labeled = pd.DataFrame(data=labeled_columns)

In [40]:
# Preview the first 30 (sentence, generation-prefix) rows.
df_100_labeled.head(30)

Unnamed: 0,sentence,generation
0,The Mahabharata refers to the Kichaka bamboos ...,{'
1,The Mahabharata refers to the Kichaka bamboos ...,{'other
2,The Mahabharata refers to the Kichaka bamboos ...,{'other':
3,The Mahabharata refers to the Kichaka bamboos ...,{'other': ['
4,The Mahabharata refers to the Kichaka bamboos ...,{'other': ['Mah
5,The Mahabharata refers to the Kichaka bamboos ...,{'other': ['Mahab
6,The Mahabharata refers to the Kichaka bamboos ...,{'other': ['Mahabhar
7,The Mahabharata refers to the Kichaka bamboos ...,{'other': ['Mahabharata
8,The Mahabharata refers to the Kichaka bamboos ...,"{'other': ['Mahabharata'],"
9,The Mahabharata refers to the Kichaka bamboos ...,"{'other': ['Mahabharata'], '"


In [41]:
# Write the step-level table (index column omitted) for the labeling stage.
labeled_csv_path = "/workspace/datas/few-nerd/supervised/train.sampled.labeled.csv"
df_100_labeled.to_csv(labeled_csv_path, index=False)

# Labeling하기

In [32]:
# Start with two empty, independent accumulators for prompts and answers.
prompt_list, answer_list = [], []

In [37]:
# Print the number of step-level rows in the table.
print(len(df_100_labeled.index))

3017


In [34]:
# Show the first ten step-level rows.
df_100_labeled.head(10)

Unnamed: 0,sentence,generation
0,what s the name of tim burton s movie about a ...,{'
1,what s the name of tim burton s movie about a ...,{'Actor
2,what s the name of tim burton s movie about a ...,{'Actor':
3,what s the name of tim burton s movie about a ...,{'Actor': ['
4,what s the name of tim burton s movie about a ...,{'Actor': ['k
5,what s the name of tim burton s movie about a ...,{'Actor': ['kurt
6,what s the name of tim burton s movie about a ...,{'Actor': ['kurt russ
7,what s the name of tim burton s movie about a ...,{'Actor': ['kurt russell
8,what s the name of tim burton s movie about a ...,"{'Actor': ['kurt russell'],"
9,what s the name of tim burton s movie about a ...,"{'Actor': ['kurt russell'], '"


In [35]:
# Show the full, untruncated sentence of the first row.
df_100_labeled["sentence"].iloc[0]

'what s the name of tim burton s movie about a boy and his recently dead dog'

채워야 할 레이블: 현재 상태, 다음 상태, 이진 분류 답

In [None]:
# Both templates share the same context header: {0} is the source sentence and
# {1} is the generation produced so far.
_prompt_header = "sentence: {0} \n generation: {1}\n In this context, "

# Template A: asks whether any further entities remain (uses {0} and {1} only).
prompt_format_a = _prompt_header + "there are other entities?"

# Template B: asks whether further entities of type {2} remain, given that the
# entity currently tagged with that type is {3}.
prompt_format_b = (
    _prompt_header
    + "the entity tagged as {2} is {3}, there are other {2} entities?"
)

In [None]:
# Example source sentence used to preview the prompt templates.
sentence = "Lost Kingdom Adventure is a Sally Corporation Interactive Dark Ride located at four Legoland theme parks around the world ."

# Example partial generation, cut off in the middle of an entity string.
# NOTE(review): these entities do not occur in `sentence` — presumably
# placeholder values for the preview; confirm.
generation = "{'product': ['rugby'], 'person': ['François Chambell"

# Entity type used in the example prompt.
entity_type = "person"

# Text of the entity generated so far for `entity_type`.
# NOTE(review): this ends at "Chamb" while `generation` ends at "Chambell", and
# it includes the leading " ['" — confirm that is intended.
cur_entities = " ['François Chamb"

In [58]:
# Preview the entity-specific prompt. prompt_format_b is the template that has
# the {2}/{3} placeholders for the entity type and current entity text;
# prompt_format_a only has {0}/{1} and would silently drop the last two arguments.
prompt_format_b.format(sentence, generation, entity_type, cur_entities)

"sentence: Lost Kingdom Adventure is a Sally Corporation Interactive Dark Ride located at four Legoland theme parks around the world . \n generation: {'product': ['rugby'], 'person': ['François Chamb\n In this context, the entity tagged as person is  ['François Chamb, there are other person entities?"

In [42]:
# Preview the generic prompt. prompt_format_a only needs the sentence and the
# generation; prompt_format_b also requires {2}/{3} and raises IndexError when
# given just these two arguments.
prompt_format_a.format(sentence, generation)

"sentence: Lost Kingdom Adventure is a Sally Corporation Interactive Dark Ride located at four Legoland theme parks around the world . \n generation: {'product':\n In this context, there are other entities?"