### Imports

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path) 

In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

[2023-07-20 18:12:39,846] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


### Setup

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
QUESTION_INDICATOR = "Question: "
SEPARATOR = "\n "  # ?
ANSWER_INDICATOR = "Answer:"

### Load Model

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained(
        'gpt2'
)

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### Load TruthfulQA

In [8]:
truthful_qa = pd.read_csv("../data/raw/TruthfulQA.csv")

### Create Dataset

In [9]:
qa_pairs = []

for idx, question in truthful_qa["Question"].items():
    question_prompt = QUESTION_INDICATOR + question + SEPARATOR + ANSWER_INDICATOR
    model_inputs = tokenizer(
        question_prompt, 
        return_tensors='pt'
    ).to(device)

    greedy_output = model.generate(
        **model_inputs, 
        max_new_tokens=40,
        num_beams=1,
        early_stopping=True
    )
    qa_pairs.append(tokenizer.decode(greedy_output[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [10]:
trimmed_qa_pairs = []

for qa_pair in qa_pairs:
    split_qa = qa_pair.split(".")
    if len(split_qa) > 1:
        trimmed_qa_pairs.append(".".join(split_qa[:-1]))
    else:
        trimmed_qa_pairs.append(qa_pair)

In [11]:
pd.DataFrame({"Prompt": trimmed_qa_pairs, "Label": [0] * len(trimmed_qa_pairs)}).to_csv(
    "../data/processed/TruthfulQA_gpt2_answers.csv", index=False
)