In [1]:
# read in the doc
def read_preprocess_doc(doc_path, max_words=3072):
    """Read a UTF-8 text file and return it as whitespace-normalised text.

    The file is split on any whitespace and re-joined with single spaces, so
    newlines, tabs and repeated spaces are collapsed. If the document has more
    than ``max_words`` words, a notice is printed and only the first
    ``max_words`` words are kept.

    Note that the limit counts whitespace-separated words, not tokenizer
    tokens; the tokenized length of the result can be larger than the limit.

    Args:
        doc_path: Path of the text file to read.
        max_words: Maximum number of words to keep. ``None`` disables
            truncation. Defaults to 3072.

    Returns:
        The (possibly truncated) document as a single space-separated string.
    """
    with open(doc_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    if max_words is not None and len(words) > max_words:
        print(f'the document {doc_path} is too long, it will be truncated to {max_words} words')
        words = words[:max_words]

    return ' '.join(words)

In [2]:
# Collect the path of every .txt file found anywhere below the data folder.
import os

data_folder_dir = '../data/crawled/crawled_text_data_word_3000_150'

doc_list = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(data_folder_dir)
    for file in files
    if file.endswith('.txt')
]

# Quick look at what was found: a few sample paths and the total count.
print(doc_list[:5])
print(len(doc_list))

# Keep only the first 20 paths for a test run. This is a prefix of the
# os.walk traversal order, not a random sample; os.walk does not sort
# entries, so the selected files may differ between machines.
doc_list = doc_list[:20]

['../data/crawled/crawled_text_data_word_3000_150/70-0.txt', '../data/crawled/crawled_text_data_word_3000_150/49-0.txt', '../data/crawled/crawled_text_data_word_3000_150/60-0.txt', '../data/crawled/crawled_text_data_word_3000_150/21-0.txt', '../data/crawled/crawled_text_data_word_3000_150/113-0.txt']
209


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Checkpoint used for generation; the commented-out id is an alternative
# checkpoint kept for reference.
# model_id = "meta-llama/Llama-2-7b-chat-hf"
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Load the causal LM in float16 and let device_map="auto" decide placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Tokenizer matching the loaded checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad on the left and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [5]:
from transformers import pipeline

# Text-generation pipeline built around the already-loaded model and tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
)

In [6]:
# Prompt appended after each document's text when building the model input.
# It asks for 5~10 question/answer pairs written as "Q: ... A: ...", which is the
# format extract_qa_pairs searches for. This is a normal (non-raw) string, so the
# "\n" sequences below become real newlines.
# NOTE(review): the last sentence asks for the *number* of pairs rather than the
# pairs themselves — confirm this is the intended closing instruction.
INSTRUCTION = """
I have provided a web-crawled document above that may be relevant to one or more topics such as 
general information, history, economy, music, culture, sports, or upcoming events 
related to Pittsburgh or Carnegie Mellon University. Based on this document, 
generate 5~10 factual question and answer pairs that cover different types of inquiries, 
such as time, events, people, locations, or numerical data.
If there are fewer pairs available, only provide the number you can find.
The questions should make sense independently of the document by including key words from the document.
For example, the question "Q: What is the fee of the event?" is not acceptable, 
because words like "the event" is too vague without context of the document.
It should be sufficiently detailed as "Q: What is the fee of the event at PPG Arena on October 13?" 
or "Q: What is the fee of the event that celebrates the 50th anniversary of CMU?".
Please provide concise answers without repeating the question or using complete sentences.
For example, given the question "Q: When was Carnegie Mellon University founded?",
you should only answer "A: 1900".
\n\nExamples: \n
Q: Who is Pittsburgh named after? A: William Pitt \n
Q: What famous machine learning venue had its first conference in Pittsburgh in 1980? A: ICML \n
Q: What musical artist is performing at PPG Arena on October 13? A: Billie Eilish\n\n
Before you start, please read the document above and provide the number of question and answer pairs you can find."""

In [11]:
import re
import pandas as pd
def extract_qa_pairs(generated_text, doc_id):
    """Extract "Q: ... A: ..." pairs from generated text into a DataFrame.

    Each question is matched together with the answer that follows it, so the
    two columns always have the same length. A stray "A:" that appears before
    any "Q:" is ignored instead of causing a length mismatch when the frame is
    built.

    An answer ends just before the next "Q:" (optionally preceded by a list
    number such as "2.") or at the end of the text; the final answer therefore
    includes any text that follows it.

    Args:
        generated_text: Text to search for question/answer pairs.
        doc_id: Identifier stored in the ``Doc_id`` column of every row.

    Returns:
        A pandas DataFrame with columns ``Doc_id``, ``Question`` and
        ``Answer``, one row per extracted pair (empty if none are found).
    """
    # One combined pattern keeps every question aligned with its own answer.
    pair_pattern = re.compile(
        r'Q:\s*(.*?)\s*A:\s*(.*?)(?=\s*(?:\d*\.\s*Q:|Q:|$))',
        re.DOTALL,
    )
    rows = [
        (doc_id, match.group(1), match.group(2))
        for match in pair_pattern.finditer(generated_text)
    ]
    return pd.DataFrame(rows, columns=['Doc_id', 'Question', 'Answer'])

In [14]:
from tqdm import tqdm

# Generate Q/A pairs for every selected document.
# Per-document frames are collected in a list and concatenated once after the
# loop (growing a DataFrame with concat on every iteration copies all earlier
# rows each time). The list also keeps partial results reachable if the run is
# interrupted.
qa_frames = []
for doc in tqdm(doc_list, desc="generating QA pairs"):
    print(doc)
    content = read_preprocess_doc(doc)
    formatted_input = content + "\n\n" + INSTRUCTION + "\n\n" + "Your answer:"

    # The tokenization is only used to log the prompt length, so plain Python
    # lists are enough; no tensors or device transfer are needed.
    tokenized_prompt = tokenizer(formatted_input)
    print(len(tokenized_prompt.input_ids))

    # Generate the response from a single-turn chat containing the prompt.
    messages = [{"role": "user", "content": formatted_input}]
    with torch.no_grad():
        result = pipe(messages, max_new_tokens=512)

    # With chat-style input the pipeline returns the conversation with the
    # generated reply appended, so the reply is the last message.
    reply = result[0]['generated_text'][-1]['content']

    # Use the file name (without directories) as the document id.
    doc_id = os.path.basename(doc)
    qa_frames.append(extract_qa_pairs(reply, doc_id))

# pd.concat raises on an empty list, so fall back to an empty frame.
all_qa_df = pd.concat(qa_frames, ignore_index=True) if qa_frames else pd.DataFrame()

../data/crawled/crawled_text_data_word_3000_150/70-0.txt
2803
../data/crawled/crawled_text_data_word_3000_150/49-0.txt
2719
../data/crawled/crawled_text_data_word_3000_150/60-0.txt
2026
../data/crawled/crawled_text_data_word_3000_150/21-0.txt
1197
../data/crawled/crawled_text_data_word_3000_150/113-0.txt
the document ../data/crawled/crawled_text_data_word_3000_150/113-0.txt is too long, it will be truncated to 3072 tokens
4178
../data/crawled/crawled_text_data_word_3000_150/77-0.txt
1059


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


../data/crawled/crawled_text_data_word_3000_150/13-2.txt
3147
../data/crawled/crawled_text_data_word_3000_150/23-0.txt
642
../data/crawled/crawled_text_data_word_3000_150/2-0.txt
2159
../data/crawled/crawled_text_data_word_3000_150/0-4.txt
the document ../data/crawled/crawled_text_data_word_3000_150/0-4.txt is too long, it will be truncated to 3072 tokens
3978
../data/crawled/crawled_text_data_word_3000_150/45-0.txt
1019
../data/crawled/crawled_text_data_word_3000_150/92-0.txt
2276
../data/crawled/crawled_text_data_word_3000_150/85-0.txt
2646
../data/crawled/crawled_text_data_word_3000_150/115-2.txt
the document ../data/crawled/crawled_text_data_word_3000_150/115-2.txt is too long, it will be truncated to 3072 tokens
4288
../data/crawled/crawled_text_data_word_3000_150/100-0.txt
1162
../data/crawled/crawled_text_data_word_3000_150/150-0.txt
3274
../data/crawled/crawled_text_data_word_3000_150/1-3.txt
the document ../data/crawled/crawled_text_data_word_3000_150/1-3.txt is too long, it w

In [15]:
# Preview the last 20 rows of the collected Q/A pairs as a quick sanity check.
all_qa_df.tail(20)

Unnamed: 0,Doc_id,Question,Answer
232,94-0.txt,What is the name of the museum where the Carne...,Carnegie Museum
233,94-0.txt,What is the name of the neighborhood where the...,North Shore
234,94-0.txt,What is the name of the stadium where the Pitt...,Heinz Field
235,94-0.txt,How many events is Pittsburgh Symphony Orchest...,89
236,94-0.txt,What is the name of the team playing against t...,Cincinnati Reds
237,94-0.txt,What is the name of the venue where the Puscif...,Petersen Events Center
238,94-0.txt,What is the name of the team playing against t...,Milwaukee Brewers
239,94-0.txt,What is the name of the team playing against t...,Milwaukee Brewers
240,94-0.txt,What is the name of the team playing against t...,(Not available)
241,94-0.txt,What is the name of the team playing against t...,(Not available)


In [16]:
# Write the generated pairs to CSV; the DataFrame index is left out of the file.
output_csv_path = '../data/annotated/generated_qa_pairs_3000_test20.csv'
all_qa_df.to_csv(output_csv_path, index=False)