In [None]:
# add Hugging Face login (alternative: read the token from a .env file)
# from huggingface_hub import login
# from dotenv import load_dotenv
# import os

# load_dotenv()
# login(token=os.getenv('HUGGINGFACE_TOKEN')) # add the token to an .env file

In [20]:
import torch

# Report whether PyTorch can see a CUDA device, naming device 0 when it can.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)

GPU is available: NVIDIA A10G


In [1]:
from huggingface_hub import login
import getpass

# Ask for the Hugging Face access token interactively. getpass does not echo
# the typed characters, so the token never appears in the notebook output.
token = getpass.getpass("Enter your Hugging Face token: ")

# Authenticate this session with the Hugging Face Hub using the entered token.
# NOTE(review): presumably required because the meta-llama checkpoint loaded
# later in this notebook is access-restricted -- confirm.
login(token=token)

Enter your Hugging Face token:  ········


In [21]:
# read in the doc
def read_preprocess_doc(doc_path, max_words=3072):
    """Read a UTF-8 text file, collapse its whitespace and cap its length.

    The file content is split on any whitespace and re-joined with single
    spaces, so newlines and runs of blanks are normalised away.

    Args:
        doc_path: Path of the text file to read.
        max_words: Maximum number of whitespace-separated words to keep.
            Longer documents are truncated and a notice is printed. Pass
            ``None`` to disable truncation. Note that the limit counts words,
            not model tokens, even though the printed notice says "tokens".

    Returns:
        The (possibly truncated) document as a single space-separated string.
    """
    with open(doc_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    if max_words is not None and len(words) > max_words:
        print(f'the document {doc_path} is too long, it will be truncated to {max_words} tokens')
        words = words[:max_words]

    return ' '.join(words)

In [22]:
import os

# Root folder of the crawled text shards; every .txt file below it is collected.
data_folder_dir = '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072'

# Walk the directory tree and keep the full path of each .txt file found.
doc_list = [
    os.path.join(parent_dir, file_name)
    for parent_dir, _sub_dirs, file_names in os.walk(data_folder_dir)
    for file_name in file_names
    if file_name.endswith('.txt')
]

# Quick sanity check: a few sample paths and the total number of documents.
print(doc_list[:5])
print(len(doc_list))

# # Optionally keep only the first 20 docs for a quick test run
# doc_list = doc_list[:20]

['../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-0-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-1-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-10-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-11-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-12-0.txt']
760


In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load the causal language model used to generate the QA pairs.
# The weights are loaded in half precision (float16) and device_map="auto"
# leaves the placement of the weights to the library.
# NOTE(review): the recorded run of this cell warned that some parameters were
# offloaded to the CPU, and a later cell hit a CUDA out-of-memory error --
# check free GPU memory before starting a long generation run.
# model_id = "meta-llama/Llama-2-7b-chat-hf"
model_id = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [13]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably because this tokenizer ships without a pad token -- confirm.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left side of the sequence.
tokenizer.padding_side = "left"

In [14]:
from transformers import pipeline
# Wrap the already loaded model and tokenizer in a text-generation pipeline;
# the generation loop further down calls `pipe` with chat-style messages.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.float16)

In [15]:
# Prompt appended after each document's text when the model input is built
# (the document comes first, which is why the prompt refers to the "document
# above"). It asks for factual question/answer pairs written as "Q: ... A: ...",
# the format that extract_qa_pairs parses. The "\n" sequences below are real
# escape sequences (the string is not raw), so they become newlines.
INSTRUCTION = """
I have provided a web-crawled document above that may be relevant to one or more topics such as 
general information, history, economy, music, culture, sports, or upcoming events 
related to Pittsburgh or Carnegie Mellon University. Based on this document, 
generate 5~10 factual question and answer pairs that cover different types of inquiries, 
such as time, events, people, locations, or numerical data.
If there are fewer pairs available, only provide the number you can find.
The questions should make sense independently of the document by including key words from the document.
For example, the question "Q: What is the fee of the event?" is not acceptable, 
because words like "the event" is too vague without context of the document.
It should be sufficiently detailed as "Q: What is the fee of the event at PPG Arena on October 13?" 
or "Q: What is the fee of the event that celebrates the 50th anniversary of CMU?".
Please provide concise answers without repeating the question or using complete sentences.
For example, given the question "Q: When was Carnegie Mellon University founded?",
you should only answer "A: 1900".
\n\nExamples: \n
Q: Who is Pittsburgh named after? A: William Pitt \n
Q: What famous machine learning venue had its first conference in Pittsburgh in 1980? A: ICML \n
Q: What musical artist is performing at PPG Arena on October 13? A: Billie Eilish\n\n
Before you start, please read the document above and provide the number of question and answer pairs you can find."""

In [16]:
import re
import pandas as pd
def extract_qa_pairs(generated_text, doc_id):
    """Parse "Q: ... A: ..." pairs out of generated text into a DataFrame.

    Each question and its answer are captured by a single regular expression,
    so the two columns always have the same length. Extracting questions and
    answers with two independent patterns can yield lists of different sizes
    (for example when the text contains a stray "A:" without a preceding
    "Q:"), which makes the DataFrame constructor raise ValueError.

    Args:
        generated_text: Text produced by the model.
        doc_id: Identifier of the source document; repeated on every row.

    Returns:
        A DataFrame with the columns 'Doc_id', 'Question' and 'Answer', one
        row per extracted pair (zero rows when no pair is found).
    """
    # A question runs from "Q:" up to the next "A:"; its answer runs from
    # there up to the next (optionally numbered) "Q:" or the end of the text.
    # DOTALL lets questions and answers span several lines.
    pairs = re.findall(
        r'Q:\s*(.*?)\s*A:\s*(.*?)(?=\s*(?:\d*\.\s*Q:|Q:|$))',
        generated_text,
        re.DOTALL,
    )

    questions = [question for question, _ in pairs]
    answers = [answer for _, answer in pairs]

    return pd.DataFrame({
        'Doc_id': [doc_id] * len(pairs),
        'Question': questions,
        'Answer': answers
    })

In [27]:
# Release cached GPU memory that PyTorch is holding but no longer using,
# before the generation loop starts.
torch.cuda.empty_cache()

In [26]:
import os

import pandas as pd
import torch
from tqdm import tqdm

# Load the QA pairs written by earlier runs so finished documents can be skipped.
processed_qa_df_1 = pd.read_csv('../data/annotated/generated_qa_pairs_1.csv')
processed_qa_df_2 = pd.read_csv('../data/annotated/new_generated_qa_pairs_2.csv')
processed_qa_df = pd.concat([processed_qa_df_1, processed_qa_df_2], axis=0)

# File names of the already processed documents (column 'Doc_id').
# A set gives constant-time membership checks inside the loop.
processed_doc_ids = set(processed_qa_df['Doc_id'].unique())

# Per-document result frames are collected in a list and concatenated once;
# concatenating inside the loop copies all previous rows on every iteration.
qa_frames = []
all_qa_df = pd.DataFrame()

try:
    for doc in tqdm(doc_list):
        # The document id is the file name of the document path.
        doc_id = os.path.basename(doc)

        # Skip documents that already have QA pairs from an earlier run.
        if doc_id in processed_doc_ids:
            print(f"Skipping already processed document: {doc_id}")
            continue

        # Build the prompt: document text first, then the instruction.
        print(f"Processing document: {doc_id}")
        content = read_preprocess_doc(doc)
        formatted_input = content + "\n\n" + INSTRUCTION + "\n\n" + "Your answer:"

        # Prompt length in tokens, for information only. It is counted on the
        # CPU: the pipeline tokenizes the messages itself, so moving these
        # ids to the GPU would only use up device memory.
        prompt_token_count = len(tokenizer(formatted_input).input_ids)
        print(prompt_token_count)

        # Generate the response
        messages = [
            {"role": "user", "content": formatted_input},
        ]
        with torch.no_grad():
            result = pipe(messages, max_new_tokens=512)

        # The pipeline returns the conversation with the model's reply
        # appended; with a single user message the reply sits at index 1.
        qa_frames.append(
            extract_qa_pairs(result[0]['generated_text'][1]['content'], doc_id)
        )
finally:
    # Build the combined frame even when the loop is interrupted (for example
    # by a CUDA out-of-memory error), so the pairs generated so far stay
    # available in `all_qa_df`.
    if qa_frames:
        all_qa_df = pd.concat(qa_frames, axis=0, ignore_index=True)

  0%|                                                                  | 0/760 [00:00<?, ?it/s]

Skipping already processed document: 0-0-0.txt
Skipping already processed document: 0-1-0.txt
Skipping already processed document: 0-10-0.txt
Skipping already processed document: 0-11-0.txt
Skipping already processed document: 0-12-0.txt
Skipping already processed document: 0-13-0.txt
Skipping already processed document: 0-14-0.txt
Skipping already processed document: 0-15-0.txt
Skipping already processed document: 0-16-0.txt
Skipping already processed document: 0-17-0.txt
Skipping already processed document: 0-18-0.txt
Skipping already processed document: 0-19-0.txt
Skipping already processed document: 0-2-0.txt
Skipping already processed document: 0-20-0.txt
Skipping already processed document: 0-21-0.txt
Skipping already processed document: 0-22-0.txt
Skipping already processed document: 0-23-0.txt
Skipping already processed document: 0-24-0.txt
Skipping already processed document: 0-25-0.txt
Skipping already processed document: 0-26-0.txt
Skipping already processed document: 0-27-0

  7%|███▉                                                     | 53/760 [00:40<09:00,  1.31it/s]

Skipping already processed document: 1-14-0.txt
Skipping already processed document: 1-15-0.txt
Skipping already processed document: 1-16-0.txt
Skipping already processed document: 1-17-0.txt
Processing document: 1-17-1.txt
334


  8%|████▎                                                    | 58/760 [01:33<22:30,  1.92s/it]

Skipping already processed document: 1-18-0.txt
Skipping already processed document: 1-19-0.txt
Skipping already processed document: 1-2-0.txt
Skipping already processed document: 1-3-0.txt
Skipping already processed document: 1-4-0.txt
Skipping already processed document: 1-5-0.txt
Skipping already processed document: 1-6-0.txt
Skipping already processed document: 1-7-0.txt
Skipping already processed document: 1-8-0.txt
Skipping already processed document: 1-9-0.txt
Skipping already processed document: 10-0-0.txt
Skipping already processed document: 10-1-0.txt
Processing document: 10-2-0.txt
1513


  9%|█████▎                                                   | 70/760 [01:37<16:05,  1.40s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 

In [18]:
# Preview the first rows of the newly generated QA pairs.
all_qa_df.head()

Unnamed: 0,Doc_id,Question,Answer
0,10-0-0.txt,What is the name of the cul-de-sac located in ...,Roslyn Place
1,10-0-0.txt,What is the name of the street with the larges...,East Carson Street
2,10-0-0.txt,What is the name of the highway connecting Pit...,I-376
3,10-0-0.txt,What is the name of the highway that runs nort...,I-279
4,10-0-0.txt,What is the name of the belt system in Alleghe...,Allegheny County Belt System


In [19]:
import pandas as pd

# Save the new QA pairs back to the CSV file (optional).
# This file is also read earlier in the notebook as a record of already
# processed documents, and `all_qa_df` only holds the pairs generated in this
# session. Writing it directly would discard the earlier pairs (and leave an
# empty file when every document was skipped), so the new pairs are merged
# with the rows already on disk instead.
output_path = '../data/annotated/new_generated_qa_pairs_2.csv'

try:
    existing_qa_df = pd.read_csv(output_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    existing_qa_df = pd.DataFrame()

if all_qa_df.empty:
    merged_qa_df = existing_qa_df
elif existing_qa_df.empty:
    merged_qa_df = all_qa_df
else:
    # Drop rows on disk that belong to documents regenerated in this session,
    # so running this cell more than once does not duplicate them.
    is_stale = existing_qa_df['Doc_id'].isin(all_qa_df['Doc_id'])
    merged_qa_df = pd.concat(
        [existing_qa_df[~is_stale], all_qa_df], axis=0, ignore_index=True
    )

merged_qa_df.to_csv(output_path, index=False)