In [None]:
# Optional: Hugging Face login using a token stored in a .env file
# from huggingface_hub import login
# from dotenv import load_dotenv
# import os

# load_dotenv()
# login(token=os.getenv('HUGGINGFACE_TOKEN')) # add the token to an .env file

In [1]:
import torch

# Report whether PyTorch can see a CUDA device, naming device 0 when it can
gpu_status = (
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)
print(gpu_status)

GPU is available: NVIDIA A10G


In [2]:
from huggingface_hub import login
import getpass

# Ask for the Hugging Face access token through getpass so the value is typed
# into a hidden prompt instead of being hard-coded in the notebook
token = getpass.getpass("Enter your Hugging Face token: ")

# Authenticate with the Hugging Face Hub using the token entered above
# NOTE(review): presumably needed to download the gated Llama checkpoint — confirm
login(token=token)

Enter your Hugging Face token:  ········


In [3]:
# Read one document from disk and normalise/truncate it for prompting
def read_preprocess_doc(doc_path, max_words=3072):
    """Read a UTF-8 text file and return it as one whitespace-normalised string.

    The text is split on any run of whitespace and re-joined with single
    spaces. If the document has more than ``max_words`` whitespace-separated
    words, only the first ``max_words`` are kept and a notice is printed.

    The limit counts whitespace-separated words, not model tokens.

    Args:
        doc_path: Path of the text file to read.
        max_words: Maximum number of words to keep. Defaults to 3072, the
            limit that used to be hard-coded.

    Returns:
        The (possibly truncated) document text as a single string.
    """
    with open(doc_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    if len(words) > max_words:
        print(f'the document {doc_path} is too long, it will be truncated to {max_words} words')
        words = words[:max_words]
    return ' '.join(words)

In [4]:
# Collect the path of every .txt document under the data folder (recursively)
import os

data_folder_dir = '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072'

# os.walk yields entries in a file-system-dependent order, so sort the result to
# make the processing order (and any slicing of doc_list) reproducible across runs.
doc_list = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(data_folder_dir)
    for file in files
    if file.endswith('.txt')
)

print(doc_list[:5])
print(len(doc_list))

# # quick test: keep only the first 20 docs
# doc_list = doc_list[:20]

['../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-0-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-1-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-10-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-11-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-12-0.txt']
760


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Checkpoint to load; the commented id is the previously used Llama 2 chat model
# model_id = "meta-llama/Llama-2-7b-chat-hf"
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Load the causal LM weights in float16 with device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Tokenizer matching the checkpoint selected by `model_id`
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad on the left and reuse the end-of-sequence token as the padding token
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [7]:
from transformers import pipeline
# Wrap the already-loaded model and tokenizer in a text-generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
)

In [8]:
# Prompt text that the generation loop appends after each document's content
# (hence "document above" in the wording). It asks for 5~10 self-contained
# factual question/answer pairs written as "Q: ... A: ...", the layout that
# extract_qa_pairs parses. The "\n" escapes below are interpreted by Python and
# add extra line breaks around the examples.
INSTRUCTION = """
I have provided a web-crawled document above that may be relevant to one or more topics such as 
general information, history, economy, music, culture, sports, or upcoming events 
related to Pittsburgh or Carnegie Mellon University. Based on this document, 
generate 5~10 factual question and answer pairs that cover different types of inquiries, 
such as time, events, people, locations, or numerical data.
If there are fewer pairs available, only provide the number you can find.
The questions should make sense independently of the document by including key words from the document.
For example, the question "Q: What is the fee of the event?" is not acceptable, 
because words like "the event" is too vague without context of the document.
It should be sufficiently detailed as "Q: What is the fee of the event at PPG Arena on October 13?" 
or "Q: What is the fee of the event that celebrates the 50th anniversary of CMU?".
Please provide concise answers without repeating the question or using complete sentences.
For example, given the question "Q: When was Carnegie Mellon University founded?",
you should only answer "A: 1900".
\n\nExamples: \n
Q: Who is Pittsburgh named after? A: William Pitt \n
Q: What famous machine learning venue had its first conference in Pittsburgh in 1980? A: ICML \n
Q: What musical artist is performing at PPG Arena on October 13? A: Billie Eilish\n\n
Before you start, please read the document above and provide the number of question and answer pairs you can find."""

In [9]:
import re
import pandas as pd
def extract_qa_pairs(generated_text, doc_id):
    """Parse "Q: ... A: ..." pairs out of generated text into a DataFrame.

    Each question is matched together with its own answer by a single regular
    expression, so the Question and Answer columns always have the same length
    and stay aligned. Matching them with two independent patterns made
    ``pd.DataFrame`` raise ``ValueError`` whenever the text contained a stray
    "A:" outside a pair (for example a "Q&A:" heading).

    Args:
        generated_text: Text containing zero or more pairs, optionally
            numbered ("1. Q: ... A: ...").
        doc_id: Identifier stored in the ``Doc_id`` column of every row.

    Returns:
        DataFrame with columns ``Doc_id``, ``Question`` and ``Answer``, one row
        per pair; it has those columns and no rows when nothing matches.
    """
    pair_pattern = re.compile(
        # optional "1." prefix, then the question text up to its "A:"
        r'(?:\d*\.\s*)?Q:\s*(.*?)\s*A:'
        # the answer text up to the next (optionally numbered) "Q:" or end of text
        r'\s*(.*?)(?=\s*(?:\d*\.\s*Q:|Q:|$))',
        re.DOTALL,
    )
    # With two groups, findall returns a list of (question, answer) tuples
    pairs = pair_pattern.findall(generated_text)

    return pd.DataFrame({
        'Doc_id': [doc_id] * len(pairs),
        'Question': [question for question, _ in pairs],
        'Answer': [answer for _, answer in pairs],
    })

In [27]:
# Ask PyTorch to release cached, currently unused GPU memory
torch.cuda.empty_cache()

In [None]:
# import pandas as pd
import os

from tqdm import tqdm

# Load the QA pairs saved by earlier runs to see which documents are already done
processed_qa_df_1 = pd.read_csv('../data/annotated/generated_qa_pairs_1.csv')
processed_qa_df_2 = pd.read_csv('../data/annotated/new_generated_qa_pairs_2.csv')
processed_qa_df = pd.concat([processed_qa_df_1, processed_qa_df_2], axis=0)

# IDs (file names) of already processed documents, taken from the 'Doc_id' column.
# A set gives constant-time membership tests inside the loop.
processed_doc_ids = set(processed_qa_df['Doc_id'].unique())

# Per-document QA DataFrames; they are concatenated once at the end instead of
# re-copying a growing DataFrame on every iteration.
qa_frames = []

try:
    for doc in tqdm(doc_list):
        # The document ID is the file name, which is what the CSVs store in 'Doc_id'
        doc_id = os.path.basename(doc)

        if doc_id in processed_doc_ids:
            print(f"Skipping already processed document: {doc_id}")
            continue  # Skip the file if it has already been processed

        print(f"Processing document: {doc_id}")
        content = read_preprocess_doc(doc)
        formatted_input = content + "\n\n" + INSTRUCTION + "\n\n" + "Your answer:"

        # The token count is only printed for monitoring, so the tensors stay on
        # the CPU; the pipeline tokenizes the chat messages itself.
        tokenized_prompt = tokenizer(formatted_input, return_tensors="pt")
        print(tokenized_prompt.input_ids.size(1))

        # Generate the response for a single-turn chat
        messages = [
            {"role": "user", "content": formatted_input},
        ]
        with torch.no_grad():
            result = pipe(messages, max_new_tokens=512)

        # The last message of the returned conversation is the model's reply
        qa_df = extract_qa_pairs(result[0]['generated_text'][-1]['content'], doc_id)
        qa_frames.append(qa_df)
finally:
    # Build the result even when the loop is interrupted or fails part-way, so
    # the pairs generated so far remain available in all_qa_df.
    all_qa_df = (
        pd.concat(qa_frames, ignore_index=True) if qa_frames else pd.DataFrame()
    )

  0%|                                                                  | 0/760 [00:00<?, ?it/s]

Skipping already processed document: 0-0-0.txt
Skipping already processed document: 0-1-0.txt
Skipping already processed document: 0-10-0.txt
Skipping already processed document: 0-11-0.txt
Skipping already processed document: 0-12-0.txt
Skipping already processed document: 0-13-0.txt
Skipping already processed document: 0-14-0.txt
Skipping already processed document: 0-15-0.txt
Skipping already processed document: 0-16-0.txt
Skipping already processed document: 0-17-0.txt
Skipping already processed document: 0-18-0.txt
Skipping already processed document: 0-19-0.txt
Skipping already processed document: 0-2-0.txt
Skipping already processed document: 0-20-0.txt
Skipping already processed document: 0-21-0.txt
Skipping already processed document: 0-22-0.txt
Skipping already processed document: 0-23-0.txt
Skipping already processed document: 0-24-0.txt
Skipping already processed document: 0-25-0.txt
Skipping already processed document: 0-26-0.txt
Skipping already processed document: 0-27-0

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  9%|█████▎                                                   | 71/760 [00:12<02:04,  5.52it/s]

Processing document: 10-3-0.txt
1460


  9%|█████▍                                                   | 72/760 [00:29<05:53,  1.94it/s]

Processing document: 10-4-0.txt
1288


 10%|█████▍                                                   | 73/760 [00:38<08:32,  1.34it/s]

Processing document: 10-5-0.txt
1408


 10%|█████▌                                                   | 74/760 [00:48<12:14,  1.07s/it]

Processing document: 100-0-0.txt
1162


 10%|█████▋                                                   | 75/760 [00:58<17:29,  1.53s/it]

Processing document: 101-0-0.txt
1079


 10%|█████▋                                                   | 76/760 [01:07<23:35,  2.07s/it]

Processing document: 102-0-0.txt
788


 10%|█████▊                                                   | 77/760 [01:14<28:38,  2.52s/it]

Processing document: 103-0-0.txt
1046


 10%|█████▊                                                   | 78/760 [01:23<36:28,  3.21s/it]

Processing document: 104-0-0.txt
997


 10%|█████▉                                                   | 79/760 [01:31<43:34,  3.84s/it]

Processing document: 105-0-0.txt
1557


 11%|██████                                                   | 80/760 [01:42<57:31,  5.08s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processing document: 105-1-0.txt
750


 11%|█████▊                                                 | 81/760 [01:50<1:02:53,  5.56s/it]

Processing document: 106-0-0.txt
933


 11%|█████▉                                                 | 82/760 [01:58<1:09:04,  6.11s/it]

Processing document: 107-0-0.txt
1464


 11%|██████                                                 | 83/760 [02:11<1:26:08,  7.63s/it]

Processing document: 107-1-0.txt
799


 11%|██████                                                 | 84/760 [02:19<1:27:53,  7.80s/it]

Processing document: 108-0-0.txt
1481


 11%|██████▏                                                | 85/760 [02:30<1:38:36,  8.76s/it]

Processing document: 108-1-0.txt
1445


 11%|██████▏                                                | 86/760 [02:40<1:39:52,  8.89s/it]

Processing document: 108-2-0.txt
1555


 11%|██████▎                                                | 87/760 [02:54<1:55:40, 10.31s/it]

Processing document: 109-0-0.txt
1493


 12%|██████▎                                                | 88/760 [03:08<2:07:23, 11.37s/it]

Processing document: 109-1-0.txt
1501


 12%|██████▍                                                | 89/760 [03:21<2:12:26, 11.84s/it]

Processing document: 11-0-0.txt
1737


 12%|██████▌                                                | 90/760 [03:43<2:48:36, 15.10s/it]

Processing document: 11-1-0.txt
1250


 12%|██████▌                                                | 91/760 [04:02<3:00:35, 16.20s/it]

Processing document: 110-0-0.txt
1488


 12%|██████▋                                                | 92/760 [04:15<2:49:33, 15.23s/it]

Processing document: 110-1-0.txt
1491


 12%|██████▋                                                | 93/760 [04:26<2:34:32, 13.90s/it]

Processing document: 111-0-0.txt
1522


 12%|██████▊                                                | 94/760 [04:34<2:15:25, 12.20s/it]

Processing document: 111-1-0.txt
1193


 12%|██████▉                                                | 95/760 [04:43<2:04:49, 11.26s/it]

Processing document: 111-2-0.txt
1615


 13%|██████▉                                                | 96/760 [04:54<2:03:59, 11.20s/it]

Processing document: 111-2-1.txt
819


 13%|███████                                                | 97/760 [05:01<1:48:57,  9.86s/it]

Processing document: 111-3-0.txt
1513


 13%|███████                                                | 98/760 [05:14<1:58:26, 10.74s/it]

Processing document: 111-4-0.txt
1386


 13%|███████▏                                               | 99/760 [05:21<1:46:49,  9.70s/it]

Processing document: 111-5-0.txt
1237


 13%|███████                                               | 100/760 [05:31<1:46:21,  9.67s/it]

Processing document: 112-0-0.txt
1873


 13%|███████▏                                              | 101/760 [05:41<1:48:00,  9.83s/it]

Processing document: 112-1-0.txt
1493


 13%|███████▏                                              | 102/760 [05:50<1:43:50,  9.47s/it]

Processing document: 112-2-0.txt
1304


 14%|███████▎                                              | 103/760 [05:57<1:38:25,  8.99s/it]

Processing document: 113-0-0.txt
1577


 14%|███████▍                                              | 104/760 [06:10<1:50:44, 10.13s/it]

Processing document: 113-1-0.txt
1543


 14%|███████▍                                              | 105/760 [06:30<2:23:55, 13.18s/it]

Processing document: 113-2-0.txt
1595


 14%|███████▌                                              | 106/760 [06:48<2:37:18, 14.43s/it]

Processing document: 113-3-0.txt
1745


 14%|███████▌                                              | 107/760 [07:03<2:40:14, 14.72s/it]

Processing document: 113-4-0.txt
1245


 14%|███████▋                                              | 108/760 [07:19<2:44:45, 15.16s/it]

Processing document: 113-5-0.txt
1738


 14%|███████▋                                              | 109/760 [07:31<2:33:56, 14.19s/it]

Processing document: 113-6-0.txt
855


 14%|███████▊                                              | 110/760 [07:42<2:22:23, 13.14s/it]

Processing document: 114-0-0.txt
359


 15%|███████▉                                              | 111/760 [07:47<1:54:35, 10.59s/it]

Processing document: 115-0-0.txt
1629


 15%|███████▉                                              | 112/760 [07:58<1:55:07, 10.66s/it]

Processing document: 115-1-0.txt
1560


 15%|████████                                              | 113/760 [08:18<2:27:32, 13.68s/it]

Processing document: 115-10-0.txt
1533


 15%|████████                                              | 114/760 [08:41<2:56:13, 16.37s/it]

Processing document: 115-11-0.txt
1471


 15%|████████▏                                             | 115/760 [08:57<2:56:44, 16.44s/it]

Processing document: 115-12-0.txt
1872


 15%|████████▏                                             | 116/760 [09:21<3:18:09, 18.46s/it]

Processing document: 115-13-0.txt
1880


 15%|████████▎                                             | 117/760 [09:44<3:33:02, 19.88s/it]

Processing document: 115-14-0.txt
1739


 16%|████████▍                                             | 118/760 [10:05<3:35:35, 20.15s/it]

Processing document: 115-15-0.txt
1809


 16%|████████▍                                             | 119/760 [10:28<3:44:38, 21.03s/it]

Processing document: 115-16-0.txt
1381


 16%|████████▌                                             | 120/760 [10:46<3:34:07, 20.07s/it]

Processing document: 115-2-0.txt
1612


 16%|████████▌                                             | 121/760 [11:05<3:33:07, 20.01s/it]

Processing document: 115-3-0.txt
1550


 16%|████████▋                                             | 122/760 [11:16<3:02:09, 17.13s/it]

Processing document: 115-4-0.txt
1535


 16%|████████▋                                             | 123/760 [11:38<3:19:25, 18.78s/it]

Processing document: 115-5-0.txt
1640


 16%|████████▊                                             | 124/760 [11:56<3:15:26, 18.44s/it]

Processing document: 115-6-0.txt
1755


 16%|████████▉                                             | 125/760 [12:15<3:17:05, 18.62s/it]

Processing document: 115-7-0.txt
1740


 17%|████████▉                                             | 126/760 [12:35<3:20:47, 19.00s/it]

Processing document: 115-8-0.txt
1547


 17%|█████████                                             | 127/760 [12:58<3:32:02, 20.10s/it]

Processing document: 115-9-0.txt
1509


 17%|█████████                                             | 128/760 [13:16<3:25:21, 19.50s/it]

Processing document: 116-0-0.txt
1552


 17%|█████████▏                                            | 129/760 [13:34<3:19:39, 18.98s/it]

Processing document: 116-1-0.txt
929


 17%|█████████▏                                            | 130/760 [13:42<2:45:43, 15.78s/it]

Processing document: 117-0-0.txt
1676


 17%|█████████▎                                            | 131/760 [13:55<2:36:45, 14.95s/it]

Processing document: 117-1-0.txt
1987


 17%|█████████▍                                            | 132/760 [14:18<3:03:02, 17.49s/it]

Processing document: 117-1-1.txt
637


 18%|█████████▍                                            | 133/760 [14:26<2:33:18, 14.67s/it]

Processing document: 117-2-0.txt
720


 18%|█████████▌                                            | 134/760 [14:36<2:17:25, 13.17s/it]

Processing document: 118-0-0.txt
1709


 18%|█████████▌                                            | 135/760 [14:48<2:13:31, 12.82s/it]

Processing document: 119-0-0.txt
1493


 18%|█████████▋                                            | 136/760 [15:01<2:12:52, 12.78s/it]

Processing document: 12-0-0.txt
1776


 18%|█████████▋                                            | 137/760 [15:09<1:58:29, 11.41s/it]

Processing document: 12-1-0.txt
1661


 18%|█████████▊                                            | 138/760 [15:21<2:00:07, 11.59s/it]

Processing document: 12-10-0.txt
1700


 18%|█████████▉                                            | 139/760 [15:35<2:06:36, 12.23s/it]

Processing document: 12-11-0.txt
1004


 18%|█████████▉                                            | 140/760 [15:45<1:59:10, 11.53s/it]

Processing document: 12-2-0.txt
1565


 19%|██████████                                            | 141/760 [16:07<2:33:29, 14.88s/it]

Processing document: 12-3-0.txt
1601


 19%|██████████                                            | 142/760 [16:30<2:57:32, 17.24s/it]

Processing document: 12-4-0.txt
1603


 19%|██████████▏                                           | 143/760 [16:53<3:14:14, 18.89s/it]

Processing document: 12-5-0.txt
1606


 19%|██████████▏                                           | 144/760 [17:16<3:25:48, 20.05s/it]

Processing document: 12-6-0.txt
1610


 19%|██████████▎                                           | 145/760 [17:38<3:33:45, 20.85s/it]

Processing document: 12-7-0.txt
1651


 19%|██████████▎                                           | 146/760 [17:57<3:25:19, 20.06s/it]

Processing document: 12-8-0.txt
1703


 19%|██████████▍                                           | 147/760 [18:19<3:33:42, 20.92s/it]

Processing document: 12-9-0.txt
1755


 19%|██████████▌                                           | 148/760 [18:33<3:10:26, 18.67s/it]

Processing document: 120-0-0.txt
336


 20%|██████████▌                                           | 149/760 [18:35<2:19:20, 13.68s/it]

Processing document: 122-0-0.txt
1478


 20%|██████████▋                                           | 150/760 [18:54<2:36:10, 15.36s/it]

Processing document: 122-1-0.txt
1235


 20%|██████████▋                                           | 151/760 [19:16<2:55:10, 17.26s/it]

Processing document: 122-2-0.txt
1547


 20%|██████████▊                                           | 152/760 [19:35<3:01:48, 17.94s/it]

Processing document: 122-3-0.txt
1415


 20%|██████████▊                                           | 153/760 [19:58<3:15:06, 19.29s/it]

Processing document: 123-0-0.txt
884


 20%|██████████▉                                           | 154/760 [20:06<2:42:11, 16.06s/it]

Processing document: 124-0-0.txt
1413


 20%|███████████                                           | 155/760 [20:17<2:24:11, 14.30s/it]

Processing document: 124-1-0.txt
696


 21%|███████████                                           | 156/760 [20:24<2:04:35, 12.38s/it]

Processing document: 125-0-0.txt
867


 21%|███████████▏                                          | 157/760 [20:33<1:53:43, 11.32s/it]

Processing document: 126-0-0.txt
1012


 21%|███████████▏                                          | 158/760 [20:55<2:23:48, 14.33s/it]

Processing document: 127-0-0.txt
1536


 21%|███████████▎                                          | 159/760 [21:04<2:07:14, 12.70s/it]

Processing document: 127-1-0.txt
1616


 21%|███████████▎                                          | 160/760 [21:25<2:32:54, 15.29s/it]

Processing document: 127-2-0.txt
1655


 21%|███████████▍                                          | 161/760 [21:36<2:18:59, 13.92s/it]

Processing document: 128-0-0.txt
1434


 21%|███████████▌                                          | 162/760 [21:49<2:18:30, 13.90s/it]

Processing document: 128-1-0.txt
631


 21%|███████████▌                                          | 163/760 [21:58<2:02:59, 12.36s/it]

Processing document: 129-0-0.txt
1735


 22%|███████████▋                                          | 164/760 [22:12<2:06:10, 12.70s/it]

Processing document: 129-1-0.txt
896


In [18]:
# Preview the first rows of the QA pairs collected in all_qa_df
all_qa_df.head()

Unnamed: 0,Doc_id,Question,Answer
0,10-0-0.txt,What is the name of the cul-de-sac located in ...,Roslyn Place
1,10-0-0.txt,What is the name of the street with the larges...,East Carson Street
2,10-0-0.txt,What is the name of the highway connecting Pit...,I-376
3,10-0-0.txt,What is the name of the highway that runs nort...,I-279
4,10-0-0.txt,What is the name of the belt system in Alleghe...,Allegheny County Belt System


In [19]:
# Save the new QA pairs back to the CSV file (optional).
# The generation cell reads this same file to decide which documents to skip, so
# merge with its current contents instead of overwriting them; otherwise the
# previously saved pairs are lost and their documents would be generated again.
import os

output_csv_path = '../data/annotated/new_generated_qa_pairs_2.csv'
qa_columns = ['Doc_id', 'Question', 'Answer']

# Keep only the QA columns so a stray saved index column is not carried over
new_rows = all_qa_df.reindex(columns=qa_columns)
if os.path.exists(output_csv_path):
    existing_rows = pd.read_csv(output_csv_path).reindex(columns=qa_columns)
    combined_rows = pd.concat([existing_rows, new_rows], ignore_index=True)
else:
    combined_rows = new_rows

# Dropping exact duplicate rows makes re-running this cell idempotent
combined_rows.drop_duplicates().to_csv(output_csv_path, index=False)