In [1]:
import torch

# Sanity check: report which CUDA device (if any) PyTorch can see.
# The conditional expression only queries the device name when CUDA is available.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)

GPU is available: NVIDIA A10G


In [2]:
from huggingface_hub import login
import getpass

# Ask for the Hugging Face access token interactively; getpass masks the
# typed value so the token does not end up in the saved cell output.
token = getpass.getpass(prompt="Enter your Hugging Face token: ")

# Authenticate this session with the token that was just entered.
login(token=token)

Enter your Hugging Face token:  ········


In [3]:
# read in the doc
def read_preprocess_doc(doc_path, max_words=3072):
    """Read a UTF-8 text document and return it as whitespace-normalised text.

    The file content is split on any whitespace and re-joined with single
    spaces, so newlines and runs of blanks collapse to one space. Documents
    longer than ``max_words`` words are cut to their first ``max_words`` words
    and a notice is printed.

    Note that the limit counts whitespace-delimited words, not model tokens;
    the tokenized prompt can therefore be considerably longer than
    ``max_words``.

    Args:
        doc_path: Path of the text file to read.
        max_words: Maximum number of words to keep. Defaults to 3072, the
            limit that used to be hard-coded here. Pass ``None`` to disable
            truncation.

    Returns:
        The (possibly truncated) document as one space-separated string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(doc_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    if max_words is not None and len(words) > max_words:
        print(f'the document {doc_path} is too long, it will be truncated to {max_words} words')
        words = words[:max_words]
    return ' '.join(words)

In [4]:
# Collect the path of every .txt file found (recursively) under the crawl
# output folder and keep them in doc_list.
import os

data_folder_dir = '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072'

# os.walk yields directory entries in an arbitrary, filesystem-dependent
# order, so sort the paths to make the listing (and any slice taken from it)
# reproducible between runs and machines.
doc_list = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(data_folder_dir)
    for file in files
    if file.endswith('.txt')
)

print(doc_list[:5])
print(len(doc_list))

# # Uncomment to smoke-test the pipeline on the first 20 documents only
# doc_list = doc_list[:20]

['../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-0-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-1-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-10-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-11-0.txt', '../data/crawled/crawled_text_data_append_sharding_3072_600_max_3072/0-12-0.txt']
760


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Checkpoint used for generation; an alternative checkpoint is kept below,
# commented out, for reference.
# model_id = "meta-llama/Llama-2-7b-chat-hf"
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Load the causal-LM weights in float16; device_map="auto" delegates device
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Tokenizer that matches the checkpoint selected by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad on the left and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [7]:
from transformers import pipeline
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
)

In [8]:
# Prompt text that the generation loop appends after each document's content.
# It asks the model for 5~10 self-contained factual question/answer pairs in
# "Q: ... A: ..." form, which is the format extract_qa_pairs parses.
# The string is a normal (non-raw) triple-quoted literal, so the "\n"
# sequences below are escapes that become additional newlines.
# NOTE(review): the wording is part of the model input; editing it changes
# what the model generates.
INSTRUCTION = """
I have provided a web-crawled document above that may be relevant to one or more topics such as 
general information, history, economy, music, culture, sports, or upcoming events 
related to Pittsburgh or Carnegie Mellon University. Based on this document, 
generate 5~10 factual question and answer pairs that cover different types of inquiries, 
such as time, events, people, locations, or numerical data.
If there are fewer pairs available, only provide the number you can find.
The questions should make sense independently of the document by including key words from the document.
For example, the question "Q: What is the fee of the event?" is not acceptable, 
because words like "the event" is too vague without context of the document.
It should be sufficiently detailed as "Q: What is the fee of the event at PPG Arena on October 13?" 
or "Q: What is the fee of the event that celebrates the 50th anniversary of CMU?".
Please provide concise answers without repeating the question or using complete sentences.
For example, given the question "Q: When was Carnegie Mellon University founded?",
you should only answer "A: 1900".
\n\nExamples: \n
Q: Who is Pittsburgh named after? A: William Pitt \n
Q: What famous machine learning venue had its first conference in Pittsburgh in 1980? A: ICML \n
Q: What musical artist is performing at PPG Arena on October 13? A: Billie Eilish\n\n
Before you start, please read the document above and provide the number of question and answer pairs you can find."""

In [9]:
import re
import pandas as pd
def extract_qa_pairs(generated_text, doc_id):
    """Parse "Q: ... A: ..." pairs out of generated text into a DataFrame.

    Each question and its answer are captured by a single regular expression,
    so the two columns always have the same length. Matching them with two
    independent patterns breaks when the text contains a stray "A:" that does
    not follow a "Q:" (for example "Q&A:" in a preamble): the lists then differ
    in length and the DataFrame constructor raises ValueError.

    A pair may be preceded by an optional list number such as "3. ". An answer
    runs until the next (optionally numbered) "Q:" or the end of the text;
    surrounding whitespace is not included.

    Args:
        generated_text: Raw text produced by the model.
        doc_id: Identifier stored in the ``Doc_id`` column of every row.

    Returns:
        A pandas DataFrame with columns ``Doc_id``, ``Question`` and
        ``Answer`` and one row per extracted pair (zero rows if none found).
    """
    # DOTALL lets questions and answers span several lines.
    pair_pattern = re.compile(
        r'(?:\d*\.\s*)?Q:\s*(.*?)\s*A:\s*(.*?)(?=\s*(?:\d*\.\s*Q:|Q:|$))',
        re.DOTALL,
    )
    pairs = pair_pattern.findall(generated_text)

    return pd.DataFrame({
        'Doc_id': [doc_id] * len(pairs),
        'Question': [question for question, _ in pairs],
        'Answer': [answer for _, answer in pairs],
    })

In [27]:
# torch.cuda.empty_cache()

In [14]:
# import pandas as pd
from tqdm import tqdm
import datetime
import os

# Resume support: load the QA pairs produced by an earlier run so documents
# that already have pairs are not sent through the model again.
# To also skip documents covered by other result files, read them here and
# add them to the concat list below.
processed_qa_df_1 = pd.read_csv('../data/annotated/generated_qa_pairs_1.csv')
processed_qa_df = pd.concat([processed_qa_df_1], axis=0)

# File names of documents that already have QA pairs (column 'Doc_id').
# A set gives constant-time membership tests inside the loop.
# NOTE: a document for which no pair could be extracted never appears in this
# column and is therefore processed again on the next run.
processed_doc_ids = set(processed_qa_df['Doc_id'].unique())

# Per-document result frames; combined into all_qa_df once the loop ends.
qa_frames = []

try:
    for doc in tqdm(doc_list):
        # The document id is the file name component of the path.
        doc_id = os.path.basename(doc)

        if doc_id in processed_doc_ids:
            print(f"Skipping already processed document: {doc_id}")
            continue

        print(f"Processing document: {doc_id}")
        content = read_preprocess_doc(doc)
        formatted_input = content + "\n\n" + INSTRUCTION + "\n\n" + "Your answer:"

        # Log the prompt length in tokens. Only the count is needed, so the
        # ids are not converted to tensors or moved to the model's device.
        print(len(tokenizer(formatted_input)["input_ids"]))

        # Generate the response for a single-turn chat with the prompt as the user message.
        messages = [
            {"role": "user", "content": formatted_input},
        ]
        with torch.no_grad():
            result = pipe(messages, max_new_tokens=512)

        # The message at index 1 of the returned conversation is the reply
        # that follows the single user message.
        qa_df = extract_qa_pairs(result[0]['generated_text'][1]['content'], doc_id)

        # Save each document's pairs right away so an interrupted run keeps
        # its results. The document name is part of the file name because the
        # timestamp only has one-second resolution: two documents finishing
        # within the same second would otherwise overwrite each other.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        doc_stem = os.path.splitext(doc_id)[0]
        qa_df.to_csv(f'../data/annotated/new_generated_qa_pairs_{timestamp}_{doc_stem}.csv')

        qa_frames.append(qa_df)
finally:
    # Build the combined frame even if the loop is interrupted, so whatever
    # was generated so far stays available in all_qa_df. Concatenating once
    # avoids re-copying the growing frame on every iteration.
    all_qa_df = (
        pd.concat(qa_frames, axis=0, ignore_index=True)
        if qa_frames
        else pd.DataFrame()
    )

  0%|                                                                  | 0/760 [00:00<?, ?it/s]

Skipping already processed document: 0-0-0.txt
Skipping already processed document: 0-1-0.txt
Skipping already processed document: 0-10-0.txt
Skipping already processed document: 0-11-0.txt
Skipping already processed document: 0-12-0.txt
Skipping already processed document: 0-13-0.txt
Skipping already processed document: 0-14-0.txt
Skipping already processed document: 0-15-0.txt
Skipping already processed document: 0-16-0.txt
Skipping already processed document: 0-17-0.txt
Skipping already processed document: 0-18-0.txt
Skipping already processed document: 0-19-0.txt
Skipping already processed document: 0-2-0.txt
Skipping already processed document: 0-20-0.txt
Skipping already processed document: 0-21-0.txt
Skipping already processed document: 0-22-0.txt
Skipping already processed document: 0-23-0.txt
Skipping already processed document: 0-24-0.txt
Skipping already processed document: 0-25-0.txt
Skipping already processed document: 0-26-0.txt
Skipping already processed document: 0-27-0

 20%|██████████▉                                             | 149/760 [00:02<00:10, 56.61it/s]

Skipping already processed document: 122-0-0.txt
Skipping already processed document: 122-1-0.txt
Skipping already processed document: 122-2-0.txt
Skipping already processed document: 122-3-0.txt
Skipping already processed document: 123-0-0.txt
Skipping already processed document: 124-0-0.txt
Skipping already processed document: 124-1-0.txt
Skipping already processed document: 125-0-0.txt
Skipping already processed document: 126-0-0.txt
Skipping already processed document: 127-0-0.txt
Skipping already processed document: 127-1-0.txt
Skipping already processed document: 127-2-0.txt
Skipping already processed document: 128-0-0.txt
Skipping already processed document: 128-1-0.txt
Skipping already processed document: 129-0-0.txt
Skipping already processed document: 129-1-0.txt
Skipping already processed document: 13-0-0.txt
Skipping already processed document: 13-1-0.txt
Skipping already processed document: 13-2-0.txt
Skipping already processed document: 13-2-1.txt
Skipping already process

 25%|█████████████▊                                          | 187/760 [00:13<00:50, 11.38it/s]

Processing document: 134-0-0.txt
1384


 25%|█████████████▊                                          | 188/760 [00:26<02:05,  4.56it/s]

Processing document: 135-0-0.txt
1679


 25%|█████████████▉                                          | 189/760 [00:35<03:10,  2.99it/s]

Processing document: 135-1-0.txt
1504
Processing document: 136-0-0.txt
1559


 25%|██████████████                                          | 191/760 [00:59<07:32,  1.26it/s]

Processing document: 136-1-0.txt
1605


 25%|██████████████▏                                         | 192/760 [01:20<12:40,  1.34s/it]

Processing document: 136-2-0.txt
1181


 25%|██████████████▏                                         | 193/760 [01:30<15:33,  1.65s/it]

Processing document: 136-3-0.txt
1369
Processing document: 137-0-0.txt
1462


 26%|██████████████▎                                         | 195/760 [01:58<26:45,  2.84s/it]

Processing document: 138-0-0.txt
1560


 26%|██████████████▍                                         | 196/760 [02:16<35:39,  3.79s/it]

Processing document: 138-1-0.txt
761


 26%|██████████████▌                                         | 197/760 [02:25<39:44,  4.23s/it]

Processing document: 139-0-0.txt
1697


 26%|██████████████▌                                         | 198/760 [02:46<55:43,  5.95s/it]

Processing document: 139-1-0.txt
1221


 26%|██████████████▏                                       | 199/760 [03:00<1:05:32,  7.01s/it]

Processing document: 139-2-0.txt
1416


 26%|██████████████▏                                       | 200/760 [03:08<1:08:06,  7.30s/it]

Processing document: 14-0-0.txt
1385


 26%|██████████████▎                                       | 201/760 [03:24<1:22:33,  8.86s/it]

Processing document: 14-1-0.txt
1190


 27%|██████████████▎                                       | 202/760 [03:34<1:23:29,  8.98s/it]

Processing document: 140-0-0.txt
1581


 27%|██████████████▍                                       | 203/760 [03:46<1:30:04,  9.70s/it]

Processing document: 140-1-0.txt
1183


 27%|██████████████▍                                       | 204/760 [03:54<1:26:06,  9.29s/it]

Processing document: 141-0-0.txt
515


 27%|██████████████▌                                       | 205/760 [04:00<1:17:39,  8.40s/it]

Processing document: 142-0-0.txt
569


 27%|██████████████▋                                       | 206/760 [04:12<1:27:47,  9.51s/it]

Processing document: 143-0-0.txt
676


 27%|██████████████▋                                       | 207/760 [04:21<1:26:45,  9.41s/it]

Processing document: 144-0-0.txt
1456


 27%|██████████████▊                                       | 208/760 [04:30<1:23:19,  9.06s/it]

Processing document: 145-0-0.txt
1560


 28%|██████████████▊                                       | 209/760 [04:39<1:24:02,  9.15s/it]

Processing document: 145-1-0.txt
1406


 28%|██████████████▉                                       | 210/760 [05:01<1:59:23, 13.03s/it]

Processing document: 146-0-0.txt
1164


 28%|██████████████▉                                       | 211/760 [05:17<2:06:04, 13.78s/it]

Processing document: 147-0-0.txt
743


 28%|███████████████                                       | 212/760 [05:24<1:46:56, 11.71s/it]

Processing document: 148-0-0.txt
1103


 28%|███████████████▏                                      | 213/760 [05:35<1:45:46, 11.60s/it]

Processing document: 149-0-0.txt
1211


 28%|███████████████▏                                      | 214/760 [05:45<1:39:59, 10.99s/it]

Processing document: 15-0-0.txt
1519


 28%|███████████████▎                                      | 215/760 [05:52<1:30:49, 10.00s/it]

Processing document: 15-1-0.txt
818


 28%|███████████████▎                                      | 216/760 [05:59<1:22:18,  9.08s/it]

Processing document: 150-0-0.txt
1591


 29%|███████████████▍                                      | 217/760 [06:18<1:48:41, 12.01s/it]

Processing document: 150-1-0.txt
1697


 29%|███████████████▍                                      | 218/760 [06:30<1:48:03, 11.96s/it]

Processing document: 150-2-0.txt
1178


 29%|███████████████▌                                      | 219/760 [06:39<1:40:20, 11.13s/it]

Processing document: 16-0-0.txt
1578


 29%|███████████████▋                                      | 220/760 [06:51<1:43:34, 11.51s/it]

Processing document: 16-1-0.txt
855


 29%|███████████████▋                                      | 221/760 [07:01<1:37:52, 10.89s/it]

Processing document: 17-0-0.txt
1423


 29%|███████████████▊                                      | 222/760 [07:15<1:45:41, 11.79s/it]

Processing document: 18-0-0.txt
834


 29%|███████████████▊                                      | 223/760 [07:24<1:39:51, 11.16s/it]

Processing document: 19-0-0.txt
1782


 29%|███████████████▉                                      | 224/760 [07:32<1:30:39, 10.15s/it]

Processing document: 19-1-0.txt
1479


 30%|███████████████▉                                      | 225/760 [07:54<2:01:25, 13.62s/it]

Processing document: 19-10-0.txt
1503


 30%|████████████████                                      | 226/760 [08:17<2:25:08, 16.31s/it]

Processing document: 19-11-0.txt
1603


 30%|████████████████▏                                     | 227/760 [08:35<2:29:25, 16.82s/it]

Processing document: 19-12-0.txt
1623


 30%|████████████████▏                                     | 228/760 [08:52<2:29:32, 16.87s/it]

Processing document: 19-12-1.txt
341


 30%|████████████████▎                                     | 229/760 [08:53<1:48:09, 12.22s/it]

Processing document: 19-13-0.txt
1491


 30%|████████████████▎                                     | 230/760 [09:11<2:03:33, 13.99s/it]

Processing document: 19-14-0.txt
1476


 30%|████████████████▍                                     | 231/760 [09:22<1:55:42, 13.12s/it]

Processing document: 19-15-0.txt
1190


 31%|████████████████▍                                     | 232/760 [09:40<2:07:23, 14.48s/it]

Processing document: 19-16-0.txt
1621


 31%|████████████████▌                                     | 233/760 [09:51<1:59:32, 13.61s/it]

Processing document: 19-17-0.txt
1638


 31%|████████████████▋                                     | 234/760 [10:09<2:10:27, 14.88s/it]

Processing document: 19-18-0.txt
1675


 31%|████████████████▋                                     | 235/760 [10:30<2:25:32, 16.63s/it]

Processing document: 19-19-0.txt
1673


 31%|████████████████▊                                     | 236/760 [10:42<2:12:33, 15.18s/it]

Processing document: 19-2-0.txt
1508


 31%|████████████████▊                                     | 237/760 [11:03<2:27:47, 16.96s/it]

Processing document: 19-20-0.txt
1660


 31%|████████████████▉                                     | 238/760 [11:16<2:17:59, 15.86s/it]

Processing document: 19-21-0.txt
1679


 31%|████████████████▉                                     | 239/760 [11:31<2:15:41, 15.63s/it]

Processing document: 19-22-0.txt
1653


 32%|█████████████████                                     | 240/760 [11:54<2:34:08, 17.78s/it]

Processing document: 19-23-0.txt
1667


 32%|█████████████████                                     | 241/760 [12:12<2:33:02, 17.69s/it]

Processing document: 19-24-0.txt
1214


 32%|█████████████████▏                                    | 242/760 [12:33<2:41:55, 18.76s/it]

Processing document: 19-25-0.txt
1620


 32%|█████████████████▎                                    | 243/760 [12:43<2:20:36, 16.32s/it]

Processing document: 19-26-0.txt
591


 32%|█████████████████▎                                    | 244/760 [12:51<1:58:31, 13.78s/it]

Processing document: 19-3-0.txt
1513


 32%|█████████████████▍                                    | 245/760 [13:14<2:20:59, 16.43s/it]

Processing document: 19-4-0.txt
1645


 32%|█████████████████▍                                    | 246/760 [13:37<2:37:07, 18.34s/it]

Processing document: 19-5-0.txt
1767


 32%|█████████████████▌                                    | 247/760 [14:00<2:48:46, 19.74s/it]

Processing document: 19-6-0.txt
1483


 33%|█████████████████▌                                    | 248/760 [14:20<2:48:43, 19.77s/it]

Processing document: 19-7-0.txt
1449


 33%|█████████████████▋                                    | 249/760 [14:32<2:29:50, 17.59s/it]

Processing document: 19-8-0.txt
1484


 33%|█████████████████▊                                    | 250/760 [14:55<2:42:06, 19.07s/it]

Processing document: 19-9-0.txt
1422


 33%|█████████████████▊                                    | 251/760 [15:17<2:50:19, 20.08s/it]

Processing document: 2-0-0.txt
1546


 33%|█████████████████▉                                    | 252/760 [15:30<2:31:57, 17.95s/it]

Processing document: 2-1-0.txt
1169


 33%|█████████████████▉                                    | 253/760 [15:38<2:06:25, 14.96s/it]

Processing document: 20-0-0.txt
744


 33%|██████████████████                                    | 254/760 [15:46<1:49:38, 13.00s/it]

Processing document: 21-0-0.txt
1197


 34%|██████████████████                                    | 255/760 [15:55<1:37:21, 11.57s/it]

Processing document: 22-0-0.txt
1431


 34%|██████████████████▏                                   | 256/760 [16:07<1:38:44, 11.75s/it]

Processing document: 23-0-0.txt
642


 34%|██████████████████▎                                   | 257/760 [16:16<1:31:58, 10.97s/it]

Processing document: 23255_2024_Operating_Budget-0-0.txt
1704


 34%|██████████████████▎                                   | 258/760 [16:29<1:37:26, 11.65s/it]

Processing document: 23255_2024_Operating_Budget-1-0.txt
1486


 34%|██████████████████▍                                   | 259/760 [16:43<1:42:36, 12.29s/it]

Processing document: 23255_2024_Operating_Budget-10-0.txt
1521


 34%|██████████████████▍                                   | 260/760 [16:53<1:37:47, 11.73s/it]

Processing document: 23255_2024_Operating_Budget-100-0.txt
2480


 34%|██████████████████▌                                   | 261/760 [17:16<2:04:51, 15.01s/it]

Processing document: 23255_2024_Operating_Budget-100-1.txt
3898


 34%|██████████████████▌                                   | 262/760 [17:29<2:00:20, 14.50s/it]

Processing document: 23255_2024_Operating_Budget-100-2.txt
5418


 35%|██████████████████▋                                   | 263/760 [17:55<2:28:48, 17.96s/it]

Processing document: 23255_2024_Operating_Budget-100-3.txt
6271


 35%|██████████████████▊                                   | 264/760 [18:27<3:01:46, 21.99s/it]

Processing document: 23255_2024_Operating_Budget-100-4.txt
4605


 35%|██████████████████▊                                   | 265/760 [18:39<2:37:00, 19.03s/it]

Processing document: 23255_2024_Operating_Budget-11-0.txt
1408


 35%|██████████████████▉                                   | 266/760 [18:52<2:21:07, 17.14s/it]

Processing document: 23255_2024_Operating_Budget-12-0.txt
1270


 35%|██████████████████▉                                   | 267/760 [19:03<2:06:12, 15.36s/it]

Processing document: 23255_2024_Operating_Budget-13-0.txt
2184


 35%|███████████████████                                   | 268/760 [19:13<1:53:48, 13.88s/it]

Processing document: 23255_2024_Operating_Budget-14-0.txt
3450


 35%|███████████████████                                   | 269/760 [19:39<2:23:21, 17.52s/it]

Processing document: 23255_2024_Operating_Budget-14-1.txt
752


 36%|███████████████████▏                                  | 270/760 [19:48<2:00:59, 14.82s/it]

Processing document: 23255_2024_Operating_Budget-15-0.txt
2071


 36%|███████████████████▎                                  | 271/760 [20:11<2:22:10, 17.44s/it]

Processing document: 23255_2024_Operating_Budget-16-0.txt
1637


 36%|███████████████████▎                                  | 272/760 [20:30<2:24:09, 17.72s/it]

Processing document: 23255_2024_Operating_Budget-17-0.txt
1550


 36%|███████████████████▍                                  | 273/760 [20:44<2:14:23, 16.56s/it]

Processing document: 23255_2024_Operating_Budget-18-0.txt
1588


 36%|███████████████████▍                                  | 274/760 [21:06<2:29:04, 18.40s/it]

Processing document: 23255_2024_Operating_Budget-19-0.txt
2170


 36%|███████████████████▌                                  | 275/760 [21:24<2:26:51, 18.17s/it]

Processing document: 23255_2024_Operating_Budget-2-0.txt
1078


 36%|███████████████████▌                                  | 276/760 [21:33<2:03:53, 15.36s/it]

Processing document: 23255_2024_Operating_Budget-20-0.txt
2680


 36%|███████████████████▋                                  | 277/760 [21:57<2:26:00, 18.14s/it]

Processing document: 23255_2024_Operating_Budget-21-0.txt
3280


 37%|███████████████████▊                                  | 278/760 [22:23<2:43:53, 20.40s/it]

Processing document: 23255_2024_Operating_Budget-21-1.txt
1557


 37%|███████████████████▊                                  | 279/760 [22:46<2:48:59, 21.08s/it]

Processing document: 23255_2024_Operating_Budget-22-0.txt
1829


 37%|███████████████████▉                                  | 280/760 [22:57<2:24:37, 18.08s/it]

Processing document: 23255_2024_Operating_Budget-23-0.txt
1609


 37%|███████████████████▉                                  | 281/760 [23:11<2:15:01, 16.91s/it]

Processing document: 23255_2024_Operating_Budget-24-0.txt
1434


 37%|████████████████████                                  | 282/760 [23:24<2:06:02, 15.82s/it]

Processing document: 23255_2024_Operating_Budget-25-0.txt
3745


 37%|████████████████████                                  | 283/760 [23:36<1:55:18, 14.50s/it]

Processing document: 23255_2024_Operating_Budget-25-1.txt
340


 37%|████████████████████▏                                 | 284/760 [23:38<1:25:07, 10.73s/it]

Processing document: 23255_2024_Operating_Budget-26-0.txt
2301


 38%|████████████████████▎                                 | 285/760 [23:46<1:20:14, 10.14s/it]

Processing document: 23255_2024_Operating_Budget-27-0.txt
719


 38%|████████████████████▎                                 | 286/760 [23:53<1:11:41,  9.08s/it]

Processing document: 23255_2024_Operating_Budget-28-0.txt
2395


 38%|████████████████████▍                                 | 287/760 [24:01<1:09:42,  8.84s/it]

Processing document: 23255_2024_Operating_Budget-28-1.txt
2526


 38%|████████████████████▍                                 | 288/760 [24:15<1:22:15, 10.46s/it]

Processing document: 23255_2024_Operating_Budget-28-2.txt
1251


 38%|████████████████████▌                                 | 289/760 [24:23<1:15:06,  9.57s/it]

Processing document: 23255_2024_Operating_Budget-29-0.txt
1281


 38%|████████████████████▌                                 | 290/760 [24:36<1:22:02, 10.47s/it]

Processing document: 23255_2024_Operating_Budget-3-0.txt
1559


 38%|████████████████████▋                                 | 291/760 [24:47<1:25:00, 10.88s/it]

Processing document: 23255_2024_Operating_Budget-30-0.txt
2538


 38%|████████████████████▋                                 | 292/760 [24:57<1:23:01, 10.64s/it]

Processing document: 23255_2024_Operating_Budget-30-1.txt
815


 39%|████████████████████▊                                 | 293/760 [25:05<1:15:40,  9.72s/it]

Processing document: 23255_2024_Operating_Budget-31-0.txt
1120


 39%|████████████████████▉                                 | 294/760 [25:15<1:15:20,  9.70s/it]

Processing document: 24-0-0.txt
1117


 39%|████████████████████▉                                 | 295/760 [25:24<1:14:08,  9.57s/it]

Processing document: 23255_2024_Operating_Budget-32-0.txt
2142


 39%|█████████████████████                                 | 296/760 [25:35<1:18:15, 10.12s/it]

Processing document: 23255_2024_Operating_Budget-33-0.txt
2521


 39%|█████████████████████                                 | 297/760 [25:47<1:22:03, 10.63s/it]

Processing document: 23255_2024_Operating_Budget-34-0.txt
872


 39%|█████████████████████▏                                | 298/760 [25:58<1:22:14, 10.68s/it]

Processing document: 23255_2024_Operating_Budget-35-0.txt
1951


 39%|█████████████████████▏                                | 299/760 [26:13<1:31:06, 11.86s/it]

Processing document: 23255_2024_Operating_Budget-36-0.txt
1030


 39%|█████████████████████▎                                | 300/760 [26:21<1:23:43, 10.92s/it]

Processing document: 23255_2024_Operating_Budget-37-0.txt
2437


 40%|█████████████████████▍                                | 301/760 [26:45<1:53:59, 14.90s/it]

Processing document: 23255_2024_Operating_Budget-37-1.txt
738


 40%|█████████████████████▍                                | 302/760 [26:54<1:38:20, 12.88s/it]

Processing document: 23255_2024_Operating_Budget-38-0.txt
1644


 40%|█████████████████████▌                                | 303/760 [27:05<1:34:16, 12.38s/it]

Processing document: 23255_2024_Operating_Budget-39-0.txt
2358


 40%|█████████████████████▌                                | 304/760 [27:15<1:29:03, 11.72s/it]

Processing document: 23255_2024_Operating_Budget-39-1.txt
1497


 40%|█████████████████████▋                                | 305/760 [27:23<1:19:26, 10.48s/it]

Processing document: 23255_2024_Operating_Budget-4-0.txt
745


 40%|█████████████████████▋                                | 306/760 [27:33<1:18:30, 10.38s/it]

Processing document: 23255_2024_Operating_Budget-40-0.txt
1379


 40%|█████████████████████▊                                | 307/760 [27:45<1:23:07, 11.01s/it]

Processing document: 23255_2024_Operating_Budget-41-0.txt
2469


 41%|█████████████████████▉                                | 308/760 [27:58<1:27:44, 11.65s/it]

Processing document: 23255_2024_Operating_Budget-41-1.txt
371


 41%|█████████████████████▉                                | 309/760 [28:04<1:13:12,  9.74s/it]

Processing document: 23255_2024_Operating_Budget-42-0.txt
1213


 41%|██████████████████████                                | 310/760 [28:12<1:09:04,  9.21s/it]

Processing document: 23255_2024_Operating_Budget-43-0.txt
1780


 41%|██████████████████████                                | 311/760 [28:28<1:24:08, 11.24s/it]

Processing document: 23255_2024_Operating_Budget-44-0.txt
1973


 41%|██████████████████████▏                               | 312/760 [28:39<1:24:43, 11.35s/it]

Processing document: 23255_2024_Operating_Budget-45-0.txt
1202


 41%|██████████████████████▏                               | 313/760 [28:54<1:32:51, 12.46s/it]

Processing document: 23255_2024_Operating_Budget-46-0.txt
2426


 41%|██████████████████████▎                               | 314/760 [29:08<1:35:48, 12.89s/it]

Processing document: 23255_2024_Operating_Budget-46-1.txt
1176


 41%|██████████████████████▍                               | 315/760 [29:19<1:30:03, 12.14s/it]

Processing document: 23255_2024_Operating_Budget-47-0.txt
1716


 42%|██████████████████████▍                               | 316/760 [29:37<1:44:47, 14.16s/it]

Processing document: 23255_2024_Operating_Budget-48-0.txt
599


 42%|██████████████████████▌                               | 317/760 [29:49<1:38:19, 13.32s/it]

Processing document: 23255_2024_Operating_Budget-49-0.txt
2448


 42%|██████████████████████▌                               | 318/760 [29:59<1:31:40, 12.44s/it]

Processing document: 23255_2024_Operating_Budget-49-1.txt
449


 42%|██████████████████████▋                               | 319/760 [30:08<1:23:29, 11.36s/it]

Processing document: 23255_2024_Operating_Budget-5-0.txt
2066


 42%|██████████████████████▋                               | 320/760 [30:18<1:21:16, 11.08s/it]

Processing document: 23255_2024_Operating_Budget-5-1.txt
1270


 42%|██████████████████████▊                               | 321/760 [30:36<1:34:48, 12.96s/it]

Processing document: 23255_2024_Operating_Budget-50-0.txt
1010


 42%|██████████████████████▉                               | 322/760 [30:48<1:33:50, 12.86s/it]

Processing document: 23255_2024_Operating_Budget-51-0.txt
2365


 42%|██████████████████████▉                               | 323/760 [31:04<1:38:30, 13.53s/it]

Processing document: 23255_2024_Operating_Budget-51-1.txt
759


 43%|███████████████████████                               | 324/760 [31:12<1:27:27, 12.04s/it]

Processing document: 23255_2024_Operating_Budget-52-0.txt
1514


 43%|███████████████████████                               | 325/760 [31:22<1:23:18, 11.49s/it]

Processing document: 23255_2024_Operating_Budget-53-0.txt
1415


 43%|███████████████████████▏                              | 326/760 [31:39<1:33:25, 12.92s/it]

Processing document: 23255_2024_Operating_Budget-54-0.txt
1476


 43%|███████████████████████▏                              | 327/760 [31:49<1:26:51, 12.04s/it]

Processing document: 23255_2024_Operating_Budget-55-0.txt
2631


 43%|███████████████████████▎                              | 328/760 [32:13<1:52:50, 15.67s/it]

Processing document: 23255_2024_Operating_Budget-56-0.txt
1360


 43%|███████████████████████▍                              | 329/760 [32:22<1:38:36, 13.73s/it]

Processing document: 23255_2024_Operating_Budget-57-0.txt
1514


 43%|███████████████████████▍                              | 330/760 [32:44<1:57:27, 16.39s/it]

Processing document: 23255_2024_Operating_Budget-58-0.txt
2584


 44%|███████████████████████▌                              | 331/760 [32:59<1:53:25, 15.86s/it]

Processing document: 23255_2024_Operating_Budget-59-0.txt
1589


 44%|███████████████████████▌                              | 332/760 [33:11<1:45:17, 14.76s/it]

Processing document: 23255_2024_Operating_Budget-6-0.txt
1498


 44%|███████████████████████▋                              | 333/760 [33:21<1:34:58, 13.34s/it]

Processing document: 23255_2024_Operating_Budget-60-0.txt
1577


 44%|███████████████████████▋                              | 334/760 [33:31<1:25:58, 12.11s/it]

Processing document: 23255_2024_Operating_Budget-61-0.txt
2525


 44%|███████████████████████▊                              | 335/760 [33:42<1:24:19, 11.90s/it]

Processing document: 23255_2024_Operating_Budget-62-0.txt
1331


 44%|███████████████████████▊                              | 336/760 [33:54<1:24:51, 12.01s/it]

Processing document: 23255_2024_Operating_Budget-63-0.txt
1422


 44%|███████████████████████▉                              | 337/760 [34:03<1:17:27, 10.99s/it]

Processing document: 23255_2024_Operating_Budget-64-0.txt
2407


 44%|████████████████████████                              | 338/760 [34:23<1:37:23, 13.85s/it]

Processing document: 23255_2024_Operating_Budget-65-0.txt
2020


 45%|████████████████████████                              | 339/760 [34:37<1:36:42, 13.78s/it]

Processing document: 23255_2024_Operating_Budget-65-1.txt
531


 45%|████████████████████████▏                             | 340/760 [34:45<1:23:26, 11.92s/it]

Processing document: 23255_2024_Operating_Budget-66-0.txt
1530


 45%|████████████████████████▏                             | 341/760 [34:56<1:21:36, 11.69s/it]

Processing document: 23255_2024_Operating_Budget-67-0.txt
1518


 45%|████████████████████████▎                             | 342/760 [35:06<1:17:30, 11.13s/it]

Processing document: 23255_2024_Operating_Budget-68-0.txt
1382


 45%|████████████████████████▎                             | 343/760 [35:18<1:20:35, 11.60s/it]

Processing document: 23255_2024_Operating_Budget-69-0.txt
2067


 45%|████████████████████████▍                             | 344/760 [35:34<1:29:14, 12.87s/it]

Processing document: 23255_2024_Operating_Budget-69-1.txt
1859


 45%|████████████████████████▌                             | 345/760 [35:43<1:20:23, 11.62s/it]

Processing document: 23255_2024_Operating_Budget-7-0.txt
1405


 46%|████████████████████████▌                             | 346/760 [35:56<1:23:01, 12.03s/it]

Processing document: 23255_2024_Operating_Budget-70-0.txt
841


 46%|████████████████████████▋                             | 347/760 [36:02<1:11:06, 10.33s/it]

Processing document: 23255_2024_Operating_Budget-71-0.txt
2715


 46%|████████████████████████▋                             | 348/760 [36:13<1:11:41, 10.44s/it]

Processing document: 23255_2024_Operating_Budget-71-1.txt
2265


 46%|████████████████████████▊                             | 349/760 [36:23<1:11:02, 10.37s/it]

Processing document: 23255_2024_Operating_Budget-72-0.txt
1568


 46%|████████████████████████▊                             | 350/760 [36:31<1:05:13,  9.55s/it]

Processing document: 23255_2024_Operating_Budget-73-0.txt
2462


 46%|████████████████████████▉                             | 351/760 [36:47<1:19:49, 11.71s/it]

Processing document: 23255_2024_Operating_Budget-74-0.txt
1178


 46%|█████████████████████████                             | 352/760 [36:58<1:16:48, 11.29s/it]

Processing document: 23255_2024_Operating_Budget-75-0.txt
2243


 46%|█████████████████████████                             | 353/760 [37:22<1:42:11, 15.06s/it]

Processing document: 23255_2024_Operating_Budget-75-1.txt
1360


 47%|█████████████████████████▏                            | 354/760 [37:32<1:33:03, 13.75s/it]

Processing document: 23255_2024_Operating_Budget-76-0.txt
1628


 47%|█████████████████████████▏                            | 355/760 [37:43<1:25:43, 12.70s/it]

Processing document: 23255_2024_Operating_Budget-77-0.txt
1083


 47%|█████████████████████████▎                            | 356/760 [37:54<1:23:56, 12.47s/it]

Processing document: 23255_2024_Operating_Budget-78-0.txt
2345


 47%|█████████████████████████▎                            | 357/760 [38:17<1:44:25, 15.55s/it]

Processing document: 23255_2024_Operating_Budget-78-1.txt
1867


 47%|█████████████████████████▍                            | 358/760 [38:28<1:34:42, 14.13s/it]

Processing document: 23255_2024_Operating_Budget-79-0.txt
1493


 47%|█████████████████████████▌                            | 359/760 [38:38<1:25:49, 12.84s/it]

Processing document: 23255_2024_Operating_Budget-8-0.txt
1532


 47%|█████████████████████████▌                            | 360/760 [38:54<1:32:50, 13.93s/it]

Processing document: 23255_2024_Operating_Budget-80-0.txt
2054


 48%|█████████████████████████▋                            | 361/760 [39:05<1:25:21, 12.84s/it]

Processing document: 23255_2024_Operating_Budget-81-0.txt
1468


 48%|█████████████████████████▋                            | 362/760 [39:14<1:18:51, 11.89s/it]

Processing document: 23255_2024_Operating_Budget-82-0.txt
1663


 48%|█████████████████████████▊                            | 363/760 [39:28<1:21:24, 12.30s/it]

Processing document: 23255_2024_Operating_Budget-83-0.txt
1566


 48%|█████████████████████████▊                            | 364/760 [39:39<1:19:36, 12.06s/it]

Processing document: 23255_2024_Operating_Budget-84-0.txt
1618


 48%|█████████████████████████▉                            | 365/760 [39:52<1:20:53, 12.29s/it]

Processing document: 23255_2024_Operating_Budget-85-0.txt
1680


 48%|██████████████████████████                            | 366/760 [40:07<1:26:47, 13.22s/it]

Processing document: 23255_2024_Operating_Budget-86-0.txt
1686


 48%|██████████████████████████                            | 367/760 [40:17<1:20:30, 12.29s/it]

Processing document: 23255_2024_Operating_Budget-87-0.txt
2237


 48%|██████████████████████████▏                           | 368/760 [40:27<1:15:23, 11.54s/it]

Processing document: 23255_2024_Operating_Budget-88-0.txt
1940


 49%|██████████████████████████▏                           | 369/760 [40:38<1:14:22, 11.41s/it]

Processing document: 23255_2024_Operating_Budget-89-0.txt
814


 49%|██████████████████████████▎                           | 370/760 [40:46<1:07:30, 10.39s/it]

Processing document: 23255_2024_Operating_Budget-9-0.txt
1438


 49%|██████████████████████████▎                           | 371/760 [41:00<1:13:43, 11.37s/it]

Processing document: 23255_2024_Operating_Budget-90-0.txt
2239


 49%|██████████████████████████▍                           | 372/760 [41:09<1:09:45, 10.79s/it]

Processing document: 23255_2024_Operating_Budget-91-0.txt
2239


 49%|██████████████████████████▌                           | 373/760 [41:23<1:15:32, 11.71s/it]

Processing document: 23255_2024_Operating_Budget-92-0.txt
2035


 49%|██████████████████████████▌                           | 374/760 [41:35<1:15:21, 11.71s/it]

Processing document: 23255_2024_Operating_Budget-93-0.txt
1558


 49%|██████████████████████████▋                           | 375/760 [41:45<1:12:39, 11.32s/it]

Processing document: 23255_2024_Operating_Budget-94-0.txt
2109


 49%|██████████████████████████▋                           | 376/760 [42:02<1:22:29, 12.89s/it]

Processing document: 23255_2024_Operating_Budget-95-0.txt
1541


 50%|██████████████████████████▊                           | 377/760 [42:14<1:20:57, 12.68s/it]

Processing document: 23255_2024_Operating_Budget-96-0.txt
2031


 50%|██████████████████████████▊                           | 378/760 [42:25<1:16:35, 12.03s/it]

Processing document: 23255_2024_Operating_Budget-97-0.txt
2020


 50%|██████████████████████████▉                           | 379/760 [42:35<1:14:09, 11.68s/it]

Processing document: 23255_2024_Operating_Budget-98-0.txt
1281


 50%|███████████████████████████                           | 380/760 [42:49<1:17:48, 12.29s/it]

Processing document: 23255_2024_Operating_Budget-99-0.txt
2434


 50%|███████████████████████████                           | 381/760 [43:13<1:40:08, 15.85s/it]

Processing document: 23255_2024_Operating_Budget-99-1.txt
475


 50%|███████████████████████████▏                          | 382/760 [43:22<1:26:29, 13.73s/it]

Processing document: 25-0-0.txt
582


 50%|███████████████████████████▏                          | 383/760 [43:29<1:12:50, 11.59s/it]

Processing document: 26-0-0.txt
619


 51%|███████████████████████████▎                          | 384/760 [43:36<1:04:03, 10.22s/it]

Processing document: 27-0-0.txt
1093


 51%|███████████████████████████▎                          | 385/760 [43:45<1:02:36, 10.02s/it]

Processing document: 28-0-0.txt
1510


 51%|███████████████████████████▍                          | 386/760 [43:57<1:05:02, 10.44s/it]

Processing document: 28-1-0.txt
1434


 51%|███████████████████████████▍                          | 387/760 [44:12<1:13:43, 11.86s/it]

Processing document: 28-2-0.txt
1554


 51%|███████████████████████████▌                          | 388/760 [44:24<1:13:36, 11.87s/it]

Processing document: 28-3-0.txt
1482


 51%|███████████████████████████▋                          | 389/760 [44:44<1:29:29, 14.47s/it]

Processing document: 28-4-0.txt
1275


 51%|███████████████████████████▋                          | 390/760 [44:52<1:17:14, 12.52s/it]

Processing document: 28-5-0.txt
1176


 51%|███████████████████████████▊                          | 391/760 [44:59<1:07:07, 10.91s/it]

Processing document: 29-0-0.txt
1615


 52%|███████████████████████████▊                          | 392/760 [45:19<1:22:16, 13.41s/it]

Processing document: 3-0-0.txt
1549


 52%|███████████████████████████▉                          | 393/760 [45:26<1:10:36, 11.54s/it]

Processing document: 3-1-0.txt
1540


 52%|███████████████████████████▉                          | 394/760 [45:42<1:19:29, 13.03s/it]

Processing document: 3-2-0.txt
905


 52%|████████████████████████████                          | 395/760 [46:04<1:34:44, 15.57s/it]

Processing document: 30-0-0.txt
1677


 52%|████████████████████████████▏                         | 396/760 [46:15<1:26:09, 14.20s/it]

Processing document: 30-1-0.txt
1131


 52%|████████████████████████████▏                         | 397/760 [46:33<1:33:09, 15.40s/it]

Processing document: 30-10-0.txt
1559


 52%|████████████████████████████▎                         | 398/760 [46:49<1:33:30, 15.50s/it]

Processing document: 30-11-0.txt
1393


 52%|████████████████████████████▎                         | 399/760 [47:11<1:45:41, 17.57s/it]

Processing document: 30-12-0.txt
1723


 53%|████████████████████████████▍                         | 400/760 [47:34<1:55:03, 19.18s/it]

Processing document: 30-12-1.txt
631


 53%|████████████████████████████▍                         | 401/760 [47:43<1:35:37, 15.98s/it]

Processing document: 30-13-0.txt
1642


 53%|████████████████████████████▌                         | 402/760 [47:54<1:27:34, 14.68s/it]

Processing document: 30-14-0.txt
1494


 53%|████████████████████████████▋                         | 403/760 [48:09<1:26:41, 14.57s/it]

Processing document: 30-15-0.txt
1675


 53%|████████████████████████████▋                         | 404/760 [48:23<1:26:38, 14.60s/it]

Processing document: 30-15-1.txt
1330


 53%|████████████████████████████▊                         | 405/760 [48:34<1:19:24, 13.42s/it]

Processing document: 30-16-0.txt
1600


 53%|████████████████████████████▊                         | 406/760 [48:43<1:11:14, 12.08s/it]

Processing document: 30-17-0.txt
1845


 54%|████████████████████████████▉                         | 407/760 [49:04<1:27:41, 14.90s/it]

Processing document: 30-18-0.txt
1716


 54%|████████████████████████████▉                         | 408/760 [49:27<1:41:32, 17.31s/it]

Processing document: 30-2-0.txt
1848


 54%|█████████████████████████████                         | 409/760 [49:50<1:50:55, 18.96s/it]

Processing document: 30-3-0.txt
1484


 54%|█████████████████████████████▏                        | 410/760 [50:05<1:42:37, 17.59s/it]

Processing document: 30-4-0.txt
1393


 54%|█████████████████████████████▏                        | 411/760 [50:27<1:50:41, 19.03s/it]

Processing document: 30-5-0.txt
1612


 54%|█████████████████████████████▎                        | 412/760 [50:50<1:56:49, 20.14s/it]

Processing document: 30-6-0.txt
922


 54%|█████████████████████████████▎                        | 413/760 [51:01<1:41:25, 17.54s/it]

Processing document: 30-7-0.txt
2359


 54%|█████████████████████████████▍                        | 414/760 [51:25<1:52:23, 19.49s/it]

Processing document: 30-7-1.txt
663


 55%|█████████████████████████████▍                        | 415/760 [51:37<1:38:44, 17.17s/it]

Processing document: 30-8-0.txt
1958


 55%|█████████████████████████████▌                        | 416/760 [52:00<1:49:06, 19.03s/it]

Processing document: 30-9-0.txt
1461


 55%|█████████████████████████████▋                        | 417/760 [52:11<1:33:55, 16.43s/it]

Processing document: 31-0-0.txt
1652


 55%|█████████████████████████████▋                        | 418/760 [52:32<1:41:32, 17.82s/it]

Processing document: 31-1-0.txt
1180


 55%|█████████████████████████████▊                        | 419/760 [52:54<1:48:21, 19.07s/it]

Processing document: 31-2-0.txt
1798


 55%|█████████████████████████████▊                        | 420/760 [53:08<1:39:21, 17.53s/it]

Processing document: 31-3-0.txt
1698


 55%|█████████████████████████████▉                        | 421/760 [53:31<1:48:09, 19.14s/it]

Processing document: 31-4-0.txt
1246


 56%|█████████████████████████████▉                        | 422/760 [53:45<1:40:23, 17.82s/it]

Processing document: 32-0-0.txt
1649


 56%|██████████████████████████████                        | 423/760 [54:04<1:42:09, 18.19s/it]

Processing document: 32-1-0.txt
1180


 56%|██████████████████████████████▏                       | 424/760 [54:19<1:35:12, 17.00s/it]

Processing document: 32-2-0.txt
1798


 56%|██████████████████████████████▏                       | 425/760 [54:39<1:41:28, 18.17s/it]

Processing document: 32-3-0.txt
1698


 56%|██████████████████████████████▎                       | 426/760 [55:02<1:48:36, 19.51s/it]

Processing document: 32-4-0.txt
1246


 56%|██████████████████████████████▎                       | 427/760 [55:16<1:38:38, 17.77s/it]

Processing document: 33-0-0.txt
1578


 56%|██████████████████████████████▍                       | 428/760 [55:28<1:28:28, 15.99s/it]

Processing document: 33-1-0.txt
1528


 56%|██████████████████████████████▍                       | 429/760 [55:40<1:21:58, 14.86s/it]

Processing document: 33-10-0.txt
1537


 57%|██████████████████████████████▌                       | 430/760 [55:55<1:22:33, 15.01s/it]

Processing document: 33-11-0.txt
1581


 57%|██████████████████████████████▌                       | 431/760 [56:12<1:24:50, 15.47s/it]

Processing document: 33-12-0.txt
1530


 57%|██████████████████████████████▋                       | 432/760 [56:34<1:35:39, 17.50s/it]

Processing document: 33-13-0.txt
1530


 57%|██████████████████████████████▊                       | 433/760 [56:57<1:43:40, 19.02s/it]

Processing document: 33-14-0.txt
1548


 57%|██████████████████████████████▊                       | 434/760 [57:12<1:37:12, 17.89s/it]

Processing document: 33-15-0.txt
1616


 57%|██████████████████████████████▉                       | 435/760 [57:26<1:30:08, 16.64s/it]

Processing document: 33-16-0.txt
1527


 57%|██████████████████████████████▉                       | 436/760 [57:40<1:26:51, 16.09s/it]

Processing document: 33-17-0.txt
1599


 57%|███████████████████████████████                       | 437/760 [58:02<1:36:01, 17.84s/it]

Processing document: 33-18-0.txt
1532


 58%|███████████████████████████████                       | 438/760 [58:25<1:43:26, 19.27s/it]

Processing document: 33-19-0.txt
1525


 58%|███████████████████████████████▏                      | 439/760 [58:48<1:48:28, 20.28s/it]

Processing document: 33-2-0.txt
1516


 58%|███████████████████████████████▎                      | 440/760 [59:09<1:50:02, 20.63s/it]

Processing document: 33-20-0.txt
1580


 58%|███████████████████████████████▎                      | 441/760 [59:23<1:38:59, 18.62s/it]

Processing document: 33-21-0.txt
1536


 58%|███████████████████████████████▍                      | 442/760 [59:38<1:33:34, 17.66s/it]

Processing document: 33-22-0.txt
1593


 58%|███████████████████████████████▍                      | 443/760 [59:57<1:35:05, 18.00s/it]

Processing document: 33-23-0.txt
1592


 58%|██████████████████████████████▍                     | 444/760 [1:00:16<1:36:52, 18.39s/it]

Processing document: 33-24-0.txt
1579


 59%|██████████████████████████████▍                     | 445/760 [1:00:31<1:30:25, 17.22s/it]

Processing document: 33-25-0.txt
1509


 59%|██████████████████████████████▌                     | 446/760 [1:00:54<1:38:33, 18.83s/it]

Processing document: 33-26-0.txt
1633


 59%|██████████████████████████████▌                     | 447/760 [1:01:13<1:39:44, 19.12s/it]

Processing document: 33-27-0.txt
1630


 59%|██████████████████████████████▋                     | 448/760 [1:01:26<1:28:46, 17.07s/it]

Processing document: 33-28-0.txt
1630


 59%|██████████████████████████████▋                     | 449/760 [1:01:48<1:36:59, 18.71s/it]

Processing document: 33-29-0.txt
1642


 59%|██████████████████████████████▊                     | 450/760 [1:02:09<1:40:16, 19.41s/it]

Processing document: 33-3-0.txt
1546


 59%|██████████████████████████████▊                     | 451/760 [1:02:24<1:32:30, 17.96s/it]

Processing document: 33-30-0.txt
1553


 59%|██████████████████████████████▉                     | 452/760 [1:02:37<1:25:24, 16.64s/it]

Processing document: 33-31-0.txt
1477


 60%|██████████████████████████████▉                     | 453/760 [1:02:50<1:18:21, 15.31s/it]

Processing document: 33-32-0.txt
1467


 60%|███████████████████████████████                     | 454/760 [1:03:03<1:15:17, 14.76s/it]

Processing document: 33-33-0.txt
1402


 60%|███████████████████████████████▏                    | 455/760 [1:03:14<1:09:46, 13.73s/it]

Processing document: 33-34-0.txt
1491


 60%|███████████████████████████████▏                    | 456/760 [1:03:32<1:15:28, 14.90s/it]

Processing document: 33-35-0.txt
1515


 60%|███████████████████████████████▎                    | 457/760 [1:03:55<1:26:53, 17.21s/it]

Processing document: 33-36-0.txt
1473


 60%|███████████████████████████████▎                    | 458/760 [1:04:12<1:26:29, 17.18s/it]

Processing document: 33-37-0.txt
1500


 60%|███████████████████████████████▍                    | 459/760 [1:04:22<1:15:50, 15.12s/it]

Processing document: 33-38-0.txt
1454


 61%|███████████████████████████████▍                    | 460/760 [1:04:32<1:08:30, 13.70s/it]

Processing document: 33-39-0.txt
1387


 61%|███████████████████████████████▌                    | 461/760 [1:04:41<1:01:24, 12.32s/it]

Processing document: 33-4-0.txt
1504


 61%|███████████████████████████████▌                    | 462/760 [1:04:59<1:09:15, 13.94s/it]

Processing document: 33-40-0.txt
1396


 61%|███████████████████████████████▋                    | 463/760 [1:05:09<1:02:25, 12.61s/it]

Processing document: 33-41-0.txt
1484


 61%|████████████████████████████████▉                     | 464/760 [1:05:18<56:43, 11.50s/it]

Processing document: 33-42-0.txt
1458


 61%|█████████████████████████████████                     | 465/760 [1:05:28<54:51, 11.16s/it]

Processing document: 33-43-0.txt
1432


 61%|█████████████████████████████████                     | 466/760 [1:05:37<51:34, 10.52s/it]

Processing document: 33-44-0.txt
1454


 61%|█████████████████████████████████▏                    | 467/760 [1:05:52<57:59, 11.87s/it]

Processing document: 33-45-0.txt
1365


 62%|████████████████████████████████                    | 468/760 [1:06:08<1:04:06, 13.17s/it]

Processing document: 33-46-0.txt
1464


 62%|████████████████████████████████                    | 469/760 [1:06:23<1:06:50, 13.78s/it]

Processing document: 33-47-0.txt
1466


 62%|████████████████████████████████▏                   | 470/760 [1:06:33<1:00:24, 12.50s/it]

Processing document: 33-48-0.txt
1444


 62%|█████████████████████████████████▍                    | 471/760 [1:06:42<55:28, 11.52s/it]

Processing document: 33-49-0.txt
1441


 62%|█████████████████████████████████▌                    | 472/760 [1:06:49<49:12, 10.25s/it]

Processing document: 33-5-0.txt
1578


 62%|████████████████████████████████▎                   | 473/760 [1:07:11<1:04:56, 13.57s/it]

Processing document: 33-50-0.txt
1484


 62%|████████████████████████████████▍                   | 474/760 [1:07:25<1:05:01, 13.64s/it]

Processing document: 33-6-0.txt
1553


 62%|████████████████████████████████▌                   | 475/760 [1:07:37<1:02:26, 13.15s/it]

Processing document: 33-7-0.txt
1495


 63%|████████████████████████████████▌                   | 476/760 [1:07:51<1:04:08, 13.55s/it]

Processing document: 33-8-0.txt
1581


 63%|████████████████████████████████▋                   | 477/760 [1:08:12<1:13:45, 15.64s/it]

Processing document: 33-9-0.txt
1580


 63%|████████████████████████████████▋                   | 478/760 [1:08:32<1:20:46, 17.19s/it]

Processing document: 34-0-0.txt
1655


 63%|████████████████████████████████▊                   | 479/760 [1:08:49<1:18:58, 16.86s/it]

Processing document: 35-0-0.txt
1565


 63%|████████████████████████████████▊                   | 480/760 [1:09:02<1:14:33, 15.98s/it]

Processing document: 35-1-0.txt
924


 63%|████████████████████████████████▉                   | 481/760 [1:09:13<1:07:21, 14.49s/it]

Processing document: 36-0-0.txt
1549


 63%|████████████████████████████████▉                   | 482/760 [1:09:31<1:10:50, 15.29s/it]

Processing document: 36-1-0.txt
1367


 64%|█████████████████████████████████                   | 483/760 [1:09:47<1:12:38, 15.74s/it]

Processing document: 36-2-0.txt
1073


 64%|█████████████████████████████████                   | 484/760 [1:10:01<1:09:40, 15.15s/it]

Processing document: 37-0-0.txt
1628


 64%|██████████████████████████████████▍                   | 485/760 [1:10:09<59:55, 13.08s/it]

Processing document: 37-1-0.txt
1554


 64%|█████████████████████████████████▎                  | 486/760 [1:10:30<1:10:00, 15.33s/it]

Processing document: 37-2-0.txt
1548


 64%|█████████████████████████████████▎                  | 487/760 [1:10:53<1:19:44, 17.53s/it]

Processing document: 37-3-0.txt
1717


 64%|█████████████████████████████████▍                  | 488/760 [1:11:16<1:26:47, 19.15s/it]

Processing document: 37-4-0.txt
1642


 64%|█████████████████████████████████▍                  | 489/760 [1:11:30<1:20:44, 17.88s/it]

Processing document: 37-5-0.txt
1879


 64%|█████████████████████████████████▌                  | 490/760 [1:11:51<1:24:08, 18.70s/it]

Processing document: 37-6-0.txt
1660


 65%|█████████████████████████████████▌                  | 491/760 [1:12:14<1:28:59, 19.85s/it]

Processing document: 37-7-0.txt
1144


 65%|█████████████████████████████████▋                  | 492/760 [1:12:23<1:15:13, 16.84s/it]

Processing document: 38-0-0.txt
407


 65%|█████████████████████████████████▋                  | 493/760 [1:12:29<1:00:13, 13.54s/it]

Processing document: 38-1-0.txt
2255


 65%|███████████████████████████████████                   | 494/760 [1:12:40<56:39, 12.78s/it]

Processing document: 38-1-1.txt
2340


 65%|███████████████████████████████████▏                  | 495/760 [1:12:53<56:01, 12.69s/it]

Processing document: 38-1-10.txt
2371


 65%|█████████████████████████████████▉                  | 496/760 [1:13:10<1:02:17, 14.16s/it]

Processing document: 38-1-11.txt
2303


 65%|██████████████████████████████████                  | 497/760 [1:13:25<1:02:18, 14.21s/it]

Processing document: 38-1-12.txt
2237


 66%|██████████████████████████████████                  | 498/760 [1:13:40<1:03:42, 14.59s/it]

Processing document: 38-1-2.txt
2303


 66%|██████████████████████████████████▏                 | 499/760 [1:13:56<1:05:31, 15.06s/it]

Processing document: 38-1-3.txt
2367


 66%|██████████████████████████████████▏                 | 500/760 [1:14:11<1:05:08, 15.03s/it]

Processing document: 38-1-4.txt
2294


 66%|███████████████████████████████████▌                  | 501/760 [1:14:22<59:08, 13.70s/it]

Processing document: 38-1-5.txt
2354


 66%|██████████████████████████████████▎                 | 502/760 [1:14:38<1:01:57, 14.41s/it]

Processing document: 38-1-6.txt
2337


 66%|██████████████████████████████████▍                 | 503/760 [1:14:53<1:02:23, 14.57s/it]

Processing document: 38-1-7.txt
2290


 66%|███████████████████████████████████▊                  | 504/760 [1:15:06<59:51, 14.03s/it]

Processing document: 38-1-8.txt
2405


 66%|███████████████████████████████████▉                  | 505/760 [1:15:19<58:28, 13.76s/it]

Processing document: 38-1-9.txt
2344


 67%|██████████████████████████████████▌                 | 506/760 [1:15:35<1:01:01, 14.42s/it]

Processing document: 39-0-0.txt
1167


 67%|████████████████████████████████████                  | 507/760 [1:15:47<58:26, 13.86s/it]

Processing document: 4-0-0.txt
1555


 67%|████████████████████████████████████                  | 508/760 [1:15:59<55:08, 13.13s/it]

Processing document: 4-1-0.txt
1054


 67%|████████████████████████████████████▏                 | 509/760 [1:16:09<51:37, 12.34s/it]

Processing document: 40-0-0.txt
1564


 67%|██████████████████████████████████▉                 | 510/760 [1:16:32<1:04:20, 15.44s/it]

Processing document: 40-1-0.txt
721


 67%|████████████████████████████████████▎                 | 511/760 [1:16:41<56:04, 13.51s/it]

Processing document: 41-0-0.txt
1549


 67%|████████████████████████████████████▍                 | 512/760 [1:16:53<54:06, 13.09s/it]

Processing document: 42-0-0.txt
774


 68%|████████████████████████████████████▍                 | 513/760 [1:17:01<47:36, 11.57s/it]

Processing document: 43-0-0.txt
998


 68%|████████████████████████████████████▌                 | 514/760 [1:17:13<47:48, 11.66s/it]

Processing document: 44-0-0.txt
1547


 68%|████████████████████████████████████▌                 | 515/760 [1:17:27<50:53, 12.46s/it]

Processing document: 45-0-0.txt
1019


 68%|████████████████████████████████████▋                 | 516/760 [1:17:35<45:19, 11.15s/it]

Processing document: 46-0-0.txt
1607


 68%|████████████████████████████████████▋                 | 517/760 [1:17:46<44:36, 11.02s/it]

Processing document: 47-0-0.txt
1678


 68%|████████████████████████████████████▊                 | 518/760 [1:18:01<49:13, 12.21s/it]

Processing document: 47-1-0.txt
589


 68%|████████████████████████████████████▉                 | 519/760 [1:18:08<43:06, 10.73s/it]

Processing document: 48-0-0.txt
1486


 68%|████████████████████████████████████▉                 | 520/760 [1:18:19<42:32, 10.64s/it]

Processing document: 48-1-0.txt
1433


 69%|█████████████████████████████████████                 | 521/760 [1:18:31<44:17, 11.12s/it]

Processing document: 48-2-0.txt
1483


 69%|█████████████████████████████████████                 | 522/760 [1:18:53<57:39, 14.54s/it]

Processing document: 48-3-0.txt
1489


 69%|█████████████████████████████████████▏                | 523/760 [1:19:04<52:41, 13.34s/it]

Processing document: 48-4-0.txt
1454


 69%|███████████████████████████████████▊                | 524/760 [1:19:24<1:00:25, 15.36s/it]

Processing document: 48-5-0.txt
1407


 69%|███████████████████████████████████▉                | 525/760 [1:19:40<1:00:31, 15.45s/it]

Processing document: 48-6-0.txt
1348


 69%|█████████████████████████████████████▎                | 526/760 [1:19:52<56:04, 14.38s/it]

Processing document: 48-7-0.txt
1170


 69%|█████████████████████████████████████▍                | 527/760 [1:20:09<59:29, 15.32s/it]

Processing document: 49-0-0.txt
1593


 69%|█████████████████████████████████████▌                | 528/760 [1:20:20<54:38, 14.13s/it]

Processing document: 49-1-0.txt
1683


 70%|█████████████████████████████████████▌                | 529/760 [1:20:32<51:27, 13.37s/it]

Processing document: 49-2-0.txt
616


 70%|█████████████████████████████████████▋                | 530/760 [1:20:39<43:50, 11.44s/it]

Processing document: 5-0-0.txt
1488


 70%|█████████████████████████████████████▋                | 531/760 [1:20:56<49:32, 12.98s/it]

Processing document: 5-1-0.txt
1546


 70%|█████████████████████████████████████▊                | 532/760 [1:21:05<45:31, 11.98s/it]

Processing document: 5-2-0.txt
1442


 70%|█████████████████████████████████████▊                | 533/760 [1:21:15<42:56, 11.35s/it]

Processing document: 50-0-0.txt
930


 70%|█████████████████████████████████████▉                | 534/760 [1:21:23<38:39, 10.26s/it]

Processing document: 50-1-0.txt
1585


 70%|██████████████████████████████████████                | 535/760 [1:21:39<45:21, 12.10s/it]

Processing document: 51-0-0.txt
935


 71%|██████████████████████████████████████                | 536/760 [1:21:50<43:23, 11.62s/it]

Processing document: 51-1-0.txt
1621


 71%|██████████████████████████████████████▏               | 537/760 [1:22:02<43:59, 11.84s/it]

Processing document: 51-2-0.txt
1162


 71%|██████████████████████████████████████▏               | 538/760 [1:22:16<45:35, 12.32s/it]

Processing document: 52-0-0.txt
1669


 71%|██████████████████████████████████████▎               | 539/760 [1:22:27<44:35, 12.11s/it]

Processing document: 52-1-0.txt
1115


 71%|██████████████████████████████████████▎               | 540/760 [1:22:40<44:46, 12.21s/it]

Processing document: 53-0-0.txt
1672


 71%|██████████████████████████████████████▍               | 541/760 [1:22:51<44:12, 12.11s/it]

Processing document: 53-1-0.txt
811


 71%|██████████████████████████████████████▌               | 542/760 [1:23:09<49:42, 13.68s/it]

Processing document: 54-0-0.txt
1654


 71%|██████████████████████████████████████▌               | 543/760 [1:23:18<44:20, 12.26s/it]

Processing document: 54-1-0.txt
693


 72%|██████████████████████████████████████▋               | 544/760 [1:23:28<41:52, 11.63s/it]

Processing document: 54-2-0.txt
2032


 72%|██████████████████████████████████████▋               | 545/760 [1:23:45<47:54, 13.37s/it]

Processing document: 54-2-1.txt
2155


 72%|██████████████████████████████████████▊               | 546/760 [1:23:55<44:04, 12.36s/it]

Processing document: 54-2-2.txt
522


 72%|██████████████████████████████████████▊               | 547/760 [1:24:01<37:06, 10.46s/it]

Processing document: 55-0-0.txt
1667


 72%|██████████████████████████████████████▉               | 548/760 [1:24:13<38:04, 10.77s/it]

Processing document: 55-1-0.txt
1114


 72%|███████████████████████████████████████               | 549/760 [1:24:27<41:51, 11.90s/it]

Processing document: 56-0-0.txt
1663


 72%|███████████████████████████████████████               | 550/760 [1:24:36<38:25, 10.98s/it]

Processing document: 56-1-0.txt
1160


 72%|███████████████████████████████████████▏              | 551/760 [1:24:54<45:21, 13.02s/it]

Processing document: 57-0-0.txt
1668


 73%|███████████████████████████████████████▏              | 552/760 [1:25:06<43:51, 12.65s/it]

Processing document: 57-1-0.txt
1070


 73%|███████████████████████████████████████▎              | 553/760 [1:25:14<39:00, 11.31s/it]

Processing document: 58-0-0.txt
1671


 73%|███████████████████████████████████████▎              | 554/760 [1:25:35<48:30, 14.13s/it]

Processing document: 58-1-0.txt
965


 73%|███████████████████████████████████████▍              | 555/760 [1:25:46<45:25, 13.30s/it]

Processing document: 59-0-0.txt
1678


 73%|███████████████████████████████████████▌              | 556/760 [1:26:05<50:42, 14.91s/it]

Processing document: 59-1-0.txt
1112


 73%|███████████████████████████████████████▌              | 557/760 [1:26:24<54:31, 16.12s/it]

Processing document: 6-0-0.txt
1428


 73%|███████████████████████████████████████▋              | 558/760 [1:26:31<45:38, 13.56s/it]

Processing document: 6-1-0.txt
1772


 74%|███████████████████████████████████████▋              | 559/760 [1:26:43<43:16, 12.92s/it]

Processing document: 60-0-0.txt
1668


 74%|███████████████████████████████████████▊              | 560/760 [1:26:55<42:32, 12.76s/it]

Processing document: 60-1-0.txt
923


 74%|███████████████████████████████████████▊              | 561/760 [1:27:02<36:56, 11.14s/it]

Processing document: 61-0-0.txt
1649


 74%|███████████████████████████████████████▉              | 562/760 [1:27:14<37:36, 11.39s/it]

Processing document: 61-1-0.txt
749


 74%|████████████████████████████████████████              | 563/760 [1:27:23<35:05, 10.69s/it]

Processing document: 62-0-0.txt
1671


 74%|████████████████████████████████████████              | 564/760 [1:27:34<34:42, 10.63s/it]

Processing document: 62-1-0.txt
1123


 74%|████████████████████████████████████████▏             | 565/760 [1:27:45<35:00, 10.77s/it]

Processing document: 63-0-0.txt
1675


 74%|████████████████████████████████████████▏             | 566/760 [1:27:56<35:18, 10.92s/it]

Processing document: 63-1-0.txt
1330


 75%|████████████████████████████████████████▎             | 567/760 [1:28:13<40:41, 12.65s/it]

Processing document: 64-0-0.txt
1676


 75%|████████████████████████████████████████▎             | 568/760 [1:28:29<43:26, 13.57s/it]

Processing document: 64-1-0.txt
1025


 75%|████████████████████████████████████████▍             | 569/760 [1:28:37<38:31, 12.10s/it]

Processing document: 65-0-0.txt
937


 75%|████████████████████████████████████████▌             | 570/760 [1:28:46<34:46, 10.98s/it]

Processing document: 65-1-0.txt
1503


 75%|████████████████████████████████████████▌             | 571/760 [1:29:03<40:57, 13.00s/it]

Processing document: 66-0-0.txt
1687


 75%|████████████████████████████████████████▋             | 572/760 [1:29:15<39:34, 12.63s/it]

Processing document: 66-1-0.txt
1626


 75%|████████████████████████████████████████▋             | 573/760 [1:29:26<37:56, 12.18s/it]

Processing document: 66-2-0.txt
1408


 76%|████████████████████████████████████████▊             | 574/760 [1:29:34<33:23, 10.77s/it]

Processing document: 67-0-0.txt
942


 76%|████████████████████████████████████████▊             | 575/760 [1:29:45<33:31, 10.87s/it]

Processing document: 67-1-0.txt
1700


 76%|████████████████████████████████████████▉             | 576/760 [1:29:59<36:42, 11.97s/it]

Processing document: 67-2-0.txt
1605


 76%|████████████████████████████████████████▉             | 577/760 [1:30:11<36:00, 11.81s/it]

Processing document: 67-3-0.txt
1204


 76%|█████████████████████████████████████████             | 578/760 [1:30:18<31:35, 10.42s/it]

Processing document: 68-0-0.txt
929


 76%|█████████████████████████████████████████▏            | 579/760 [1:30:25<28:23,  9.41s/it]

Processing document: 68-1-0.txt
1529


 76%|█████████████████████████████████████████▏            | 580/760 [1:30:37<30:13, 10.07s/it]

Processing document: 68-2-0.txt
1629


 76%|█████████████████████████████████████████▎            | 581/760 [1:30:49<31:39, 10.61s/it]

Processing document: 68-3-0.txt
665


 77%|█████████████████████████████████████████▎            | 582/760 [1:30:56<28:40,  9.67s/it]

Processing document: 69-0-0.txt
1691


 77%|█████████████████████████████████████████▍            | 583/760 [1:31:06<28:26,  9.64s/it]

Processing document: 69-1-0.txt
1612


 77%|█████████████████████████████████████████▍            | 584/760 [1:31:19<31:33, 10.76s/it]

Processing document: 69-2-0.txt
1299


 77%|█████████████████████████████████████████▌            | 585/760 [1:31:28<29:35, 10.15s/it]

Processing document: 7-0-0.txt
1511


 77%|█████████████████████████████████████████▋            | 586/760 [1:31:41<31:54, 11.00s/it]

Processing document: 7-1-0.txt
1651


 77%|█████████████████████████████████████████▋            | 587/760 [1:32:04<41:56, 14.55s/it]

Processing document: 7-2-0.txt
1611


 77%|█████████████████████████████████████████▊            | 588/760 [1:32:26<48:44, 17.00s/it]

Processing document: 7-3-0.txt
1577


 78%|█████████████████████████████████████████▊            | 589/760 [1:32:49<53:19, 18.71s/it]

Processing document: 7-4-0.txt
1475


 78%|█████████████████████████████████████████▉            | 590/760 [1:33:02<47:43, 16.84s/it]

Processing document: 7-5-0.txt
1674


 78%|█████████████████████████████████████████▉            | 591/760 [1:33:10<40:02, 14.22s/it]

Processing document: 7-6-0.txt
1624


 78%|██████████████████████████████████████████            | 592/760 [1:33:32<46:30, 16.61s/it]

Processing document: 7-7-0.txt
1188


 78%|██████████████████████████████████████████▏           | 593/760 [1:33:41<40:08, 14.42s/it]

Processing document: 7-8-0.txt
1161


 78%|██████████████████████████████████████████▏           | 594/760 [1:33:49<34:23, 12.43s/it]

Processing document: 70-0-0.txt
934


 78%|██████████████████████████████████████████▎           | 595/760 [1:33:57<30:22, 11.05s/it]

Processing document: 70-1-0.txt
1667


 78%|██████████████████████████████████████████▎           | 596/760 [1:34:11<32:27, 11.87s/it]

Processing document: 70-2-0.txt
1386


 79%|██████████████████████████████████████████▍           | 597/760 [1:34:23<32:56, 12.12s/it]

Processing document: 71-0-0.txt
800


 79%|██████████████████████████████████████████▍           | 598/760 [1:34:32<30:24, 11.26s/it]

Processing document: 72-0-0.txt
835


 79%|██████████████████████████████████████████▌           | 599/760 [1:34:41<28:21, 10.57s/it]

Processing document: 73-0-0.txt
475


 79%|██████████████████████████████████████████▋           | 600/760 [1:34:46<23:25,  8.79s/it]

Processing document: 74-0-0.txt
694


 79%|██████████████████████████████████████████▋           | 601/760 [1:34:56<24:18,  9.17s/it]

Processing document: 75-0-0.txt
457


 79%|██████████████████████████████████████████▊           | 602/760 [1:35:03<22:00,  8.36s/it]

Processing document: 76-0-0.txt
696


 79%|██████████████████████████████████████████▊           | 603/760 [1:35:14<24:01,  9.18s/it]

Processing document: 77-0-0.txt
564


 79%|██████████████████████████████████████████▉           | 604/760 [1:35:26<26:19, 10.12s/it]

Processing document: 78-0-0.txt
435


 80%|██████████████████████████████████████████▉           | 605/760 [1:35:33<23:26,  9.08s/it]

Processing document: 79-0-0.txt
664


 80%|███████████████████████████████████████████           | 606/760 [1:35:43<24:25,  9.52s/it]

Processing document: 8-0-0.txt
1831


 80%|███████████████████████████████████████████▏          | 607/760 [1:35:51<23:20,  9.15s/it]

Processing document: 8-1-0.txt
1549


 80%|███████████████████████████████████████████▏          | 608/760 [1:36:05<26:10, 10.33s/it]

Processing document: 8-2-0.txt
1548


 80%|███████████████████████████████████████████▎          | 609/760 [1:36:27<35:18, 14.03s/it]

Processing document: 8-3-0.txt
1504


 80%|███████████████████████████████████████████▎          | 610/760 [1:36:38<32:41, 13.08s/it]

Processing document: 8-4-0.txt
1551


 80%|███████████████████████████████████████████▍          | 611/760 [1:36:54<34:23, 13.85s/it]

Processing document: 8-5-0.txt
1810


 81%|███████████████████████████████████████████▍          | 612/760 [1:37:03<31:03, 12.59s/it]

Processing document: 8-6-0.txt
1732


 81%|███████████████████████████████████████████▌          | 613/760 [1:37:18<32:01, 13.07s/it]

Processing document: 8-7-0.txt
994


 81%|███████████████████████████████████████████▋          | 614/760 [1:37:28<29:58, 12.32s/it]

Processing document: 80-0-0.txt
807


 81%|███████████████████████████████████████████▋          | 615/760 [1:37:37<26:57, 11.15s/it]

Processing document: 81-0-0.txt
831


 81%|███████████████████████████████████████████▊          | 616/760 [1:37:46<25:44, 10.73s/it]

Processing document: 82-0-0.txt
1746


 81%|███████████████████████████████████████████▊          | 617/760 [1:37:58<26:29, 11.11s/it]

Processing document: 82-0-1.txt
363


 81%|███████████████████████████████████████████▉          | 618/760 [1:38:04<22:16,  9.41s/it]

Processing document: 82-1-0.txt
1693


 81%|███████████████████████████████████████████▉          | 619/760 [1:38:22<28:18, 12.04s/it]

Processing document: 82-2-0.txt
1687


 82%|████████████████████████████████████████████          | 620/760 [1:38:33<27:39, 11.85s/it]

Processing document: 82-3-0.txt
1772


 82%|████████████████████████████████████████████          | 621/760 [1:38:56<35:01, 15.12s/it]

Processing document: 82-4-0.txt
1780


 82%|████████████████████████████████████████████▏         | 622/760 [1:39:19<40:14, 17.50s/it]

Processing document: 82-5-0.txt
1789


 82%|████████████████████████████████████████████▎         | 623/760 [1:39:42<43:30, 19.05s/it]

Processing document: 82-6-0.txt
1632


 82%|████████████████████████████████████████████▎         | 624/760 [1:40:03<44:28, 19.62s/it]

Processing document: 82-7-0.txt
1151


 82%|████████████████████████████████████████████▍         | 625/760 [1:40:25<45:43, 20.32s/it]

Processing document: 83-0-0.txt
1501


 82%|████████████████████████████████████████████▍         | 626/760 [1:40:37<40:05, 17.95s/it]

Processing document: 83-1-0.txt
1171


 82%|████████████████████████████████████████████▌         | 627/760 [1:40:45<33:20, 15.04s/it]

Processing document: 84-0-0.txt
1637


 83%|████████████████████████████████████████████▌         | 628/760 [1:40:58<31:35, 14.36s/it]

Processing document: 84-1-0.txt
1513


 83%|████████████████████████████████████████████▋         | 629/760 [1:41:10<29:48, 13.65s/it]

Processing document: 84-2-0.txt
1533


 83%|████████████████████████████████████████████▊         | 630/760 [1:41:21<27:47, 12.82s/it]

Processing document: 84-3-0.txt
834


 83%|████████████████████████████████████████████▊         | 631/760 [1:41:34<27:31, 12.80s/it]

Processing document: 85-0-0.txt
1601


 83%|████████████████████████████████████████████▉         | 632/760 [1:41:46<26:49, 12.57s/it]

Processing document: 85-1-0.txt
1548


 83%|████████████████████████████████████████████▉         | 633/760 [1:41:58<26:02, 12.30s/it]

Processing document: 85-2-0.txt
1578


 83%|█████████████████████████████████████████████         | 634/760 [1:42:17<30:21, 14.46s/it]

Processing document: 85-3-0.txt
1554


 84%|█████████████████████████████████████████████         | 635/760 [1:42:28<28:03, 13.47s/it]

Processing document: 85-4-0.txt
984


 84%|█████████████████████████████████████████████▏        | 636/760 [1:42:37<24:40, 11.94s/it]

Processing document: 86-0-0.txt
1600


 84%|█████████████████████████████████████████████▎        | 637/760 [1:42:56<29:17, 14.29s/it]

Processing document: 86-1-0.txt
1547


 84%|█████████████████████████████████████████████▎        | 638/760 [1:43:13<30:48, 15.15s/it]

Processing document: 86-2-0.txt
1345


 84%|█████████████████████████████████████████████▍        | 639/760 [1:43:26<28:42, 14.23s/it]

Processing document: 87-0-0.txt
471


 84%|█████████████████████████████████████████████▍        | 640/760 [1:43:32<23:43, 11.86s/it]

Processing document: 87-1-0.txt
1816


 84%|█████████████████████████████████████████████▌        | 641/760 [1:43:49<26:54, 13.57s/it]

Processing document: 87-1-1.txt
1873


 84%|█████████████████████████████████████████████▌        | 642/760 [1:44:13<32:21, 16.45s/it]

Processing document: 87-1-2.txt
1854


 85%|█████████████████████████████████████████████▋        | 643/760 [1:44:36<36:00, 18.46s/it]

Processing document: 87-1-3.txt
477


 85%|█████████████████████████████████████████████▊        | 644/760 [1:44:43<29:05, 15.05s/it]

Processing document: 87-2-0.txt
717


 85%|█████████████████████████████████████████████▊        | 645/760 [1:44:54<26:20, 13.74s/it]

Processing document: 88-0-0.txt
475


 85%|█████████████████████████████████████████████▉        | 646/760 [1:45:00<21:50, 11.49s/it]

Processing document: 88-1-0.txt
2009


 85%|█████████████████████████████████████████████▉        | 647/760 [1:45:23<28:23, 15.08s/it]

Processing document: 88-1-1.txt
2063


 85%|██████████████████████████████████████████████        | 648/760 [1:45:47<32:53, 17.62s/it]

Processing document: 88-1-2.txt
2045


 85%|██████████████████████████████████████████████        | 649/760 [1:46:09<35:05, 18.97s/it]

Processing document: 88-1-3.txt
1953


 86%|██████████████████████████████████████████████▏       | 650/760 [1:46:20<30:11, 16.47s/it]

Processing document: 88-2-0.txt
2079


 86%|██████████████████████████████████████████████▎       | 651/760 [1:46:43<33:48, 18.61s/it]

Processing document: 88-2-1.txt
387


 86%|██████████████████████████████████████████████▎       | 652/760 [1:46:48<26:19, 14.63s/it]

Processing document: 88-3-0.txt
2072


 86%|██████████████████████████████████████████████▍       | 653/760 [1:47:12<30:46, 17.26s/it]

Processing document: 88-3-1.txt
2043


 86%|██████████████████████████████████████████████▍       | 654/760 [1:47:35<33:48, 19.14s/it]

Processing document: 88-3-2.txt
2042


 86%|██████████████████████████████████████████████▌       | 655/760 [1:47:48<30:18, 17.32s/it]

Processing document: 88-3-3.txt
541


 86%|██████████████████████████████████████████████▌       | 656/760 [1:47:56<25:04, 14.47s/it]

Processing document: 88-4-0.txt
845


 86%|██████████████████████████████████████████████▋       | 657/760 [1:48:06<22:24, 13.05s/it]

Processing document: 89-0-0.txt
470


 87%|██████████████████████████████████████████████▊       | 658/760 [1:48:14<19:39, 11.56s/it]

Processing document: 89-1-0.txt
1876


 87%|██████████████████████████████████████████████▊       | 659/760 [1:48:37<25:19, 15.05s/it]

Processing document: 89-1-1.txt
1956


 87%|██████████████████████████████████████████████▉       | 660/760 [1:49:01<29:14, 17.54s/it]

Processing document: 89-1-2.txt
784


 87%|██████████████████████████████████████████████▉       | 661/760 [1:49:17<28:13, 17.11s/it]

Processing document: 89-2-0.txt
701


 87%|███████████████████████████████████████████████       | 662/760 [1:49:26<24:19, 14.89s/it]

Processing document: 9-0-0.txt
1461


 87%|███████████████████████████████████████████████       | 663/760 [1:49:36<21:14, 13.13s/it]

Processing document: 90-0-0.txt
475


 87%|███████████████████████████████████████████████▏      | 664/760 [1:49:44<18:53, 11.80s/it]

Processing document: 90-1-0.txt
1918


 88%|███████████████████████████████████████████████▎      | 665/760 [1:50:07<23:55, 15.11s/it]

Processing document: 90-1-1.txt
2014


 88%|███████████████████████████████████████████████▎      | 666/760 [1:50:30<27:35, 17.62s/it]

Processing document: 90-1-2.txt
1972


 88%|███████████████████████████████████████████████▍      | 667/760 [1:50:54<29:59, 19.35s/it]

Processing document: 90-1-3.txt
361


 88%|███████████████████████████████████████████████▍      | 668/760 [1:50:59<23:02, 15.03s/it]

Processing document: 90-2-0.txt
843


 88%|███████████████████████████████████████████████▌      | 669/760 [1:51:08<20:03, 13.22s/it]

Processing document: 91-0-0.txt
1111


 88%|███████████████████████████████████████████████▌      | 670/760 [1:51:27<22:31, 15.02s/it]

Processing document: 91-1-0.txt
2043


 88%|███████████████████████████████████████████████▋      | 671/760 [1:51:51<26:03, 17.57s/it]

Processing document: 91-1-1.txt
2032


 88%|███████████████████████████████████████████████▋      | 672/760 [1:52:14<28:22, 19.35s/it]

Processing document: 91-1-2.txt
446


 89%|███████████████████████████████████████████████▊      | 673/760 [1:52:21<22:39, 15.62s/it]

Processing document: 91-2-0.txt
852


 89%|███████████████████████████████████████████████▉      | 674/760 [1:52:31<19:51, 13.86s/it]

Processing document: 92-0-0.txt
473


 89%|███████████████████████████████████████████████▉      | 675/760 [1:52:39<17:05, 12.07s/it]

Processing document: 92-1-0.txt
1961


 89%|████████████████████████████████████████████████      | 676/760 [1:53:02<21:38, 15.46s/it]

Processing document: 92-1-1.txt
849


 89%|████████████████████████████████████████████████      | 677/760 [1:53:12<19:06, 13.82s/it]

Processing document: 92-2-0.txt
1176


 89%|████████████████████████████████████████████████▏     | 678/760 [1:53:30<20:45, 15.19s/it]

Processing document: 93-0-0.txt
473


 89%|████████████████████████████████████████████████▏     | 679/760 [1:53:37<16:50, 12.47s/it]

Processing document: 93-1-0.txt
1932


 89%|████████████████████████████████████████████████▎     | 680/760 [1:53:51<17:34, 13.19s/it]

Processing document: 93-1-1.txt
1420


 90%|████████████████████████████████████████████████▍     | 681/760 [1:54:09<19:18, 14.66s/it]

Processing document: 93-2-0.txt
840


 90%|████████████████████████████████████████████████▍     | 682/760 [1:54:31<21:40, 16.68s/it]

Processing document: 94-0-0.txt
1532


 90%|████████████████████████████████████████████████▌     | 683/760 [1:54:49<22:01, 17.16s/it]

Processing document: 94-1-0.txt
1048


 90%|████████████████████████████████████████████████▌     | 684/760 [1:54:59<18:50, 14.88s/it]

Processing document: 95-0-0.txt
473


 90%|████████████████████████████████████████████████▋     | 685/760 [1:55:07<16:00, 12.80s/it]

Processing document: 95-1-0.txt
1956


 90%|████████████████████████████████████████████████▋     | 686/760 [1:55:30<19:41, 15.97s/it]

Processing document: 95-1-1.txt
436


 90%|████████████████████████████████████████████████▊     | 687/760 [1:55:37<16:05, 13.23s/it]

Processing document: 95-2-0.txt
853


 91%|████████████████████████████████████████████████▉     | 688/760 [1:55:51<16:08, 13.46s/it]

Processing document: 96-0-0.txt
1230


 91%|████████████████████████████████████████████████▉     | 689/760 [1:56:00<14:22, 12.15s/it]

Processing document: 9622_Amusement_Tax_Regulations-0-0.txt
1591


 91%|█████████████████████████████████████████████████     | 690/760 [1:56:09<12:55, 11.08s/it]

Processing document: 9622_Amusement_Tax_Regulations-1-0.txt
1419


 91%|█████████████████████████████████████████████████     | 691/760 [1:56:23<13:45, 11.96s/it]

Processing document: 9622_Amusement_Tax_Regulations-10-0.txt
1328


 91%|█████████████████████████████████████████████████▏    | 692/760 [1:56:31<12:29, 11.02s/it]

Processing document: 9622_Amusement_Tax_Regulations-11-0.txt
1483


 91%|█████████████████████████████████████████████████▏    | 693/760 [1:56:45<13:18, 11.92s/it]

Processing document: 9622_Amusement_Tax_Regulations-12-0.txt
1457


 91%|█████████████████████████████████████████████████▎    | 694/760 [1:56:58<13:15, 12.06s/it]

Processing document: 9622_Amusement_Tax_Regulations-13-0.txt
862


 91%|█████████████████████████████████████████████████▍    | 695/760 [1:57:08<12:22, 11.42s/it]

Processing document: 9622_Amusement_Tax_Regulations-2-0.txt
1400


 92%|█████████████████████████████████████████████████▍    | 696/760 [1:57:20<12:36, 11.81s/it]

Processing document: 9622_Amusement_Tax_Regulations-3-0.txt
1434


 92%|█████████████████████████████████████████████████▌    | 697/760 [1:57:29<11:14, 10.70s/it]

Processing document: 9622_Amusement_Tax_Regulations-4-0.txt
1391


 92%|█████████████████████████████████████████████████▌    | 698/760 [1:57:41<11:41, 11.32s/it]

Processing document: 9622_Amusement_Tax_Regulations-5-0.txt
1460


 92%|█████████████████████████████████████████████████▋    | 699/760 [1:57:50<10:48, 10.63s/it]

Processing document: 9622_Amusement_Tax_Regulations-6-0.txt
1392


 92%|█████████████████████████████████████████████████▋    | 700/760 [1:58:04<11:30, 11.52s/it]

Processing document: 9622_Amusement_Tax_Regulations-7-0.txt
1509


 92%|█████████████████████████████████████████████████▊    | 701/760 [1:58:14<11:01, 11.22s/it]

Processing document: 9622_Amusement_Tax_Regulations-8-0.txt
1475


 92%|█████████████████████████████████████████████████▉    | 702/760 [1:58:24<10:29, 10.86s/it]

Processing document: 9622_Amusement_Tax_Regulations-9-0.txt
1406


 92%|█████████████████████████████████████████████████▉    | 703/760 [1:58:34<09:59, 10.51s/it]

Processing document: 9623_ISP_Tax_Regulations-0-0.txt
1686


 93%|██████████████████████████████████████████████████    | 704/760 [1:58:45<10:00, 10.73s/it]

Processing document: 9623_ISP_Tax_Regulations-1-0.txt
1499


 93%|██████████████████████████████████████████████████    | 705/760 [1:58:57<10:07, 11.04s/it]

Processing document: 9623_ISP_Tax_Regulations-2-0.txt
1524


 93%|██████████████████████████████████████████████████▏   | 706/760 [1:59:09<10:06, 11.24s/it]

Processing document: 9623_ISP_Tax_Regulations-3-0.txt
1508


 93%|██████████████████████████████████████████████████▏   | 707/760 [1:59:20<09:48, 11.10s/it]

Processing document: 9623_ISP_Tax_Regulations-4-0.txt
1458


 93%|██████████████████████████████████████████████████▎   | 708/760 [1:59:28<08:56, 10.32s/it]

Processing document: 9623_ISP_Tax_Regulations-5-0.txt
1380


 93%|██████████████████████████████████████████████████▍   | 709/760 [1:59:44<10:17, 12.10s/it]

Processing document: 9623_ISP_Tax_Regulations-6-0.txt
1414


 93%|██████████████████████████████████████████████████▍   | 710/760 [1:59:53<09:18, 11.17s/it]

Processing document: 9623_ISP_Tax_Regulations-7-0.txt
1437


 94%|██████████████████████████████████████████████████▌   | 711/760 [2:00:07<09:36, 11.76s/it]

Processing document: 9623_ISP_Tax_Regulations-8-0.txt
1143


 94%|██████████████████████████████████████████████████▌   | 712/760 [2:00:21<10:10, 12.72s/it]

Processing document: 9624_Local_Services_Tax_Regulations-0-0.txt
1640


 94%|██████████████████████████████████████████████████▋   | 713/760 [2:00:27<08:17, 10.58s/it]

Processing document: 9624_Local_Services_Tax_Regulations-1-0.txt
1469


 94%|██████████████████████████████████████████████████▋   | 714/760 [2:00:38<08:18, 10.83s/it]

Processing document: 9624_Local_Services_Tax_Regulations-2-0.txt
1511


 94%|██████████████████████████████████████████████████▊   | 715/760 [2:00:51<08:31, 11.36s/it]

Processing document: 9624_Local_Services_Tax_Regulations-3-0.txt
1462


 94%|██████████████████████████████████████████████████▊   | 716/760 [2:01:07<09:19, 12.72s/it]

Processing document: 9624_Local_Services_Tax_Regulations-4-0.txt
1453


 94%|██████████████████████████████████████████████████▉   | 717/760 [2:01:23<09:50, 13.73s/it]

Processing document: 9624_Local_Services_Tax_Regulations-5-0.txt
1447


 94%|███████████████████████████████████████████████████   | 718/760 [2:01:37<09:42, 13.88s/it]

Processing document: 9624_Local_Services_Tax_Regulations-6-0.txt
1456


 95%|███████████████████████████████████████████████████   | 719/760 [2:01:47<08:42, 12.74s/it]

Processing document: 9624_Local_Services_Tax_Regulations-6-1.txt
334


 95%|███████████████████████████████████████████████████▏  | 720/760 [2:01:48<06:09,  9.25s/it]

Processing document: 9624_Local_Services_Tax_Regulations-7-0.txt
1429


 95%|███████████████████████████████████████████████████▏  | 721/760 [2:02:01<06:36, 10.16s/it]

Processing document: 9624_Local_Services_Tax_Regulations-8-0.txt
1454


 95%|███████████████████████████████████████████████████▎  | 722/760 [2:02:13<06:52, 10.85s/it]

Processing document: 9624_Local_Services_Tax_Regulations-9-0.txt
764


 95%|███████████████████████████████████████████████████▎  | 723/760 [2:02:24<06:36, 10.71s/it]

Processing document: 9625_Parking_Tax_Regulations-0-0.txt
1641


 95%|███████████████████████████████████████████████████▍  | 724/760 [2:02:33<06:11, 10.33s/it]

Processing document: 9625_Parking_Tax_Regulations-1-0.txt
1423


 95%|███████████████████████████████████████████████████▌  | 725/760 [2:02:40<05:29,  9.41s/it]

Processing document: 9625_Parking_Tax_Regulations-10-0.txt
1483


 96%|███████████████████████████████████████████████████▌  | 726/760 [2:02:54<06:02, 10.67s/it]

Processing document: 9625_Parking_Tax_Regulations-11-0.txt
1227


 96%|███████████████████████████████████████████████████▋  | 727/760 [2:03:06<06:09, 11.19s/it]

Processing document: 9625_Parking_Tax_Regulations-2-0.txt
1498


 96%|███████████████████████████████████████████████████▋  | 728/760 [2:03:16<05:44, 10.77s/it]

Processing document: 9625_Parking_Tax_Regulations-3-0.txt
1464


 96%|███████████████████████████████████████████████████▊  | 729/760 [2:03:28<05:41, 11.02s/it]

Processing document: 9625_Parking_Tax_Regulations-4-0.txt
1399


 96%|███████████████████████████████████████████████████▊  | 730/760 [2:03:41<05:51, 11.70s/it]

Processing document: 9625_Parking_Tax_Regulations-5-0.txt
1417


 96%|███████████████████████████████████████████████████▉  | 731/760 [2:03:53<05:43, 11.83s/it]

Processing document: 9625_Parking_Tax_Regulations-6-0.txt
1465


 96%|████████████████████████████████████████████████████  | 732/760 [2:04:05<05:28, 11.73s/it]

Processing document: 9625_Parking_Tax_Regulations-7-0.txt
1463


 96%|████████████████████████████████████████████████████  | 733/760 [2:04:12<04:44, 10.53s/it]

Processing document: 9625_Parking_Tax_Regulations-8-0.txt
1434


 97%|████████████████████████████████████████████████████▏ | 734/760 [2:04:19<04:04,  9.42s/it]

Processing document: 9625_Parking_Tax_Regulations-8-1.txt
334


 97%|████████████████████████████████████████████████████▏ | 735/760 [2:04:22<03:05,  7.43s/it]

Processing document: 9625_Parking_Tax_Regulations-9-0.txt
1431


 97%|████████████████████████████████████████████████████▎ | 736/760 [2:04:35<03:39,  9.14s/it]

Processing document: 9626_Payroll_Tax_Regulations-0-0.txt
1583


 97%|████████████████████████████████████████████████████▎ | 737/760 [2:04:46<03:41,  9.64s/it]

Processing document: 9626_Payroll_Tax_Regulations-1-0.txt
1452


 97%|████████████████████████████████████████████████████▍ | 738/760 [2:04:58<03:48, 10.39s/it]

Processing document: 9626_Payroll_Tax_Regulations-10-0.txt
996


 97%|████████████████████████████████████████████████████▌ | 739/760 [2:05:13<04:06, 11.72s/it]

Processing document: 9626_Payroll_Tax_Regulations-2-0.txt
1500


 97%|████████████████████████████████████████████████████▌ | 740/760 [2:05:21<03:35, 10.78s/it]

Processing document: 9626_Payroll_Tax_Regulations-3-0.txt
1448


 98%|████████████████████████████████████████████████████▋ | 741/760 [2:05:39<04:05, 12.94s/it]

Processing document: 9626_Payroll_Tax_Regulations-4-0.txt
1444


 98%|████████████████████████████████████████████████████▋ | 742/760 [2:05:56<04:12, 14.02s/it]

Processing document: 9626_Payroll_Tax_Regulations-5-0.txt
1444


 98%|████████████████████████████████████████████████████▊ | 743/760 [2:06:12<04:09, 14.67s/it]

Processing document: 9626_Payroll_Tax_Regulations-6-0.txt
1488


 98%|████████████████████████████████████████████████████▊ | 744/760 [2:06:23<03:36, 13.51s/it]

Processing document: 9626_Payroll_Tax_Regulations-7-0.txt
1491


 98%|████████████████████████████████████████████████████▉ | 745/760 [2:06:32<03:01, 12.13s/it]

Processing document: 9626_Payroll_Tax_Regulations-8-0.txt
1466


 98%|█████████████████████████████████████████████████████ | 746/760 [2:06:48<03:08, 13.45s/it]

Processing document: 9626_Payroll_Tax_Regulations-9-0.txt
1447


 98%|█████████████████████████████████████████████████████ | 747/760 [2:07:02<02:56, 13.56s/it]

Processing document: 9627_UF_Regulations-0-0.txt
1582


 98%|█████████████████████████████████████████████████████▏| 748/760 [2:07:14<02:35, 12.99s/it]

Processing document: 9627_UF_Regulations-1-0.txt
1454


 99%|█████████████████████████████████████████████████████▏| 749/760 [2:07:23<02:08, 11.68s/it]

Processing document: 9627_UF_Regulations-2-0.txt
1468


 99%|█████████████████████████████████████████████████████▎| 750/760 [2:07:32<01:51, 11.11s/it]

Processing document: 9627_UF_Regulations-2-1.txt
334


 99%|█████████████████████████████████████████████████████▎| 751/760 [2:07:35<01:16,  8.50s/it]

Processing document: 9627_UF_Regulations-3-0.txt
1391


 99%|█████████████████████████████████████████████████████▍| 752/760 [2:07:43<01:07,  8.39s/it]

Processing document: 9627_UF_Regulations-4-0.txt
1402


 99%|█████████████████████████████████████████████████████▌| 753/760 [2:07:57<01:09,  9.99s/it]

Processing document: 9627_UF_Regulations-5-0.txt
872


 99%|█████████████████████████████████████████████████████▌| 754/760 [2:08:03<00:54,  9.06s/it]

Processing document: 97-0-0.txt
1347


 99%|█████████████████████████████████████████████████████▋| 755/760 [2:08:21<00:58, 11.63s/it]

Processing document: 98-0-0.txt
1208


 99%|█████████████████████████████████████████████████████▋| 756/760 [2:08:31<00:44, 11.22s/it]

Processing document: 99-0-0.txt
1509


100%|█████████████████████████████████████████████████████▊| 757/760 [2:08:43<00:34, 11.44s/it]

Processing document: 99-1-0.txt
1404


100%|█████████████████████████████████████████████████████▊| 758/760 [2:09:00<00:25, 12.88s/it]

Processing document: cmu-leadership-org-chart-0-0.txt
660


100%|█████████████████████████████████████████████████████▉| 759/760 [2:09:06<00:10, 10.86s/it]

Processing document: cmu_fact_sheet_02-0-0.txt
1306


100%|██████████████████████████████████████████████████████| 760/760 [2:09:28<00:00, 10.22s/it]


In [18]:
# Preview the first rows of the generated QA DataFrame (Doc_id, Question, Answer columns).
all_qa_df.head()

Unnamed: 0,Doc_id,Question,Answer
0,10-0-0.txt,What is the name of the cul-de-sac located in ...,Roslyn Place
1,10-0-0.txt,What is the name of the street with the larges...,East Carson Street
2,10-0-0.txt,What is the name of the highway connecting Pit...,I-376
3,10-0-0.txt,What is the name of the highway that runs nort...,I-279
4,10-0-0.txt,What is the name of the belt system in Alleghe...,Allegheny County Belt System


In [13]:
# Save the newly generated QA pairs to a CSV file (optional step).
# index=False keeps the DataFrame's row index out of the written file.
# NOTE(review): the "_4" suffix in the filename looks like a manual run counter —
# confirm it before re-running so an earlier export is not overwritten.
all_qa_df.to_csv('../data/annotated/new_generated_qa_pairs_4.csv', index=False)

In [2]:
# Load the annotated QA pairs, clean them, and split them 50/50 into a
# validation file (QA_pairs_1.csv) and a testing file (QA_pairs_2.csv).
import pandas as pd
import numpy as np
import random

# Rows with any missing field and exact duplicate rows are discarded up front.
qa_pairs = (
    pd.read_csv('../data/annotated/QA_pairs.csv')
    .dropna()
    .drop_duplicates()
)

# Validation half: a seeded random sample, so the split is repeatable.
validation_qa_pairs = qa_pairs.sample(frac=0.5, random_state=42)

# Testing half: every row whose index label was not drawn for validation.
testing_qa_pairs = qa_pairs.drop(index=validation_qa_pairs.index)

# Write both halves out without the DataFrame index column.
validation_qa_pairs.to_csv('../data/annotated/QA_pairs_1.csv', index=False)
testing_qa_pairs.to_csv('../data/annotated/QA_pairs_2.csv', index=False)