# Peparing triplet data for ColBERT/Ragatouille fine-tuning

In [1]:
from pathlib import Path
from ragatouille import RAGTrainer
from ragatouille import RAGPretrainedModel

  from .autonotebook import tqdm as notebook_tqdm


In [65]:
# Initialize the RAGTrainer
# RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
trainer = RAGTrainer(model_name="colbertv2.0_cmu_lti_finetunev1.0", pretrained_model_name="colbert-ir/colbertv2.0", n_usable_gpus=4)

In [66]:
# List to store all documents
all_documents = []
# List to store raw data in the required format
raw_data = []

In [67]:
# Iterate through each folder and read documents and annotations
for folder in ["About Scottie", "Buggy News", "history_of_cmu", "history_of_scs", "Kiltie Band", "lti_faculty", "lti_programs", "Tartan Facts"]:
    folder_path = Path(folder)
    # Read and store documents
    for doc_file in folder_path.glob("*.txt"):
        if doc_file.name != "annotation.txt":
            with open(doc_file, 'r', encoding='utf-8') as file:
                all_documents.append(file.read())

    # Read annotations and prepare raw data
    annotation_path = folder_path / "annotation.txt"
    with open(annotation_path, 'r', encoding='utf-8') as file:
        annotations = file.read().split("\n\n")  # Assuming each Q/A/D/T block is separated by two newlines
#         raw_data = [(lines[0].replace("Q: ", ""), lines[1].replace("A: ", "")) for block in annotations for lines in [block.strip().split("\n")] if len(lines) == 4]
        for block in annotations:
            lines = block.split("\n")
#             print(len(lines))
            if len(lines) == 4:  # Ensure it's a full block
                question = lines[0].replace("Q: ", "")
                answer = lines[1].replace("A: ", "")
                raw_data.append((question, answer))
# #                 question = lines[0][3:]  # Remove "Q: "
# #                 print(question)
# #                 answer = lines[1][3:]  # Remove "A: "
# #                 print(answer)
# #                 doc_ref = lines[2][3:]  # Remove "D: "

#                 question = lines[0].replace("Q: ", "")
#                 answer = lines[1].replace("A: ", "")
#                 print(f"Appending: {question} | {answer}")
#                 # Here you could use the doc_ref to link the question-answer pair with the specific document content if needed
#                 # For simplicity, we'll just use the question and answer for now
#                 raw_data.append((question, answer))

In [68]:
# print(annotation_path)
print(raw_data)
# print(all_documents)

[('When was The Kiltie Band founded?', 'The Kiltie Band was founded in 1908.'), ('Who founded Carnegie Mellon University and what type of pet did he keep?', 'Andrew Carnegie founded Carnegie Mellon University and had a Scottish terrier.'), ('When did Carnegie Mellon officially start the process to select a mascot?', 'The mascot selection process at Carnegie Mellon began in November 2006.'), ('Who co-chaired the Mascot Identity Task Force at Carnegie Mellon?', 'Susan Bassett and Jennifer Church co-chaired the Mascot Identity Task Force.'), ('How did the Carnegie Mellon community participate in the mascot selection process?', 'The community participated in the mascot choice through surveys and a Town Hall.'), ('What percentage of students voted for the Scottish terrier as the mascot in the 2007 survey?', '78% of 2,370 students voted for the Scottish terrier as the mascot in 2007.'), ('What misconception did approximately 25 percent of surveyed alumni have about the Scottish terrier?', '2

In [69]:
# Prepare training data
data_out_path = trainer.prepare_training_data(raw_data=raw_data, all_documents=all_documents)
print(f"Training data prepared and stored at: {data_out_path}")

Loading Hard Negative SimpleMiner dense embedding model BAAI/bge-small-en-v1.5...
Building hard negative index for 357 documents...
All documents embedded, now adding to index...
save_index set to False, skipping saving hard negative index
Hard negative index generated
Training data prepared and stored at: ./data/


In [70]:
# Path to the directory where the processed training data has been saved
# data_dir = './triplet_data/'

# Parameters for fine-tuning used the default ones but added for future configuration
batch_size = 32
nbits = 2
maxsteps = 500000
use_ib_negatives = True
learning_rate = 5e-6  # Adjust based on your needs and observations
dim = 128
doc_maxlen = 256
use_relu = False
warmup_steps = 'auto'  # Auto will default to 10% of total steps
accumsteps = 1

# Call the train method to start fine-tuning
model_path = trainer.train(
    batch_size=batch_size,
    nbits=nbits,
    maxsteps=maxsteps,
    use_ib_negatives=use_ib_negatives,
    learning_rate=learning_rate,
    dim=dim,
    doc_maxlen=doc_maxlen,
    use_relu=use_relu,
    warmup_steps=warmup_steps,
    accumsteps=accumsteps
)
    

print(f"Model fine-tuned and saved at: {model_path}")

#> Starting...
#> Starting...
#> Starting...
#> Starting...
nranks = 4 	 num_gpus = 4 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "index_bsize": 64,
    "nbits": 2,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 32,
    "accumsteps": 1,
    "lr": 5e-6,
    "maxsteps": 500000,
    "save_every": 10,
    "warmup": 10,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": "colbertv2.0_cmu_lti_finetunev1.0",
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 256,
    "mask_punctuation": true,
    "checkpoint": "colbert-ir\/c

config.json: 100%|██████████| 570/570 [00:00<00:00, 5.48MB/s]


#> LR will use 10 warmup steps and linear decay over 500000 steps.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What is the mascot for Carnegie Mellon University (CMU)?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2003,  1996, 13314,  2005, 11298, 22181,  2118,
         1006,  4642,  2226,  1007,  1029,   102,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

				 0.5804923176765442 2.2281312942504883
#>>>    16.63 11.21 		|		 5.419999999999998
[Mar 06, 14:31:22] 0 2.8086235523223877
				 2.778505802154541 3.4276678562164307
#>>>    17.9 16.81 		|		 1.0899999999999999
[Mar 06, 14:31:23] 1 2.8120211026668547
				 1.7655978202819824 3.7759363651275635
#>>

In [71]:
RAG = RAGPretrainedModel.from_pretrained(".ragatouille/colbert/none/2024-03/06/12.27.36/checkpoints/colbert")

In [73]:
import os

# Base directory containing your folders
base_dir = "."

# Folders containing your data
folders = ["About Scottie", "Buggy News", "history_of_cmu", "history_of_scs", "Kiltie Band", "lti_faculty", "lti_programs", "Tartan Facts"]

# Initialize empty list to hold all document texts
collection = []
# Optionally, prepare a list for document IDs if you wish to use custom IDs
# document_ids = []

for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    
    # Iterate over each text file in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt") and filename != "annotation.txt":
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                # Read the document content and add it to the collection
                document_text = file.read()
                collection.append(document_text)
                
                # Optionally, add a custom document ID, e.g., foldername-filename
#                 document_id = f"{folder}-{filename}"
#                 document_ids.append(document_id)

# At this point, `collection` contains all your documents, and `document_ids` contains their IDs


In [76]:
print(len(collection))

34


In [80]:
index_name = "colbertv2.0_cmu_lti_finetunev1.0"  # Choose an appropriate name for your index

# Assuming RAG is your initialized RAGPretrainedModel with the loaded checkpoint
index_path = RAG.index(
    collection=collection,
#     document_ids=document_ids,  # Include this only if you prepared document IDs
    index_name=index_name,
    overwrite_index=True,  # Set to True to overwrite existing index of the same name
    max_document_length=256,  # Adjust based on your document length distribution
    split_documents=True,  # Set to True if documents should be split into shorter segments
)

print(f"Index created at: {index_path}")

________________________________________________________________________________
 This means that indexing will be slow. To make use of your GPU.
Please install `faiss-gpu` by running:
pip uninstall --y faiss-cpu & pip install faiss-gpu
 ________________________________________________________________________________
Will continue with CPU indexing in 5 seconds...
New index_name received! Updating current index_name (colbertv2.0_cmu_lti_finetunev1.0) to colbertv2.0_cmu_lti_finetunev1.0


[Mar 06, 16:49:46] #> Note: Output directory .ragatouille/colbert/indexes/colbertv2.0_cmu_lti_finetunev1.0 already exists


[Mar 06, 16:49:46] #> Will delete 1 files already at .ragatouille/colbert/indexes/colbertv2.0_cmu_lti_finetunev1.0 in 20 seconds...
#> Starting...
#> Starting...
#> Starting...
#> Starting...
nranks = 4 	 num_gpus = 4 	 device=1
[Mar 06, 16:50:13] [1] 		 #> Encoding 31 passages..
nranks = 4 	 num_gpus = 4 	 device=3
[Mar 06, 16:50:13] [3] 		 #> Encoding 28 passages..
nranks = 4 	 



Clustering 19014 points in 128D to 2048 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.00 s
  Iteration 19 (0.17 s, search 0.15 s): objective=3017.31 imbalance=1.549 nsplit=0       
[Mar 06, 16:50:18] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


Process Process-19:
Traceback (most recent call last):
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/site-packages/colbert/infra/launcher.py", line 134, in setup_new_process
    return_val = callee(config, *args)
                 ^^^^^^^^^^^^^^^^^^^^^
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/site-packages/colbert/indexing/collection_indexer.py", line 33, in encode
    encoder.run(shared_lists)
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/site-packages/colbert/indexing/collection_indexer.py", line 68, in run
    self.train(shared_lists) # Trains centroids from selected passages
    ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12

KeyboardInterrupt: 