# Preparing triplet data for ColBERT/RAGatouille fine-tuning

In [17]:
from pathlib import Path
from ragatouille import RAGTrainer
from ragatouille import RAGPretrainedModel

In [3]:
# Build the RAGTrainer that fine-tunes the pretrained ColBERTv2 checkpoint.
# (Loading that checkpoint directly, without training, would instead be:
#  RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0"))
trainer = RAGTrainer(
    model_name="colbertv2.0_cmu_lti_finetunev2.0",
    pretrained_model_name="colbert-ir/colbertv2.0",
    n_usable_gpus=4,
)

In [4]:
# Accumulators filled by the loading cells that follow:
#   all_documents - text of every document file that is read
#   raw_data      - (question, answer) tuples in the format the trainer expects
all_documents, raw_data = [], []

In [5]:
# Load every .txt file (except annotation.txt) from folders whose text is only
# appended to `all_documents`; no question/answer pairs are read from them here.
for folder in ["courses", "../academic_calendars", "../program_handbooks", "../Web Scholar PDFs", "../fall23"]:
    folder_path = Path(folder)
    if not folder_path.is_dir():
        # glob() on a missing directory yields nothing, which would silently
        # shrink the corpus; make the omission visible instead.
        print(f"Warning: document folder not found, skipping: {folder_path}")
        continue
    # sorted(): glob order depends on the filesystem; a fixed order keeps the
    # document list reproducible between runs and machines.
    for doc_file in sorted(folder_path.glob("*.txt")):
        if doc_file.name != "annotation.txt":
            all_documents.append(doc_file.read_text(encoding="utf-8"))

In [6]:
# Peek at the first loaded document to confirm the files were read.
print(all_documents[:1])

["The course 10301, 'Introduction to Machine Learning,' offers 12 units in section A, meeting on Mondays, Wednesdays, and Fridays from 11:00 AM to 12:20 PM in Pittsburgh, Pennsylvania, at TEP 1403, with an in-person expectation, taught by Matthew Gormley, Henry Chai, and Hoda Heidari.\nThe course 10315, 'Introduction to Machine Learning (SCS Majors),' provides 12 units in section Lec 1, convening on Mondays and Wednesdays from 12:30 PM to 01:50 PM in Pittsburgh, Pennsylvania, at SH 105, under an in-person expectation, instructed by Patrick Virtue.\nThe course 10335, 'Art and Machine Learning,' awards 12 units in section A, scheduled for Mondays and Wednesdays from 7:00 PM to 8:20 PM in Pittsburgh, Pennsylvania, at TEP 1403, featuring an in-person expectation, taught by Eunsu Kang.\nThe course 10403, 'Deep Reinforcement Learning & Control,' offers 12 units in section A, gathering on Mondays, Wednesdays, and Fridays from 12:30 PM to 01:50 PM in Pittsburgh, Pennsylvania, at GHC 4215, with

In [7]:
# Walk the annotated folders: collect every document's text into `all_documents`
# and turn each folder's annotation.txt into (question, answer) pairs in `raw_data`.
for folder in ["About Scottie", "Buggy News", "history_of_cmu", "history_of_scs", "Kiltie Band", "lti_faculty", "lti_programs", "Tartan Facts"]:
    folder_path = Path(folder)
    # Read and store documents (sorted so the order is reproducible across runs).
    for doc_file in sorted(folder_path.glob("*.txt")):
        if doc_file.name != "annotation.txt":
            all_documents.append(doc_file.read_text(encoding="utf-8"))

    # Read annotations and prepare raw data.
    annotation_path = folder_path / "annotation.txt"
    # Assuming each Q/A/D/T block is separated by two newlines.
    annotations = annotation_path.read_text(encoding="utf-8").split("\n\n")
    for block in annotations:
        # strip() first: a trailing newline at end of file or an extra blank line
        # between blocks would otherwise add an empty line to the block and make
        # a valid 4-line entry fail the length check below.
        lines = block.strip().split("\n")
        if len(lines) == 4:  # Ensure it's a full block
            question, answer = lines[0], lines[1]
            # Remove only the leading marker; a blanket replace() would also
            # delete "Q: " / "A: " occurring inside the text itself.
            if question.startswith("Q: "):
                question = question[len("Q: "):]
            if answer.startswith("A: "):
                answer = answer[len("A: "):]
            raw_data.append((question, answer))

In [8]:
# Sanity check: show the first 15 parsed (question, answer) pairs.
print(raw_data[:15])

[('When was The Kiltie Band founded?', 'The Kiltie Band was founded in 1908.'), ('Who founded Carnegie Mellon University and what type of pet did he keep?', 'Andrew Carnegie founded Carnegie Mellon University and had a Scottish terrier.'), ('When did Carnegie Mellon officially start the process to select a mascot?', 'The mascot selection process at Carnegie Mellon began in November 2006.'), ('Who co-chaired the Mascot Identity Task Force at Carnegie Mellon?', 'Susan Bassett and Jennifer Church co-chaired the Mascot Identity Task Force.'), ('How did the Carnegie Mellon community participate in the mascot selection process?', 'The community participated in the mascot choice through surveys and a Town Hall.'), ('What percentage of students voted for the Scottish terrier as the mascot in the 2007 survey?', '78% of 2,370 students voted for the Scottish terrier as the mascot in 2007.'), ('What misconception did approximately 25 percent of surveyed alumni have about the Scottish terrier?', '2

In [9]:
# Generate the training files from the Q/A pairs plus the full document list.
# The call's return value is kept so the output location can be reported.
data_out_path = trainer.prepare_training_data(
    raw_data=raw_data,
    all_documents=all_documents,
    data_out_path="./colbertv2data/",
)
print(f"Training data prepared and stored at: {data_out_path}")

Loading Hard Negative SimpleMiner dense embedding model BAAI/bge-small-en-v1.5...
Building hard negative index for 380 documents...
All documents embedded, now adding to index...
save_index set to False, skipping saving hard negative index
Hard negative index generated
Training data prepared and stored at: ./colbertv2data/


In [10]:
import os

# Set MKL_SERVICE_FORCE_INTEL before NumPy is imported.
# The key must be spelled in full: the truncated 'MKL_SERVICE_FORCE_INTE'
# is a different variable name, so setting it had no effect on MKL.
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'

# Now you can import numpy and other libraries
import numpy as np


In [11]:
!pip install -U numpy

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: numpy in /home/trevea/.local/lib/python3.7/site-packages (1.21.6)


In [10]:
import os
import numpy as np

In [12]:
# Set the MKL_THREADING_LAYER environment variable to 'GNU' for this process
# (and any child processes it starts afterwards).
# NOTE(review): presumably a workaround for an MKL/libgomp threading conflict
# during multi-GPU training — confirm it is still required.
os.environ['MKL_THREADING_LAYER'] = 'GNU'

In [13]:
import numpy as np

# Fine-tuning hyperparameters. The values are the ones the author describes as
# the defaults; they are spelled out so they can be tuned later.
batch_size = 32
accumsteps = 1
maxsteps = 500000
learning_rate = 5e-6  # Adjust based on your needs and observations
warmup_steps = 'auto'  # In the recorded run this resolved to 10 warmup steps
use_ib_negatives = True
nbits = 2
dim = 128
doc_maxlen = 256
use_relu = False

# Gather the settings in one mapping and hand them to the trainer as keywords.
train_options = {
    "batch_size": batch_size,
    "nbits": nbits,
    "maxsteps": maxsteps,
    "use_ib_negatives": use_ib_negatives,
    "learning_rate": learning_rate,
    "dim": dim,
    "doc_maxlen": doc_maxlen,
    "use_relu": use_relu,
    "warmup_steps": warmup_steps,
    "accumsteps": accumsteps,
}

# Start fine-tuning; the returned value is reported as the saved model location.
model_path = trainer.train(**train_options)

print(f"Model fine-tuned and saved at: {model_path}")

#> Starting...
#> Starting...
#> Starting...
#> Starting...
nranks = 4 	 num_gpus = 4 	 device=2
Using config.bsize = 8 (per process) and config.accumsteps = 1
[Mar 16, 21:15:48] #> Loading the queries from colbertv2data/queries.train.colbert.tsv ...
[Mar 16, 21:15:48] #> Got 328 queries. All QIDs are unique.

[Mar 16, 21:15:48] #> Loading collection...
0M nranks = 4 	 num_gpus = 4 	 device=1
Using config.bsize = 8 (per process) and config.accumsteps = 1
[Mar 16, 21:15:48] #> Loading the queries from colbertv2data/queries.train.colbert.tsv ...
[Mar 16, 21:15:48] #> Got 328 queries. All QIDs are unique.

[Mar 16, 21:15:48] #> Loading collection...
0M nranks = 4 	 num_gpus = 4 	 device=3
Using config.bsize = 8 (per process) and config.accumsteps = 1
[Mar 16, 21:15:48] #> Loading the queries from colbertv2data/queries.train.colbert.tsv ...
[Mar 16, 21:15:48] #> Got 328 queries. All QIDs are unique.

[Mar 16, 21:15:48] #> Loading collection...
0M nranks = 4 	 num_gpus = 4 	 device=0
{
    




#> LR will use 10 warmup steps and linear decay over 500000 steps.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Who co-chaired the Mascot Identity Task Force at Carnegie Mellon?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2040,  2522,  1011, 12282,  1996, 13314,  4767,  4708,
         2486,  2012, 11298, 22181,  1029,   102,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

				 0.7071167230606079 1.472010850906372
#>>>    17.93 13.41 		|		 4.52
[Mar 16, 21:15:56] 0 2.1791276931762695
				 0.015664443373680115 0.39106568694114685
#>>>    19.01 10.37 		|		 8.640000000000002
[Mar 16, 21:15:57] 1 2.177355295598507
				 0.6828833222389221 1.952773928642273
#>>> 

In [16]:
# Log this notebook session in to the Hugging Face Hub (renders a login widget).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

# Local checkpoint directory written by the training run, and the target Hub repo.
checkpoint_dir = ".ragatouille/colbert/none/2024-03/16/21.14.40/checkpoints/colbert"
hub_repo_id = "your-hf-username/your-model-name"

# Upload over HTTP instead of the git-based `Repository` helper: `Repository`
# is deprecated, and `clone_from` expects to clone into the local directory,
# which here already holds the (non-git) checkpoint files.
# Authentication uses the token stored by the earlier notebook_login() call.
api = HfApi()
api.create_repo(repo_id=hub_repo_id, repo_type="model", exist_ok=True)
api.upload_folder(folder_path=checkpoint_dir, repo_id=hub_repo_id, repo_type="model")

In [19]:
import os

# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel or the processes it launches. Set it on the kernel process instead.
_conda_prefix = os.environ.get("CONDA_PREFIX")
if _conda_prefix:
    # Only set CUDA_HOME when a conda environment is actually active; an empty
    # CUDA_HOME would be worse than leaving it unset.
    os.environ["CUDA_HOME"] = _conda_prefix

In [20]:
# Install a prebuilt faiss-gpu 1.7.3 wheel directly from its release URL.
# NOTE(review): the wheel is tagged cp310 and the recorded run rejected it as
# "not a supported wheel on this platform" — choose a wheel whose tag matches
# the kernel's Python version.
pip install https://github.com/kyamagu/faiss-wheels/releases/download/v1.7.3/faiss_gpu-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

[31mERROR: faiss_gpu-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl is not a supported wheel on this platform.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [18]:
# Load the fine-tuned model by its identifier (presumably a Hugging Face Hub
# repo id — confirm); `RAG` is the model used to build the index below.
RAG = RAGPretrainedModel.from_pretrained("EddieT/colbert_cmu_lti_finetunev2.0")

In [19]:
import os

# Base directory containing your folders
base_dir = "."

# Folders containing your data
# NOTE(review): the training-corpus cell earlier also reads "../fall23", which is
# not listed here — confirm the omission is intentional.
folders = ["About Scottie", "Buggy News", "history_of_cmu", "history_of_scs", "Kiltie Band", "lti_faculty", "lti_programs", "Tartan Facts", "courses", "../academic_calendars", "../program_handbooks", "../Web Scholar PDFs"]

# Text of every document to be indexed, one entry per file.
collection = []

for folder in folders:
    folder_path = os.path.join(base_dir, folder)

    # sorted(): os.listdir() returns names in arbitrary order; a fixed order keeps
    # the position of each document in `collection` reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        # Index every text file except the Q/A annotation file.
        if filename.endswith(".txt") and filename != "annotation.txt":
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                collection.append(file.read())

# At this point, `collection` contains all your documents. No custom document
# IDs are built; to add them, collect f"{folder}-{filename}" in a parallel list
# inside the loop above.


In [20]:
# Report how many documents were collected for indexing.
print(len(collection))

52


In [11]:
!pip install -U ninja

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: ninja in /home/trevea/.local/lib/python3.7/site-packages (1.11.1.1)


In [21]:
# Name under which the index is stored; pick whatever suits your project.
index_name = "colbertv2.0_cmu_lti_finetunev2.0"

# Index `collection` with the loaded RAG model. Settings used:
#   overwrite_index=True     - replace an existing index of the same name
#   max_document_length=256  - tune to your document length distribution
#   split_documents=True     - split documents into shorter segments
# Also pass document_ids=... only if custom document IDs were prepared.
index_path = RAG.index(
    collection=collection, index_name=index_name, overwrite_index=True,
    max_document_length=256, split_documents=True,
)

print(f"Index created at: {index_path}")



[Mar 16, 22:42:52] #> Note: Output directory .ragatouille/colbert/indexes/colbertv2.0_cmu_lti_finetunev2.0 already exists


[Mar 16, 22:42:52] #> Will delete 1 files already at .ragatouille/colbert/indexes/colbertv2.0_cmu_lti_finetunev2.0 in 20 seconds...
#> Starting...
#> Starting...
#> Starting...
nranks = 4 	 num_gpus = 4 	 device=1
[Mar 16, 22:43:20] [1] 		 #> Encoding 282 passages..
#> Starting...
nranks = 4 	 num_gpus = 4 	 device=2
[Mar 16, 22:43:23] [2] 		 #> Encoding 282 passages..
nranks = 4 	 num_gpus = 4 	 device=3
[Mar 16, 22:43:26] [3] 		 #> Encoding 281 passages..
nranks = 4 	 num_gpus = 4 	 device=0
[Mar 16, 22:43:26] [0] 		 #> Encoding 282 passages..
[Mar 16, 22:43:28] [0] 		 avg_doclen_est = 172.61654663085938 	 len(local_sample) = 282
[Mar 16, 22:43:28] [3] 		 avg_doclen_est = 172.61654663085938 	 len(local_sample) = 281
[Mar 16, 22:43:28] [2] 		 avg_doclen_est = 172.61654663085938 	 len(local_sample) = 282
[Mar 16, 22:43:28] [1] 		 avg_doclen_est = 172.61654663085

Process Process-6:
Traceback (most recent call last):
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/site-packages/colbert/infra/launcher.py", line 134, in setup_new_process
    return_val = callee(config, *args)
                 ^^^^^^^^^^^^^^^^^^^^^
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/site-packages/colbert/indexing/collection_indexer.py", line 33, in encode
    encoder.run(shared_lists)
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/site-packages/colbert/indexing/collection_indexer.py", line 68, in run
    self.train(shared_lists) # Trains centroids from selected passages
    ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/trevea/miniconda3/envs/nlp-rag/lib/python3.12/

KeyboardInterrupt: 