# Load dependecies and the setup environment

In [1]:
%load_ext autoreload
%autoreload 2

import csv
import itertools
import multiprocessing
import os
import resource
import sys
import time
from concurrent.futures import ProcessPoolExecutor

import chromadb
import pandas as pd
from langchain.docstore.document import Document
from langchain.document_loaders import CSVLoader, DirectoryLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from tqdm.notebook import tqdm

# Project specific modules
from scripts import split_documents

csv.field_size_limit(sys.maxsize) # Prevents the "Error: field larger than field limit" error in CSVLoader
os.environ['TOKENIZERS_PARALLELISM'] = 'True' # Prevent huggingface tokenizers from disabling parallelism

# # Limit max RAM usage to 64 GB (may cause out of memory errors)
# soft, hard = resource.getrlimit(resource.RLIMIT_AS)
# resource.setrlimit(resource.RLIMIT_AS, (64*1024*1024*1024, hard))

# Define helper functions

In [2]:
def convert_parquet_file_to_csv(parquet_input_file, csv_output_file, columns=None):
    """ Convert the input parquet file into a csv file. If columns is not None, only
    specified columns will be copied to the output csv file.

    Parameters:
    - parquet_input_file (str): input parguet file
    - csv_output_file (str): csv output file where the converted data will be saved
    - columns (list[str], optional): list of column names to be included in the output csv file

    Returns:
        None
    """
    if columns == None:
        df = pd.read_parquet(parquet_file_path)
    else:
        df = pd.read_parquet(parquet_file_path, columns=columns)
    df.to_csv(csv_output_file, index=False)
       

def split_list(input_list, split_count):
    """
    Split the input_list into split_count sublists

    Parameters:
    - input_list (list): List to be split
    - split_count (int): Number of splits

    Yields:
    (list): A sublit of elements from input_list
    """
    if split_count > len(input_list):
        raise Exception('split_count must be less or equal to input_list length')
    sub_list_element_count = len(input_list) // split_count
    remaining_elements = len(input_list) % split_count
    start = 0
    for i in range(split_count):
        end = start + sub_list_element_count + (1 if i < remaining_elements else 0)
        yield input_list[start:end]
        start = end


def load_split_docs(csv_input_file, thread_count=8):
    """ 
    Load the specified csv file, split it into LangChain Documents no longer than
    1536 tokens and process them, so they can be used effectively with nomic-embed

    Parameters:
    - csv_input_file (str): csv input file that will be loaded using the CSVLoader from LangChain
    - thread_count (int, optional): number of threads to use during text splitting

    Returns:
    (list[str]) - list of LangChain Documents created from the input csv file
    """
    #Prepare the csv loader
    langchain_loader = CSVLoader(
        file_path=csv_input_file,
        metadata_columns=['title', 'categories']
    )
    
    # Split the documents using multiple threads and an external splitting script
    documents_raw = langchain_loader.load()
    documents_raw_sublists = split_list(documents_raw, thread_count)

    documents_split_sublists = split_documents.main(documents_raw_sublists, thread_count=thread_count)
    documents_split = list(itertools.chain(*documents_split_sublists))

    # Prepend 'search_document: ' to each document (required by nomic-embed)
    documents_final = []
    for doc in documents_split:
        documents_final.append(Document(page_content='search_document: ' + doc.page_content, metadata=doc.metadata))
    return documents_final

# Convert parquet files to CSVs

In [3]:
#import psutil
parquet_files_dir = os.path.join('data', 'parquet')
parquet_file_names = os.listdir(parquet_files_dir)
parquet_file_paths = [os.path.join(parquet_files_dir, file_name) for file_name in parquet_file_names] 
csv_files_dir = os.path.join('data', 'csv')
for parquet_file_path in parquet_file_paths:
    csv_output_filename = os.path.splitext(os.path.basename(parquet_file_path))[0] + '.csv'
    csv_output_file_path = os.path.join(csv_files_dir, csv_output_filename)
    # Process each file in a new thread to minimize memory usage
    with multiprocessing.Pool(1) as pool:
        pool.starmap(convert_parquet_file_to_csv, [(parquet_file_path, csv_output_file_path, ['title', 'text', 'categories'])])
    #print(psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024))

# Initialize objects for data processing, embedding and storage

In [3]:
# langchain_loader = DirectoryLoader(
#     os.path.join('data', 'csv'),
#     glob='**/*.csv', loader_cls=CSVLoader,
#     loader_kwargs={
#         'metadata_columns':['title', 'categories']
#     },
#     use_multithreading=True
# )

embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={
        'device': 'cuda',
        'trust_remote_code': True
    }
)

chroma_client = chromadb.PersistentClient(path='chroma_data')
try:
    chroma_collection = chroma_client.create_collection(name='english_wikipedia')
except chromadb.db.base.UniqueConstraintError as e:
    print('Collection already exists: skipping')
langchain_vector_db = Chroma(client=chroma_client, embedding_function=embedding_model)

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-embed-text-v1:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/52.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-embed-text-v1:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
<All keys matched successfully>


# Load and split the documents, generate embeddings

In [4]:
%%time

csv_files_dir = os.path.join('data', 'csv')
csv_file_names = os.listdir(csv_files_dir)
csv_file_paths = [os.path.join(csv_files_dir, file_name) for file_name in csv_file_names]
# Process each file one-by-one in a seperate process to minimize RAM consumption
for csv_file in csv_file_paths:
    with ProcessPoolExecutor(1) as executor:
        future = executor.submit(load_split_docs, csv_file, 48)
        documents = future.result()
        # Split the documents_final list into multiple sublists to reduce GPU memory consumption
        split_count=len(documents) // 20
        for docs in tqdm(split_list(documents, split_count), total=split_count):
            langchain_vector_db.add_documents(docs)

Token indices sequence length is longer than the specified maximum sequence length for this model (1030 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/34555 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/32794 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (792 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/22476 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (615 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (641 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1156 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/41986 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1666 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/34033 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1424 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (694 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (726 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (601 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

  0%|          | 0/50099 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2223 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2477 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/24563 [00:00<?, ?it/s]

  0%|          | 0/60841 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (721 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (733 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (770 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (615 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

  0%|          | 0/68288 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/18116 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/54645 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2333 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/32769 [00:00<?, ?it/s]

  0%|          | 0/1364 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/8885 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1133 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/29371 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (717 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (925 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/22846 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1736 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/39730 [00:00<?, ?it/s]

  0%|          | 0/3712 [00:00<?, ?it/s]

  0%|          | 0/12091 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/54713 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1638 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/2282 [00:00<?, ?it/s]

  0%|          | 0/2652 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2658 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/14139 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (953 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/5431 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (843 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/26351 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (799 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (6651 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4896 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/63303 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (93379 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (754 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/54373 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/24066 [00:00<?, ?it/s]

CPU times: user 5d 4h 6min 52s, sys: 18h 7min 9s, total: 5d 22h 14min 2s
Wall time: 6d 1h 37min 38s


In [None]:
# %%time
# thread_count = 32
# documents_raw = langchain_loader.load()
# documents_raw_sublists = split_list(documents_raw, thread_count)

# from scripts import split_documents
# documents_split_sublists = split_documents.main(documents_raw_sublists, thread_count=thread_count)
# documents_split = list(itertools.chain(*documents_split_sublists))

# # Delete unnecessary variables to free up memory
# del documents_raw
# del documents_split_sublists
# del documents_raw_sublists

In [6]:
# %%time
# documents_final = []
# # Prepend 'search_document: ' to each document (required by nomic-embed)
# for doc in documents_split:
#     documents_final.append(Document(page_content='search_document: ' + doc.page_content, metadata=doc.metadata))

CPU times: user 2.81 s, sys: 447 ms, total: 3.26 s
Wall time: 3.4 s


# Generate document embeddings

In [6]:
# %%time
# # Split the documents_final list into multiple sublists to reduce GPU memory consumption
# split_count = 1_400_000
# for docs in tqdm(split_list(documents_final, split_count), total=split_count):
#     langchain_vector_db.add_documents(docs)

  0%|          | 0/10000 [00:00<?, ?it/s]

CPU times: user 18min 54s, sys: 1min 15s, total: 20min 9s
Wall time: 18min


In [6]:
def search_vector_db(query, vector_db, k=10):
    most_similar_docs = vector_db.similarity_search_with_score(query, k=k)
    print(most_similar_docs[3])

search_vector_db("search_query: the letter X?", langchain_vector_db)

(Document(page_content='search_document: text: Xi is the fourteenth letter of the Greek alphabet (uppercase Ξ, lowercase ξ; ), representing the voiceless consonant cluster . Its name is pronounced in Modern Greek, and generally or in English."xi". New Oxford American Dictionary, 2nd Edition. In the system of Greek numerals, it has a value of 60. Xi was derived from the Phoenician letter samekh 20px. Xi is distinct from the letter chi, which gave its form to the Latin letter X. ==Greek == thumb |upright 1.5|left|A joined variant of Ξ (New Athena Unicode font) Both in classical Ancient Greek and in Modern Greek, the letter Ξ represents the consonant cluster /ks/. In some archaic local variants of the Greek alphabet, this letter was missing. Instead, especially in the dialects of most of the Greek mainland and Euboea, the cluster /ks/ was represented by Χ (which in classical Greek is chi, used for ). Because this variant of the Greek alphabet was used in Magna Graecia (the Greek colonies 