In [1]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

## Dataset 1: Python Code

In [2]:
# Hugging Face dataset of Python snippets, and the column used as each document's page content
ds1 = "flytech/python-codes-25k"
page_content_column = "text"

# Build the loader, then pull the dataset into LangChain documents
loader1 = HuggingFaceDatasetLoader(path=ds1, page_content_column=page_content_column)
data1 = loader1.load()

# Preview the first two documents
data1[:2]

  from .autonotebook import tqdm as notebook_tqdm


[Document(metadata={'input': 'Setting up your daily to-do list...', 'instruction': 'Help me set up my daily to-do list!', 'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}, page_content='"Help me set up my daily to-do list! Setting up your daily to-do list... ```python\\ntasks = []\\nwhile True:\\n    task = input(\'Enter a task or type \'done\' to finish: \')\\n    if task == \'done\': break\\n    tasks.append(task)\\nprint(f\'Your to-do list for today: {tasks}\')\\n```"'),
 Document(metadata={'input': 'Creating a shopping list...', 'instruction': 'Create a shopping list based on my inputs!', 'output': "```python\nshopping_list = {}\nwhile True:\n    item = input('Enter an item or type 'done' to finish: ')\n    if item == 'done': break\n    quantity = input(f'Enter the quantity for {item}: ')\n    shopping_list[item] = quan

In [3]:
# Splitter configuration: chunks of at most 1000 characters, with 150 characters
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)

# Break the loaded Python-code documents (data1) into chunks
docs1 = text_splitter.split_documents(data1)

# Preview the first two chunks
docs1[:2]

[Document(metadata={'input': 'Setting up your daily to-do list...', 'instruction': 'Help me set up my daily to-do list!', 'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}, page_content='"Help me set up my daily to-do list! Setting up your daily to-do list... ```python\\ntasks = []\\nwhile True:\\n    task = input(\'Enter a task or type \'done\' to finish: \')\\n    if task == \'done\': break\\n    tasks.append(task)\\nprint(f\'Your to-do list for today: {tasks}\')\\n```"'),
 Document(metadata={'input': 'Creating a shopping list...', 'instruction': 'Create a shopping list based on my inputs!', 'output': "```python\nshopping_list = {}\nwhile True:\n    item = input('Enter an item or type 'done' to finish: ')\n    if item == 'done': break\n    quantity = input(f'Enter the quantity for {item}: ')\n    shopping_list[item] = quan

In [4]:
# Sentence-transformers checkpoint used to embed the Python-code chunks
modelPath1 = "sentence-transformers/all-MiniLM-L12-v2"

# Model options: run the computations on the CPU
model_kwargs1 = dict(device="cpu")

# Encoding options: leave the embedding vectors un-normalized
encode_kwargs1 = dict(normalize_embeddings=False)

# Embedding wrapper configured with the three settings above
embeddings1 = HuggingFaceEmbeddings(
    model_name=modelPath1,
    model_kwargs=model_kwargs1,
    encode_kwargs=encode_kwargs1,
)

  warn_deprecated(


In [5]:
# Embed one sample query with embeddings1 as a quick sanity check
text = "Create a count down function."
query_result1 = embeddings1.embed_query(text)

# Show only the first three components of the vector
query_result1[:3]

[-0.011383002623915672, 0.042903706431388855, -0.06694354861974716]

In [6]:
# Embed every chunk of dataset 1 and build a FAISS store from them
db = FAISS.from_documents(documents=docs1, embedding=embeddings1)

In [7]:
# Persist the dataset-1 vector database to the local "faiss_index" folder
db.save_local(folder_path="faiss_index")

## Dataset 2: Computer Science (Wikipedia)

In [8]:
# Hugging Face dataset of Wikipedia computer-science text, and the column used as page content
ds2 = "AlaaElhilo/Wikipedia_ComputerScience"
page_content_column2 = "Text"

# Build the loader, then pull the dataset into LangChain documents
loader2 = HuggingFaceDatasetLoader(path=ds2, page_content_column=page_content_column2)
data2 = loader2.load()

# Preview the first five documents
data2[:5]

Downloading readme: 100%|██████████| 50.0/50.0 [00:00<00:00, 351kB/s]
Downloading data: 100%|██████████| 3.15M/3.15M [00:00<00:00, 4.71MB/s]
Generating train split: 100%|██████████| 7609/7609 [00:00<00:00, 42630.37 examples/s]


[Document(metadata={'Number': 1}, page_content='"A computer is a machine that can be programmed to automatically carry out sequences of arithmetic or logical operations . Modern digital electronic computers can perform generic sets of operations known as programs. These programs enable computers to perform a wide range of tasks. The term computer system may refer to a nominally complete computer that includes the hardware, operating system, software, and peripheral equipment needed and used for full operation; or to a group of computers that are linked and function together, such as a computer network or computer cluster."'),
 Document(metadata={'Number': 2}, page_content='"A broad range of industrial and consumer products use computers as control systems, including simple special-purpose devices like microwave ovens and remote controls, and factory devices like industrial robots. Computers are at the core of general-purpose devices such as personal computers and mobile devices such as

In [9]:
# Splitter configuration: chunks of at most 1000 characters, with 150 characters
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)

# Break the loaded computer-science documents (data2) into chunks
docs2 = text_splitter.split_documents(data2)

# Preview the first two chunks
docs2[:2]

[Document(metadata={'Number': 1}, page_content='"A computer is a machine that can be programmed to automatically carry out sequences of arithmetic or logical operations . Modern digital electronic computers can perform generic sets of operations known as programs. These programs enable computers to perform a wide range of tasks. The term computer system may refer to a nominally complete computer that includes the hardware, operating system, software, and peripheral equipment needed and used for full operation; or to a group of computers that are linked and function together, such as a computer network or computer cluster."'),
 Document(metadata={'Number': 2}, page_content='"A broad range of industrial and consumer products use computers as control systems, including simple special-purpose devices like microwave ovens and remote controls, and factory devices like industrial robots. Computers are at the core of general-purpose devices such as personal computers and mobile devices such as

In [10]:
# Sentence-transformers checkpoint used to embed the computer-science chunks
# (the same checkpoint name as modelPath1)
modelPath2 = "sentence-transformers/all-MiniLM-L12-v2"

# Model options: run the computations on the CPU
model_kwargs2 = dict(device="cpu")

# Encoding options: leave the embedding vectors un-normalized
encode_kwargs2 = dict(normalize_embeddings=False)

# Embedding wrapper configured with the three settings above
embeddings2 = HuggingFaceEmbeddings(
    model_name=modelPath2,
    model_kwargs=model_kwargs2,
    encode_kwargs=encode_kwargs2,
)

In [11]:
# Embed the same sample query with embeddings2 as a quick sanity check
text = "Create a count down function."
query_result2 = embeddings2.embed_query(text)

# Show only the first three components of the vector
query_result2[:3]

[-0.011383002623915672, 0.042903706431388855, -0.06694354861974716]

In [12]:
# Embed every chunk of dataset 2 and build a FAISS store from them
db2 = FAISS.from_documents(documents=docs2, embedding=embeddings2)

In [13]:
# Persist the dataset-2 vector database to the local "faiss_index1" folder
db2.save_local(folder_path="faiss_index1")

## Merging the Two Vector Databases (VDBs) into One

In [14]:
# Use the second embedding wrapper when reloading both stores
embeddings = embeddings2

# Reload the two vector databases from their local folders.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing the
# saved store from disk; presumably this is pickle-based, so only keep it for index
# folders you created yourself - confirm against the LangChain docs.
vdb1 = FAISS.load_local(
    folder_path="./faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)
vdb2 = FAISS.load_local(
    folder_path="./faiss_index1",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
import uuid


def merge_faiss_indexes(vdb1, vdb2):
    """Merge ``vdb2`` into ``vdb1`` and return the combined store.

    Document IDs are read from each store's ``index_to_docstore_id`` mapping.
    Any ID present in both stores is replaced in ``vdb2`` by a fresh UUID
    before merging, so the merge never has to add an already-existing key.
    Colliding entries are re-keyed in place through the docstore's
    ``search`` / ``delete`` / ``add`` methods, so no document is re-embedded.

    Args:
        vdb1: Destination store; it is extended in place.
        vdb2: Source store. Its colliding IDs are rewritten in place.
            NOTE(review): the underlying merge presumably also drains vdb2's
            vectors, so treat vdb2 as consumed after this call - confirm.

    Returns:
        ``vdb1``, now also holding the entries of ``vdb2``. The store's own
        ``merge_from`` is called only for its side effect; its return value
        is not used.
    """
    # Collect the IDs each store currently maps its vectors to
    ids_vdb1 = set(vdb1.index_to_docstore_id.values())
    ids_vdb2 = set(vdb2.index_to_docstore_id.values())

    # Find overlapping IDs
    overlapping_ids = ids_vdb1 & ids_vdb2

    if overlapping_ids:
        print(f"Found {len(overlapping_ids)} overlapping IDs. Generating new IDs for the second index...")
        # Snapshot the items because the mapping's values are rewritten inside the loop
        for position, doc_id in list(vdb2.index_to_docstore_id.items()):
            if doc_id not in overlapping_ids:
                continue
            new_id = str(uuid.uuid4())
            # Move the document to its new key and repoint the vector position at it
            doc = vdb2.docstore.search(doc_id)
            vdb2.docstore.delete([doc_id])
            vdb2.docstore.add({new_id: doc})
            vdb2.index_to_docstore_id[position] = new_id

    vdb1.merge_from(vdb2)
    return vdb1

# Combine the two loaded vector stores with merge_faiss_indexes and keep
# whatever the helper returns as combine_vdb
combine_vdb = merge_faiss_indexes(vdb1=vdb1, vdb2=vdb2)