In [12]:
# imports

import numpy as np
import math
import os
import glob
from sklearn.manifold import TSNE
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import plotly.graph_objects as go

# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS


In [3]:
load_dotenv(override=True)

True

In [4]:
# Define the path to your content directory
path = "../documents/clean/"

# Use DirectoryLoader to recursively find and load all markdown files
loader = DirectoryLoader(path, glob="**/*.md", loader_cls=TextLoader)
docs = loader.load()

In [5]:
print(len(docs))

960


In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["#", "##", "###"],
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(docs)

In [7]:
print(f"Total number of chunks: {len(chunks)}")

Total number of chunks: 1238


In [8]:
chunk_content = [chunk.page_content for chunk in chunks]
longest_chunk = max(chunk_content)

In [9]:
print("Length:"+str(len(longest_chunk)))

Length:942


In [10]:
embeddings = OpenAIEmbeddings()

In [11]:
### SWITCH TO FAISS

# Define the path for the FAISS database
db_path = '../../db'

# Delete the FAISS directory if it already exists to start fresh
if os.path.exists(db_path):
    print(f"Removing existing FAISS directory at: {db_path}")
    shutil.rmtree(db_path)

# Ensure the parent directory exists
os.makedirs(os.path.dirname(db_path), exist_ok=True)

# Define a batch size
batch_size = 210  # Number of documents per batch

# Check if there are any chunks to process
if not chunks:
    print("No document chunks to process. Exiting.")
else:
    num_batches = math.ceil(len(chunks) / batch_size)

    # Create the vectorstore with the first batch
    print("Processing the first batch...")
    first_batch = chunks[:batch_size]
    vectorstore = FAISS.from_documents(
        documents=first_batch,
        embedding=embeddings
    )
    print(f"Batch 1 processed. Vectorstore now has {vectorstore.index.ntotal} documents.")

    # Add the remaining batches in a loop
    for i in range(1, num_batches):
        print(f"Processing batch {i+1}/{num_batches}...")
        start_index = i * batch_size
        end_index = start_index + batch_size
        next_batch = chunks[start_index:end_index]
        
        # Add the next batch of documents to the existing FAISS index
        vectorstore.add_documents(documents=next_batch)
        print(f"Batch {i+1} processed. Vectorstore now has {vectorstore.index.ntotal} documents.")

    # Save the final FAISS index to the specified path
    print(f"\nSaving FAISS index to {db_path}...")
    vectorstore.save_local(db_path)
    print("FAISS index saved successfully.")

Processing the first batch...
Batch 1 processed. Vectorstore now has 210 documents.
Processing batch 2/6...
Batch 2 processed. Vectorstore now has 420 documents.
Processing batch 3/6...
Batch 3 processed. Vectorstore now has 630 documents.
Processing batch 4/6...
Batch 4 processed. Vectorstore now has 840 documents.
Processing batch 5/6...
Batch 5 processed. Vectorstore now has 1050 documents.
Processing batch 6/6...
Batch 6 processed. Vectorstore now has 1238 documents.

Saving FAISS index to ../../db...
FAISS index saved successfully.


In [None]:
db_path = '../../vector_db'
# Delete if already exists

if os.path.exists(db_path):
    Chroma(persist_directory=db_path, embedding_function=embeddings).delete_collection()

# Define a batch size
batch_size = 210  # Number of documents per batch
num_batches = math.ceil(len(chunks) / batch_size)

# Create the vectorstore with the first batch
first_batch = chunks[:batch_size]
vectorstore = Chroma.from_documents(
    documents=first_batch,
    embedding=embeddings, 
    persist_directory=db_path
)
print(f"Batch 1 processed. Vectorstore now has {vectorstore._collection.count()} documents.")

# Add the remaining batches in a loop
for i in range(1, num_batches):
    start_index = i * batch_size
    end_index = start_index + batch_size
    next_batch = chunks[start_index:end_index]
    
    vectorstore.add_documents(documents=next_batch)
    print(f"Batch {i+1} processed. Vectorstore now has {vectorstore._collection.count()} documents.")

print("All batches processed successfully.")

In [None]:
# Get one vector and find how many dimensions it has

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Visualize
# Prework

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']

In [None]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, opacity=0.8),
    text=[f"Text: {d[:100]}..." for d in documents],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()