In [15]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# Path to your CSV file
path = './scriptures/csv/lds-scriptures.csv'

# Load your CSV file
df = pd.read_csv(path)

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to split text into overlapping chunks
def chunk_text(text, chunk_size=512, overlap=128):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        if chunk:
            chunks.append(chunk)
        if i + chunk_size >= len(words):
            break
    return chunks

# Create a new DataFrame to store the embeddings and all original columns
embedding_data = []

for index, row in df.iterrows():
    # if index == 100:
    #     break

    # progres log 
    if index % 1000 == 0:
        progress = index / len(df) * 100
        print(f'Progress: {progress:.2f}%')

    scripture_text = row['scripture_text']
    chunks = chunk_text(scripture_text)
    for chunk_index, chunk in enumerate(chunks):
        embedding = model.encode(chunk).tolist()
        new_row = row.to_dict()
        new_row['chunk_id'] = f'{index}_{chunk_index}'
        new_row['chunk_text'] = chunk
        new_row['embedding'] = embedding
        embedding_data.append(new_row)

# Convert the embedding data to a DataFrame
embedding_df = pd.DataFrame(embedding_data)

# Save the DataFrame to a new CSV file
embedding_df.to_csv('./embeddings/lds-scriptures-chunked-embeddings.csv', index=False)


 



Progress: 0.00%
Progress: 2.38%
Progress: 4.76%
Progress: 7.14%
Progress: 9.52%
Progress: 11.91%
Progress: 14.29%
Progress: 16.67%
Progress: 19.05%
Progress: 21.43%
Progress: 23.81%
Progress: 26.19%
Progress: 28.57%
Progress: 30.96%
Progress: 33.34%
Progress: 35.72%
Progress: 38.10%
Progress: 40.48%
Progress: 42.86%
Progress: 45.24%
Progress: 47.62%
Progress: 50.01%
Progress: 52.39%
Progress: 54.77%
Progress: 57.15%
Progress: 59.53%
Progress: 61.91%
Progress: 64.29%
Progress: 66.67%
Progress: 69.06%
Progress: 71.44%
Progress: 73.82%
Progress: 76.20%
Progress: 78.58%
Progress: 80.96%
Progress: 83.34%
Progress: 85.72%
Progress: 88.11%
Progress: 90.49%
Progress: 92.87%
Progress: 95.25%
Progress: 97.63%


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# Load the embeddings DataFrame
embedding_df = pd.read_csv('./embeddings/lds-scriptures-chunked-embeddings.csv')

# Initialize ChromaDB client
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

# Create a collection
collection = chroma_client.get_or_create_collection('scriptures')

# Function to get metadata for a row
def get_metadata(row):
    metadata = {
        "volume_id": row['volume_id'],
        "book_id": row['book_id'],
        "chapter_id": row['chapter_id'],
        "verse_id": row['verse_id'],
        "volume_title": row['volume_title'],
        "book_title": row['book_title'],
        "volume_long_title": row['volume_long_title'],
        "book_long_title": row['book_long_title'],
        "volume_subtitle": row['volume_subtitle'],
        "book_subtitle": row['book_subtitle'],
        "volume_short_title": row['volume_short_title'],
        "book_short_title": row['book_short_title'],
        "volume_lds_url": row['volume_lds_url'],
        "book_lds_url": row['book_lds_url'],
        "chapter_number": row['chapter_number'],
        "verse_number": row['verse_number'],
        "scripture_text": row['scripture_text'],
        "verse_title": row['verse_title'],
        "verse_short_title": row['verse_short_title'],
        "chunk_id": row['chunk_id'],
    }
    return metadata

# Ensure 'chunk_id' exists and is unique
embedding_df['chunk_id'] = embedding_df.index

# Convert embeddings from string representation to list of floats
embedding_df['embedding'] = embedding_df['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=','))

# Define batch size
batch_size = 10000

# Function to add data in batches
def add_data_in_batches(df, batch_size):
    for start in range(0, len(df), batch_size):
        if start > 1:
            break
        end = min(start + batch_size, len(df))
        batch_df = df.iloc[start:end]
        collection.add(
            documents=[row["chunk_text"] for index, row in batch_df.iterrows()], 
            embeddings=[row["embedding"].tolist() for index, row in batch_df.iterrows()],
            metadatas=[get_metadata(row) for index, row in batch_df.iterrows()],
            ids=[str(row["chunk_id"]) for index, row in batch_df.iterrows()]
        )

# Add data to the collection in batches
add_data_in_batches(embedding_df, batch_size)




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# Path to your CSV file
path = './scriptures/csv/lds-scriptures.csv'

# Load your CSV file
df = pd.read_csv(path)

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize ChromaDB client
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

# Create a collection
collection = chroma_client.get_or_create_collection('scriptures')

# Function to store metadata and embedding in ChromaDB
def store_metadata(index, row, embedding):
    metadata = {
        "volume_id": row['volume_id'],
        "book_id": row['book_id'],
        "chapter_id": row['chapter_id'],
        "verse_id": row['verse_id'],
        "volume_title": row['volume_title'],
        "book_title": row['book_title'],
        "volume_long_title": row['volume_long_title'],
        "book_long_title": row['book_long_title'],
        "volume_subtitle": row['volume_subtitle'],
        "book_subtitle": row['book_subtitle'],
        "volume_short_title": row['volume_short_title'],
        "book_short_title": row['book_short_title'],
        "volume_lds_url": row['volume_lds_url'],
        "book_lds_url": row['book_lds_url'],
        "chapter_number": row['chapter_number'],
        "verse_number": row['verse_number'],
        "scripture_text": row['scripture_text'],
        "verse_title": row['verse_title'],
        "verse_short_title": row['verse_short_title']
    }
    collection.add(ids=[f"scripture_{index}"], embeddings=[embedding], metadatas=[metadata])

# Generate and store embeddings with metadata
for index, row in df.iterrows():
    embedding = model.encode(row['scripture_text']).tolist()
    store_metadata(index, row, embedding)

print("Embeddings and metadata stored in ChromaDB")




In [3]:
import chromadb
from chromaviz import visualize_collection

# Initialize ChromaDB client
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

# Create a collection
collection = chroma_client.get_or_create_collection('scriptures')

visualize_collection(collection)

 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [27/May/2024 13:41:03] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:41:03] "GET /assets/index-26d05a53.css HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:41:03] "GET /assets/index-351494fc.js HTTP/1.1" 200 -


           0         1         2         3         4         5         6    \
0     0.022823  0.065861 -0.036052 -0.017967 -0.010313  0.065761 -0.011444   
1    -0.004418  0.091539  0.067995  0.068877  0.023347 -0.038192 -0.010090   
2    -0.037773  0.120210  0.009190  0.044382  0.068671  0.038092  0.044289   
3     0.013303  0.126572 -0.006762 -0.055223 -0.164984  0.056559  0.061735   
4     0.044378  0.111903 -0.050361  0.011581 -0.001332 -0.034457  0.030829   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.010052  0.147688  0.018472 -0.064112 -0.067290 -0.078391  0.051142   
9996  0.003477  0.126376  0.047970 -0.018702  0.062161 -0.009174  0.017889   
9997  0.046349  0.084094 -0.037882  0.004399 -0.049076 -0.096141 -0.016350   
9998  0.023923  0.142483  0.010234 -0.038099 -0.040133 -0.040321  0.022106   
9999  0.006446  0.124298  0.028744  0.020818 -0.045881 -0.026334 -0.037289   

           7         8         9    ...       374       375    

127.0.0.1 - - [27/May/2024 13:42:09] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:09] "GET /assets/index-26d05a53.css HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:09] "GET /assets/index-351494fc.js HTTP/1.1" 200 -


           0         1         2         3         4         5         6    \
0     0.022823  0.065861 -0.036052 -0.017967 -0.010313  0.065761 -0.011444   
1    -0.004418  0.091539  0.067995  0.068877  0.023347 -0.038192 -0.010090   
2    -0.037773  0.120210  0.009190  0.044382  0.068671  0.038092  0.044289   
3     0.013303  0.126572 -0.006762 -0.055223 -0.164984  0.056559  0.061735   
4     0.044378  0.111903 -0.050361  0.011581 -0.001332 -0.034457  0.030829   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.010052  0.147688  0.018472 -0.064112 -0.067290 -0.078391  0.051142   
9996  0.003477  0.126376  0.047970 -0.018702  0.062161 -0.009174  0.017889   
9997  0.046349  0.084094 -0.037882  0.004399 -0.049076 -0.096141 -0.016350   
9998  0.023923  0.142483  0.010234 -0.038099 -0.040133 -0.040321  0.022106   
9999  0.006446  0.124298  0.028744  0.020818 -0.045881 -0.026334 -0.037289   

           7         8         9    ...       374       375    

127.0.0.1 - - [27/May/2024 13:42:11] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:11] "GET /assets/index-26d05a53.css HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:11] "GET /assets/index-351494fc.js HTTP/1.1" 200 -


           0         1         2         3         4         5         6    \
0     0.022823  0.065861 -0.036052 -0.017967 -0.010313  0.065761 -0.011444   
1    -0.004418  0.091539  0.067995  0.068877  0.023347 -0.038192 -0.010090   
2    -0.037773  0.120210  0.009190  0.044382  0.068671  0.038092  0.044289   
3     0.013303  0.126572 -0.006762 -0.055223 -0.164984  0.056559  0.061735   
4     0.044378  0.111903 -0.050361  0.011581 -0.001332 -0.034457  0.030829   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.010052  0.147688  0.018472 -0.064112 -0.067290 -0.078391  0.051142   
9996  0.003477  0.126376  0.047970 -0.018702  0.062161 -0.009174  0.017889   
9997  0.046349  0.084094 -0.037882  0.004399 -0.049076 -0.096141 -0.016350   
9998  0.023923  0.142483  0.010234 -0.038099 -0.040133 -0.040321  0.022106   
9999  0.006446  0.124298  0.028744  0.020818 -0.045881 -0.026334 -0.037289   

           7         8         9    ...       374       375    

127.0.0.1 - - [27/May/2024 13:42:12] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:12] "GET /assets/index-351494fc.js HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:12] "GET /assets/index-26d05a53.css HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:13] "GET / HTTP/1.1" 200 -


           0         1         2         3         4         5         6    \
0     0.022823  0.065861 -0.036052 -0.017967 -0.010313  0.065761 -0.011444   
1    -0.004418  0.091539  0.067995  0.068877  0.023347 -0.038192 -0.010090   
2    -0.037773  0.120210  0.009190  0.044382  0.068671  0.038092  0.044289   
3     0.013303  0.126572 -0.006762 -0.055223 -0.164984  0.056559  0.061735   
4     0.044378  0.111903 -0.050361  0.011581 -0.001332 -0.034457  0.030829   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.010052  0.147688  0.018472 -0.064112 -0.067290 -0.078391  0.051142   
9996  0.003477  0.126376  0.047970 -0.018702  0.062161 -0.009174  0.017889   
9997  0.046349  0.084094 -0.037882  0.004399 -0.049076 -0.096141 -0.016350   
9998  0.023923  0.142483  0.010234 -0.038099 -0.040133 -0.040321  0.022106   
9999  0.006446  0.124298  0.028744  0.020818 -0.045881 -0.026334 -0.037289   

           7         8         9    ...       374       375    

127.0.0.1 - - [27/May/2024 13:42:13] "GET /assets/index-351494fc.js HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:13] "GET /assets/index-26d05a53.css HTTP/1.1" 200 -


Cumulative explained variation for 50 principal components: 0.5947753820143803


127.0.0.1 - - [27/May/2024 13:42:15] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:15] "GET /assets/index-351494fc.js HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:42:15] "GET /assets/index-26d05a53.css HTTP/1.1" 200 -


           0         1         2         3         4         5         6    \
0     0.022823  0.065861 -0.036052 -0.017967 -0.010313  0.065761 -0.011444   
1    -0.004418  0.091539  0.067995  0.068877  0.023347 -0.038192 -0.010090   
2    -0.037773  0.120210  0.009190  0.044382  0.068671  0.038092  0.044289   
3     0.013303  0.126572 -0.006762 -0.055223 -0.164984  0.056559  0.061735   
4     0.044378  0.111903 -0.050361  0.011581 -0.001332 -0.034457  0.030829   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.010052  0.147688  0.018472 -0.064112 -0.067290 -0.078391  0.051142   
9996  0.003477  0.126376  0.047970 -0.018702  0.062161 -0.009174  0.017889   
9997  0.046349  0.084094 -0.037882  0.004399 -0.049076 -0.096141 -0.016350   
9998  0.023923  0.142483  0.010234 -0.038099 -0.040133 -0.040321  0.022106   
9999  0.006446  0.124298  0.028744  0.020818 -0.045881 -0.026334 -0.037289   

           7         8         9    ...       374       375    

127.0.0.1 - - [27/May/2024 13:42:15] "GET /assets/glasses-5d966a6f.svg HTTP/1.1" 200 -


Cumulative explained variation for 50 principal components: 0.5947753820143803
           0         1         2         3         4         5         6    \
0     0.022823  0.065861 -0.036052 -0.017967 -0.010313  0.065761 -0.011444   
1    -0.004418  0.091539  0.067995  0.068877  0.023347 -0.038192 -0.010090   
2    -0.037773  0.120210  0.009190  0.044382  0.068671  0.038092  0.044289   
3     0.013303  0.126572 -0.006762 -0.055223 -0.164984  0.056559  0.061735   
4     0.044378  0.111903 -0.050361  0.011581 -0.001332 -0.034457  0.030829   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.010052  0.147688  0.018472 -0.064112 -0.067290 -0.078391  0.051142   
9996  0.003477  0.126376  0.047970 -0.018702  0.062161 -0.009174  0.017889   
9997  0.046349  0.084094 -0.037882  0.004399 -0.049076 -0.096141 -0.016350   
9998  0.023923  0.142483  0.010234 -0.038099 -0.040133 -0.040321  0.022106   
9999  0.006446  0.124298  0.028744  0.020818 -0.045881 -0.02633



t-SNE done! Time elapsed: 83.35424017906189 seconds


127.0.0.1 - - [27/May/2024 13:42:27] "GET /data HTTP/1.1" 200 -


t-SNE done! Time elapsed: 98.90042877197266 seconds


127.0.0.1 - - [27/May/2024 13:43:50] "GET /data HTTP/1.1" 200 -


t-SNE done! Time elapsed: 98.142733335495 seconds


127.0.0.1 - - [27/May/2024 13:43:50] "GET /data HTTP/1.1" 200 -


t-SNE done! Time elapsed: 98.61967778205872 seconds


127.0.0.1 - - [27/May/2024 13:43:52] "GET /data HTTP/1.1" 200 -
127.0.0.1 - - [27/May/2024 13:43:54] "GET /data HTTP/1.1" 200 -


t-SNE done! Time elapsed: 97.16350960731506 seconds


127.0.0.1 - - [27/May/2024 13:43:54] "GET /data HTTP/1.1" 200 -


t-SNE done! Time elapsed: 99.31933450698853 seconds


In [4]:
# Create csv containing the embeddings and id as text, where the embeddings are pca reduced to 2 dimensions

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Load the embeddings DataFrame
embedding_df = pd.read_csv('./embeddings/lds-scriptures-chunked-embeddings.csv')

# Convert embeddings from string representation to list of floats
embedding_df['embedding'] = embedding_df['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=','))
# Initialize the PCA model
pca = PCA(n_components=2)

# Fit the PCA model to the embeddings
pca.fit(embedding_df['embedding'].tolist())

# Transform the embeddings to 2 dimensions
embedding_df['embedding_2d'] = pca.transform(embedding_df['embedding'].tolist()).tolist()

# Save the DataFrame to a new CSV file columns should be id, x, y
embedding_df['id'] = embedding_df['scripture_text']
embedding_df['x'] = embedding_df['embedding_2d'].apply(lambda x: x[0])
embedding_df['y'] = embedding_df['embedding_2d'].apply(lambda x: x[1])

embedding_df[['id', 'x', 'y']].to_csv('./embeddings/lds-scriptures-chunked-embeddings-2d.csv', index=False)



In [5]:

# now do 3d

import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Load the embeddings DataFrame
embedding_df = pd.read_csv('./embeddings/lds-scriptures-chunked-embeddings.csv')

# Convert embeddings from string representation to list of floats
embedding_df['embedding'] = embedding_df['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=','))
# Initialize the PCA model
pca = PCA(n_components=3)

# Fit the PCA model to the embeddings
pca.fit(embedding_df['embedding'].tolist())

# Transform the embeddings to 3 dimensions
embedding_df['embedding_3d'] = pca.transform(embedding_df['embedding'].tolist()).tolist()

# Save the DataFrame to a new CSV file columns should be id, x, y, z
embedding_df['id'] = embedding_df['scripture_text']

embedding_df['x'] = embedding_df['embedding_3d'].apply(lambda x: x[0])
embedding_df['y'] = embedding_df['embedding_3d'].apply(lambda x: x[1])
embedding_df['z'] = embedding_df['embedding_3d'].apply(lambda x: x[2])

embedding_df[['id', 'x', 'y', 'z']].to_csv('./embeddings/lds-scriptures-chunked-embeddings-3d.csv', index=False)

