In [6]:
from sqlalchemy import create_engine
import pandas as pd
import uuid

# Sample data: short standalone sentences about the "LLM Powered Autonomous
# Agents" article. Each string becomes one row of the 'sentences' column when
# the DataFrame is built below.
data_list = ['LLM Powered Autonomous Agents is a concept discussed by Lilian Weng.',
 'The date of the article is June 23, 2023.',
 'The estimated reading time for the article is 31 minutes.',
 'Lilian Weng is the author of the article.',
 'Building agents with LLM as its core controller is a cool concept.',
 'Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer, and BabyAGI, serve as inspiring examples.',
 'The potentiality of LLM extends beyond generating well-written copies, stories, essays, and programs.',
 'LLM can be framed as a powerful general problem solver.',
 'In a LLM-powered autonomous agent system, LLM functions as the agent’s brain.',
 'Several key components complement LLM in the agent system.',
 'Planning is one of the key components of the agent system.',
 'Task decomposition allows the agent to break down large tasks into smaller, manageable subgoals.',
 'Self-reflection enables the agent to learn from mistakes and refine future actions.',
 'Memory is another key component of the agent system.',
 'Short-term memory is utilized for in-context learning.',
 'Long-term memory provides the agent with the capability to retain and recall information over extended periods.',
 'Tool use is a distinguishing characteristic of human beings.',
 'Equipping LLMs with external tools can significantly extend the model capabilities.',
 'MRKL is a neuro-symbolic architecture for autonomous agents.',
 'MRKL systems contain a collection of expert modules.',
 'The general-purpose LLM works as a router to route inquiries to the best suitable expert module.',
 'TALM and Toolformer fine-tune a language model to learn to use external tool APIs.',
 'ChatGPT Plugins and OpenAI API function calling are examples of LLMs augmented with tool use capability.',
 'HuggingGPT is a framework to use ChatGPT as the task planner.',
 'ChemCrow is a domain-specific example where LLM is augmented with expert-designed tools for scientific tasks.',
 'Generative Agents is an experiment where virtual characters controlled by LLM-powered agents interact in a sandbox environment.',
 'AutoGPT is a proof-of-concept demo for setting up autonomous agents with LLM as the main controller.',
 'The article discusses challenges such as finite context length and reliability of natural language interface.',
 'The article includes citations and references for further reading.']

# Build the frame in one expression: the sentence column first, then a fresh
# random UUID string per row, then the shared text identifier on every row.
df = pd.DataFrame({'sentences': data_list}).assign(
    uuid=lambda frame: [str(uuid.uuid4()) for _ in range(len(frame))],
    text_id="78a7d0ea-0d7d-4b99-827b-83fab343a86e",
)

# Engine bound to the local SQLite file test.db
engine = create_engine('sqlite:///test.db')

# Write the frame as the 'sentences' table, replacing any existing table of
# that name; the DataFrame index is not stored.
df.to_sql(name='sentences', con=engine, if_exists='replace', index=False)

print("Data uploaded successfully!")

Data uploaded successfully!


In [7]:
from sqlalchemy import create_engine, text

# SQLite file that holds the 'sentences' table
DATABASE_URL = "sqlite:///./test.db"

# Create an engine
engine = create_engine(DATABASE_URL)

# Fetch every (uuid, sentence) pair from the 'sentences' table in a single
# query rather than one SELECT per UUID. Nothing here is named `uuid`, so the
# imported `uuid` module stays usable if the upload cell is re-run.
with engine.connect() as connection:
    rows = connection.execute(
        text("SELECT uuid, sentences FROM sentences")
    ).fetchall()

# Row identifiers as strings, in the order the database returned them
ids = [str(row[0]) for row in rows]

# Map each UUID to its sentence; insertion order follows `ids`
page_contents = {str(row[0]): row[1] for row in rows}

# Separate the page contents into a list
sentences = list(page_contents.values())

In [8]:
# Display the sentences read back from the database
sentences

['LLM Powered Autonomous Agents is a concept discussed by Lilian Weng.',
 'The date of the article is June 23, 2023.',
 'The estimated reading time for the article is 31 minutes.',
 'Lilian Weng is the author of the article.',
 'Building agents with LLM as its core controller is a cool concept.',
 'Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer, and BabyAGI, serve as inspiring examples.',
 'The potentiality of LLM extends beyond generating well-written copies, stories, essays, and programs.',
 'LLM can be framed as a powerful general problem solver.',
 'In a LLM-powered autonomous agent system, LLM functions as the agent’s brain.',
 'Several key components complement LLM in the agent system.',
 'Planning is one of the key components of the agent system.',
 'Task decomposition allows the agent to break down large tasks into smaller, manageable subgoals.',
 'Self-reflection enables the agent to learn from mistakes and refine future actions.',
 'Memory is another key compo

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

# Credentials and endpoints come from the process environment (populated by
# load_dotenv() above); any variable that is not set is bound to None.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
ASTRA_DB_API_KEY = os.environ.get('ASTRA_DB_API_KEY')
ASTRA_DB_ENDPOINT = os.environ.get('ASTRA_DB_ENDPOINT')
ASTRA_DB_KEYSPACE = os.environ.get('ASTRA_DB_KEYSPACE')

# Chat model handle
model = ChatOpenAI(model='gpt-4o')

# Embedding client with library defaults, then one embedding per sentence
embeddings = OpenAIEmbeddings()
text_embeddings = embeddings.embed_documents(texts=sentences)

In [27]:
from langchain_community.vectorstores import SQLiteVec
from sqlalchemy import create_engine

# Build the vector store from the sentences.
# NOTE(review): SQLiteVec.from_texts appears to open its own sqlite3
# connection from `db_file` and to take the table name as `table`; the
# previous `connection=` / `table_name=` keyword arguments looked to be
# swallowed by **kwargs and ignored, so the store was written to the default
# table and the SQLAlchemy connection opened for it was never used or closed.
# Confirm against the installed langchain_community version.
db = SQLiteVec.from_texts(
    texts=sentences,
    embedding=embeddings,
    table='vector_store',
    db_file='vec.db',
)

In [35]:
# Question to look up in the vector store
query = "What is article about?"

# Ask the store for the five closest matches to the question
data = db.similarity_search(query=query, k=5)

# Leave the hits as the cell's last expression so the notebook renders them
data

[Document(metadata={}, page_content='The article includes citations and references for further reading.'),
 Document(metadata={}, page_content='The article discusses challenges such as finite context length and reliability of natural language interface.'),
 Document(metadata={}, page_content='Lilian Weng is the author of the article.'),
 Document(metadata={}, page_content='The date of the article is June 23, 2023.'),
 Document(metadata={}, page_content='The estimated reading time for the article is 31 minutes.')]

In [36]:
# Wrap the vector store in a retriever using its default settings
retriever = db.as_retriever()

# Run the same question through the retriever and display what comes back
retriever.invoke(input=query)

[Document(metadata={}, page_content='The article includes citations and references for further reading.'),
 Document(metadata={}, page_content='The article discusses challenges such as finite context length and reliability of natural language interface.'),
 Document(metadata={}, page_content='Lilian Weng is the author of the article.'),
 Document(metadata={}, page_content='The date of the article is June 23, 2023.')]

In [16]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Function to find the optimal number of clusters using silhouette score
def find_optimal_clusters(embeddings, max_k):
    """Return the KMeans cluster count with the highest silhouette score.

    Every k from 2 up to ``max_k`` is tried, but never more than
    ``n_samples - 1``: the silhouette score is only defined when the number
    of clusters lies between 2 and ``n_samples - 1``, so scanning further
    would fail inside scikit-learn on a small corpus.

    Args:
        embeddings: Sequence or array of embedding vectors, one per sample.
        max_k: Largest number of clusters to consider (inclusive).

    Returns:
        The candidate k with the best silhouette score. Ties go to the
        smallest k.

    Raises:
        ValueError: If there is no valid k to try, i.e. fewer than three
            samples were given or ``max_k`` is below 2.
    """
    # Prefer .shape when present so array-likes without len() still work
    shape = getattr(embeddings, "shape", None)
    n_samples = shape[0] if shape else len(embeddings)

    upper_k = min(max_k, n_samples - 1)
    if upper_k < 2:
        raise ValueError(
            "Need at least 3 samples and max_k >= 2 to compare silhouette "
            f"scores (got {n_samples} samples, max_k={max_k})."
        )

    iters = range(2, upper_k + 1)
    s_scores = []

    for k in iters:
        kmeans = KMeans(n_clusters=k, random_state=0).fit(embeddings)
        s_scores.append(silhouette_score(embeddings, kmeans.labels_))

    # list.index returns the first maximum, so ties resolve to the smallest k
    optimal_k = iters[s_scores.index(max(s_scores))]
    return optimal_k

# Upper bound on the number of clusters to try; adjust to suit the data
max_k = 10
optimal_clusters = find_optimal_clusters(text_embeddings, max_k)

# Refit KMeans at the chosen cluster count and keep the per-sentence labels
kmeans = KMeans(n_clusters=optimal_clusters, random_state=0)
kmeans.fit(text_embeddings)
cluster_assignments = kmeans.labels_

# One record per sentence: its text, its embedding vector and its cluster label
props_clustered = [
    dict(text=sentence, embeddings=vector, cluster=label)
    for sentence, vector, label in zip(sentences, text_embeddings, cluster_assignments)
]

# Print each cluster followed by the sentences assigned to it
for cluster in range(optimal_clusters):
    print(f"Cluster {cluster}:")
    for prop in props_clustered:
        if prop["cluster"] != cluster:
            continue
        print(f"  - {prop['text']}")
    print()

Cluster 0:
  - The date of the article is June 23, 2023.
  - The estimated reading time for the article is 31 minutes.
  - Lilian Weng is the author of the article.
  - The potentiality of LLM extends beyond generating well-written copies, stories, essays, and programs.
  - Planning is one of the key components of the agent system.
  - Task decomposition allows the agent to break down large tasks into smaller, manageable subgoals.
  - Self-reflection enables the agent to learn from mistakes and refine future actions.
  - Memory is another key component of the agent system.
  - Short-term memory is utilized for in-context learning.
  - Long-term memory provides the agent with the capability to retain and recall information over extended periods.
  - Tool use is a distinguishing characteristic of human beings.
  - The article discusses challenges such as finite context length and reliability of natural language interface.
  - The article includes citations and references for further read