In [1]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Load the CSV data
df = pd.read_csv("combined_data.csv")

# Inspect the first few rows
print(df.head())


                                      document_title document_number  \
0  Uniform Administrative Requirements, Cost Prin...      2020-28429   
1  Notification of Inflation Adjustments for Civi...      2020-28942   
2  The Public Musical Works Database and Transpar...      2020-28958   
3  Prioritization and Allocation of Certain Scarc...      2020-29060   
4  Fisheries of the Northeastern United States; I...      2020-28898   

  publication_date                                            pdf_url  \
0       31-12-2020  https://www.govinfo.gov/content/pkg/FR-2020-12...   
1       31-12-2020  https://www.govinfo.gov/content/pkg/FR-2020-12...   
2       31-12-2020  https://www.govinfo.gov/content/pkg/FR-2020-12...   
3       31-12-2020  https://www.govinfo.gov/content/pkg/FR-2020-12...   
4       31-12-2020  https://www.govinfo.gov/content/pkg/FR-2020-12...   

                                            abstract  \
0  This document announces the availability of th...   
1  The Office of

In [3]:
df.columns

Index(['document_title', 'document_number', 'publication_date', 'pdf_url',
       'abstract', 'excerpts', 'dates', 'page_length', 'start_page',
       'end_page', 'agency_names', 'agency_ids', 'agency_urls',
       'agency_slugs'],
      dtype='object')

In [4]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [5]:
# Extract the 'abstract' column
abstracts = df['abstract'].tolist()

# Generate embeddings
embeddings = model.encode(abstracts, convert_to_tensor=True)

print("Generated Embeddings Shape:", embeddings.shape)


Generated Embeddings Shape: torch.Size([3640, 384])


In [8]:
import chromadb

# Initialize ChromaDB client with local persistent storage
chroma_client = chromadb.PersistentClient(path="local_chromadb_storage")
# Initialize ChromaDB client
chroma_client = chromadb.Client()

# Create a collection to store embeddings and metadata
collection = chroma_client.create_collection(name="abstract_embeddings")


In [9]:
# Prepare metadata as a list of dictionaries
metadata = df.drop(columns=['abstract']).to_dict(orient='records')

# Add embeddings and metadata to the collection
collection.add(
    embeddings=embeddings.tolist(),
    documents=abstracts,
    metadatas=metadata,
    ids=[str(i) for i in range(len(abstracts))]
)


In [12]:

query = "Fruad"
query_embedding = model.encode(query).tolist()

# Search for the top 3 most similar abstracts
results = collection.query(
    query_embeddings=[query_embedding],
    n_results = 5
)

# Display results with original metadata
for i, doc in enumerate(results['documents'][0]):
    print(f"\nResult {i+1}: {doc}")
    print("Metadata:")
    for key, value in results['metadatas'][0][i].items():
        print(f"  {key}: {value}")



Result 1: The State Department will hold a public meeting at 2 p.m.-3:30 p.m. (ET) on WebEx with the Bureau of Cyberspace and Digital Policy's International Information and Communications Policy (CDP/ICP) division. The purpose of the meeting is to brief stakeholders on CDP/ICP's past and upcoming international engagements. These include engagement at the International Telecommunication Union (ITU), the Organization of American States Inter-American Telecommunication Commission (CITEL), the Organization for Economic Cooperation and Development (OECD), the Asia Pacific Economic Cooperation (APEC) Forum Telecommunications and Information Working Group, the Group of Seven (G7) Digital & Tech Working Group, the Group of Twenty (G20) Digital Economy Task Force, and other multilateral processes and bilateral digital and ICT dialogues.
Metadata:
  agency_ids: 476
  agency_names: State Department
  agency_slugs: state-department
  agency_urls: https://www.federalregister.gov/agencies/state-dep