## Langchain
%pip install -U langchain-community langgraph langchain-anthropic tavily-python langgraph-checkpoint-sqlite
pip install -qU langchain-postgres

## Langchain With PGVector
* pip install -qU langchain-postgres
* %pip install -U langchain-community langgraph langchain-anthropic tavily-python langgraph-checkpoint-sqlite
* %pip install -qU langchain-groq
* %pip install -qU langchain-ollama

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()

Note: you may need to restart the kernel to use updated packages.


In [8]:
import psycopg2
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

load_dotenv()

# Load the MiniLM embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

DATABASE_CONFIG = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST', 'localhost'),  # Default to localhost if not set
    'port': os.getenv('DB_PORT', '5432')
}

# Connect to the database ** is used to unpack dictionary
conn = psycopg2.connect(**DATABASE_CONFIG)
cur = conn.cursor()

# Create a table for our documents
cur.execute("""
    CREATE TABLE IF NOT EXISTS similarity_search_pdf (
        id SERIAL PRIMARY KEY,
        content TEXT,
        embedding FLOAT8[]
    )
""")

# Function to get embeddings using MiniLM
def get_embedding(text):
    embedding = model.encode(text, convert_to_numpy=True)
    return embedding.tolist()  # Convert to list for storage in PostgreSQL

# Function to add a document
def add_document(content):
    embedding = get_embedding(content)
    cur.execute("INSERT INTO similarity_search_pdf (content, embedding) VALUES (%s, %s)", (content, embedding))
    conn.commit()

# Function to search for similar documents
def search_documents(query, limit=5):
    query_embedding = np.array(get_embedding(query))
    cur.execute("SELECT content, embedding FROM similarity_search_pdf")
    results = cur.fetchall()
    
    # Calculate cosine similarity between query and each document
    scores = [(content, np.dot(query_embedding, np.array(embedding)) / (np.linalg.norm(query_embedding) * np.linalg.norm(embedding)))
              for content, embedding in results]
    # Sort by similarity (highest cosine similarity first)
    scores.sort(key=lambda x: x[1], reverse=True)
    
    # Return the top `limit` results
    return scores[:limit]

# Add some sample documents
sample_docs = [
    "The quick brown fox jumps over the lazy dog.",
    "Python is a high-level programming language.",
    "Vector databases are essential for modern AI applications.",
    "PostgreSQL is a powerful open-source relational database.",
]
for doc in sample_docs:
    add_document(doc)

# Perform a search
search_query = "Tell me about programming languages"
results = search_documents(search_query)
print(f"Search results for: '{search_query}'")
for i, (content, similarity) in enumerate(results, 1):
    print(f"{i}. {content} (Similarity: {similarity:.4f})")

# Clean up
cur.close()
conn.close()


Search results for: 'Tell me about programming languages'
1. Python is a high-level programming language. (Similarity: 0.6264)
2. Python is a high-level programming language. (Similarity: 0.6264)
3. Python is a high-level programming language. (Similarity: 0.6264)
4. PostgreSQL is a powerful open-source relational database. (Similarity: 0.2716)
5. PostgreSQL is a powerful open-source relational database. (Similarity: 0.2716)
