Generate Embeddings

In [5]:
import tensorflow_hub as hub
import numpy as np

# Load the Universal Sentence Encoder model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def get_embedding(text):
    return np.array(embed([text])[0])


Storing Embeddings in Faiss Vector Database

In [6]:
import faiss

class FaissDB:
    def __init__(self, embedding_dim):
        self.index = faiss.IndexFlatL2(embedding_dim)

    def add(self, embeddings, ids):
        """Add embeddings to the database."""
        self.index.add_with_ids(embeddings, ids)

    def search(self, embedding, k=1):
        """Find k most similar embeddings in the database."""
        distances, indices = self.index.search(embedding, k)
        return distances, indices


Creating Database Table to store embeddings

In [11]:
import psycopg2
from config.config_secrets import DATABASE_CREDENTIALS

DATABASE_CREDENTIALS = DATABASE_CREDENTIALS

def create_text_embeddings_table():
    try:
        conn = psycopg2.connect(**DATABASE_CREDENTIALS)
        cursor = conn.cursor()
        
        # Check if the 'dreams' table already exists
        cursor.execute("SELECT EXISTS(SELECT FROM information_schema.tables WHERE table_name='dreams');")
        table_exists = cursor.fetchone()[0]

        if not table_exists:
            create_table_query = '''
            CREATE TABLE TextEmbeddings (
                story_id TEXT PRIMARY KEY,
                culture_name TEXT,
                title TEXT,
                embedding REAL[] -- Assuming the embedding is a 1D array of REAL numbers
            );
            '''
            
            cursor.execute(create_table_query)
            conn.commit()
            
            print("Table 'TextEmbeddings' created successfully!")
        else:
            print("Table 'dreams' already exists.")

    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to PostgreSQL or executing the SQL command:", error)

    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()

if __name__ == "__main__":
    create_text_embeddings_table()


Table 'dreams' already exists.


Intergration

In [None]:
import faiss
import numpy as np
import psycopg2
import tensorflow_hub as hub
from hashlib import sha256
from config.config_secrets import DATABASE_CREDENTIALS

# Setting the Scene & Preparing the Wizard's Chamber:
database_info = DATABASE_CREDENTIALS
DATABASE_CREDENTIALS = DATABASE_CREDENTIALS

# Summoning the Magic Mirror: or Initialize the FAISS index
faiss_index = faiss.IndexFlatL2(512)

# Calling Upon a Knowledgeable Spirit: or Load Universal Sentence Encoder for embeddings
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# --- Helper Functions ---

# Understanding the Spirit's Insight:
def get_embedding(text):
    """Returns a 512-dimensional vector using Universal Sentence Encoder."""
    embeddings = embed([text])
    return embeddings.numpy()[0]

# Crafting a Unique Sigil:
def generate_unique_id(title):
    """Generate a unique ID using SHA256 hashing."""
    return sha256(title.encode()).hexdigest()

# Storing Treasures in the Vault:
def insert_into_postgresql(story_id, culture_name, title, embedding):
    """Insert metadata, culture_name, title, and embedding into PostgreSQL."""
    try:
        with psycopg2.connect(**DATABASE_CREDENTIALS) as conn:
            with conn.cursor() as cursor:
                insert_query = """
                INSERT INTO TextEmbeddings (story_id, culture_name, title, embedding)
                VALUES (%s, %s, %s, %s);
                """
                cursor.execute(insert_query, (story_id, culture_name, title, embedding))
            conn.commit()
    except Exception as e:
        print(f"Error inserting into PostgreSQL: {e}")

# Understanding the Essence:
def normalize_vector(embedding):
    return embedding / np.linalg.norm(embedding)

# Storing the Essence and the Story:
def insert_story_and_embedding(culture_name, title, text):
    embedding = get_embedding(text)  # Generate embedding from text
    embedding = normalize_vector(embedding)  # Normalize the embedding
    story_id = generate_unique_id(title)

    # Convert the NumPy array to a list **Remember to convert back into NumPy for FAISS
    embedding_list = embedding.tolist()

    # Store the story in PostgreSQL along with the embedding list
    insert_into_postgresql(story_id, culture_name, title, embedding_list)

    # Store the normalized embedding in Faiss
    faiss_index.add(np.array([embedding]))


# Seeking Similar Tales:
def get_similar_stories(query_text, top_k=5):
    query_embedding = get_embedding(query_text)
    query_embedding = normalize_vector(query_embedding).reshape(1, -1)  # Normalize the query embedding
    _, indices = faiss_index.search(query_embedding, top_k)
    
    story_ids = [story[0] for story in fetch_stories_by_ids(indices[0])]
    return story_ids

# Fetching Tales from the Vault:
def fetch_stories_by_ids(story_ids):
    """Fetch stories from PostgreSQL using a list of story IDs."""
    stories = []
    try:
        with psycopg2.connect(**DATABASE_CREDENTIALS) as conn:
            with conn.cursor() as cursor:
                select_query = """
                SELECT story_id, culture_name, title, embedding 
                FROM TextEmbeddings 
                WHERE story_id = ANY(%s);
                """
                cursor.execute(select_query, (story_ids,))
                stories = cursor.fetchall()
    except Exception as e:
        print(f"Error fetching stories from PostgreSQL: {e}")
    
    return stories

# Preserving the Magic Mirror's Knowledge:
def save_faiss_index(filename="faiss_index.index"):
    """Save the Faiss index to a file."""
    faiss.write_index(faiss_index, filename)

def load_faiss_index(filename="faiss_index.index"):
    """Load the Faiss index from a file."""
    global faiss_index
    faiss_index = faiss.read_index(filename)

# Modify this function to retrieve data from the PostgreSQL database
def process_data_from_database():
    try:
        with psycopg2.connect(**DATABASE_CREDENTIALS) as conn:
            with conn.cursor() as cursor:
                select_query = """
                SELECT culture_name, title, text
                FROM mythology;
                """
                cursor.execute(select_query)
                rows = cursor.fetchall()
                for row in rows:
                    culture_name, title, data = row
                    insert_story_and_embedding(culture_name, title, data)
    except Exception as e:
        print(f"Error fetching data from PostgreSQL: {e}")

# Call the modified data retrieval function
process_data_from_database()

try:
    load_faiss_index()
except Exception as e:
    print(f"Error: {e}")


In [10]:
import tensorflow_hub as hub
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
print("Model loaded successfully!")


Model loaded successfully!
