## Data Pipeline - Azure Database for PostgreSQL

### Prerequisites

- Generate embeddings - [generate_embeddings.ipynb](../common/generate_embeddings.ipynb) 

#### Set environment variables

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

pg_host  = os.getenv("POSTGRESQL_HOST")
if pg_host is None or pg_host == "":
    print("POSTGRESQL_HOST environment variable not set.")
    exit()

pg_user  = os.getenv("POSTGRESQL_USERNAME")
if pg_user is None or pg_user == "":
    print("POSTGRESQL_USERNAME environment variable not set.")
    exit()

pg_password  = os.getenv("POSTGRESQL_PASSWORD")
if pg_password is None or pg_password == "":
    print("POSTGRESQL_PASSWORD environment variable not set.")
    exit()

db_name  = os.getenv("POSTGRESQL_DATABASE")
if db_name is None or db_name == "":
    print("POSTGRESQL_DATABASE environment variable not set.")
    exit()

text_table_name = 'text_sample'
doc_table_name = 'doc_sample'
image_table_name = 'image_sample'

postgresql_params = {
    "host": pg_host,
    "port": "5432", 
    "dbname": db_name,
    "user": pg_user,
    "password": pg_password
}

POSTGRESQL_HOST environment variable not set.
POSTGRESQL_USERNAME environment variable not set.
POSTGRESQL_PASSWORD environment variable not set.
POSTGRESQL_DATABASE environment variable not set.


: 

#### Add vector extension

The vector extension needs to be enabled in every database.

In [None]:
from psycopg2 import connect

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        cursor.execute('CREATE EXTENSION IF NOT EXISTS vector;')
        
        print('Vector extension added.')

#### Create table

In [None]:
from psycopg2 import connect

def create_table(table_name, table_schema):

    with connect(**postgresql_params) as connection:
        with connection.cursor() as cursor:
            cursor.execute(f"DROP TABLE IF  EXISTS {table_name};")
            cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_name} ({table_schema});")

            print(f"Table {table_name} created.")

## Create text_sample table
table_schema = """
    id smallint PRIMARY KEY,
    title text,
    content text,
    category text,
    title_vector VECTOR(1536),
    content_vector VECTOR(1536)
 """
create_table(text_table_name, table_schema)

## Create doc_sample table
table_schema = """
    id smallint PRIMARY KEY,
    chunk_content text,
    chunk_content_vector VECTOR(1536)
 """
create_table(doc_table_name, table_schema)

## Create image_sample table
table_schema = """
    id smallint PRIMARY KEY,
    image text,
    image_vector VECTOR(1024)
 """
create_table(image_table_name, table_schema)

#### Ingest text sample with embeddings

In [None]:
import pandas as pd
from psycopg2 import connect

text_df = pd.read_json('../data/text/product_docs_embeddings.json')
records = text_df.values.tolist()

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        insert_sql = f"INSERT INTO {text_table_name}(id, title, content, category, title_vector, content_vector) VALUES(%s, %s, %s, %s, %s, %s);"
        cursor.executemany(insert_sql, records)

        print("Text sample ingested.")

#### Ingest doc sample with embeddings

In [None]:
import pandas as pd
from psycopg2 import connect

doc_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')
records = doc_df.values.tolist()

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        insert_sql = f"INSERT INTO {doc_table_name}(id, chunk_content, chunk_content_vector) VALUES(%s, %s, %s)"
        cursor.executemany(insert_sql, records)

        print("Doc sample ingested.")

#### Ingest image sample with embeddings

In [None]:
import pandas as pd
from psycopg2 import connect

image_df = pd.read_json('../data/images/images_embeddings.json')
records = image_df.values.tolist()

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        insert_sql = f"INSERT INTO {image_table_name}(id, image, image_vector) VALUES(%s, %s, %s)"
        cursor.executemany(insert_sql, records)

        print("Image sample ingested.")

#### Create HNSW Index

Details - https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw

In [None]:
from psycopg2 import connect

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        index_query = f"""
            CREATE INDEX ON {text_table_name} USING hnsw (content_vector vector_l2_ops) WITH (m = 4, ef_construction = 400);
        """
        cursor.execute(index_query)

        print(f"HNSW index created for {text_table_name}.")

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        index_query = f"""
            CREATE INDEX ON {doc_table_name} USING hnsw (chunk_content_vector vector_l2_ops) WITH (m = 4, ef_construction = 400);
        """
        cursor.execute(index_query)

        print(f"HNSW index created for {doc_table_name}.")

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        index_query = f"""
            CREATE INDEX ON {image_table_name} USING hnsw (image_vector vector_l2_ops) WITH (m = 4, ef_construction = 400);
        """
        cursor.execute(index_query)

        print(f"HNSW index created for {image_table_name}.")

