In [1]:
import os
from dotenv import load_dotenv
import json
import psycopg2
from psycopg2.extras import Json, execute_batch
from azure.storage.blob import BlobServiceClient
from langchain_openai import OpenAIEmbeddings
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
from typing import List, Dict, Any

# Load environment variables
load_dotenv()

# Print a message to confirm the cell has run
print("Libraries imported and environment variables loaded.")

Libraries imported and environment variables loaded.


In [2]:
# Azure Blob Storage setup
azure_connection_string = os.getenv("AZURE_CONNECTION_STRING")
container_name = os.getenv("ADLS_CONTAINER_NAME")
blob_service_client = BlobServiceClient.from_connection_string(azure_connection_string)
container_client = blob_service_client.get_container_client(container_name)

# PostgreSQL connection setup
POSTGRESQL_CONNECTION = f"postgresql://{os.getenv('PG_ADMIN_USERNAME')}:{os.getenv('PG_ADMIN_PASSWORD')}@{os.getenv('PG_SERVER_NAME')}:{os.getenv('POSTGRES_PORT')}/{os.getenv('PG_DATABASE')}"

# OpenAI Embeddings setup
openai_api_key = os.getenv("OPENAI_API_KEY")
embedding_model = OpenAIEmbeddings(api_key=openai_api_key, model="text-embedding-ada-002")

print("Azure Blob Storage, PostgreSQL, and OpenAI Embeddings set up.")

Azure Blob Storage, PostgreSQL, and OpenAI Embeddings set up.


In [3]:
def connect_to_db():
    return psycopg2.connect(POSTGRESQL_CONNECTION)

def list_blobs_in_directory(directory_name):
    return [blob.name for blob in container_client.list_blobs(name_starts_with=directory_name) if '.' in blob.name]

def download_blob_to_string(blob_name):
    blob_client = container_client.get_blob_client(blob_name)
    return blob_client.download_blob().readall().decode("utf-8")

api_call_count = 0

def process_questions_batch(questions_data: List[Dict[Any, Any]], batch_size: int = 100):
    global api_call_count
    start_time = time.time()

    all_texts = []
    for question_data in questions_data:
        all_texts.append(question_data['question_full'])
        for answer_data in question_data['answers']:
            all_texts.append(answer_data['answer'])

    # Process embeddings in batches
    all_embeddings = []
    for i in range(0, len(all_texts), batch_size):
        batch = all_texts[i:i+batch_size]
        embed_start = time.time()
        batch_embeddings = embedding_model.embed_documents(batch)
        embed_end = time.time()
        print(f"Batch embedding time: {embed_end - embed_start:.2f} seconds")
        all_embeddings.extend(batch_embeddings)
        api_call_count += 1

    results = []
    embedding_index = 0
    for question_data in questions_data:
        question_id = question_data['question_id']
        topic = question_data['topic']
        question_title = question_data['question_title']
        question_full = question_data['question_full']
        answers = question_data['answers']

        question_embedding = all_embeddings[embedding_index]
        embedding_index += 1

        answer_embeddings = []
        answer_sources = []
        for answer_data in answers:
            answer_source = answer_data['source']
            answer_embedding = all_embeddings[embedding_index]
            embedding_index += 1
            answer_embeddings.append(answer_embedding)
            answer_sources.append(answer_source)

        combined_answer_embedding = [sum(x) / len(x) for x in zip(*answer_embeddings)]

        metadata = {
            "topic": topic,
            "question_title": question_title,
            "sources": answer_sources
        }

        results.append((question_id, question_full, Json(answers), question_embedding, combined_answer_embedding, Json(metadata)))

    end_time = time.time()
    print(f"Total batch processing time: {end_time - start_time:.2f} seconds")
    print(f"API calls made: {api_call_count}")

    return results

def process_blob(blob_name):
    blob_content = download_blob_to_string(blob_name)
    json_data = json.loads(blob_content)
    return process_questions_batch(json_data)

def insert_batch(conn, data_batch):
    with conn.cursor() as cur:
        execute_batch(cur, """
            INSERT INTO talk (question_id, question_full, answers, question_vector, answers_vector, metadata)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (question_id) DO NOTHING;
        """, data_batch)
    conn.commit()

print("Helper functions defined.")

Helper functions defined.


In [4]:
directories = ["counsel_chat_data/", "mentalhealth_data/"]
all_blobs = []
for directory in directories:
    all_blobs.extend(list_blobs_in_directory(directory))

processed_data = []
with ThreadPoolExecutor(max_workers=5) as executor:
    future_to_blob = {executor.submit(process_blob, blob_name): blob_name for blob_name in all_blobs}
    for future in tqdm(as_completed(future_to_blob), total=len(all_blobs), desc="Processing blobs"):
        blob_name = future_to_blob[future]
        try:
            processed_data.extend(future.result())
        except Exception as exc:
            print(f'{blob_name} generated an exception: {exc}')

print(f"Processed {len(processed_data)} items.")

Processing blobs:   0%|                                                                                            | 0/11 [00:00<?, ?it/s]

Batch embedding time: 1.60 seconds
Batch embedding time: 1.40 seconds
Batch embedding time: 1.44 seconds
Batch embedding time: 1.41 seconds
Batch embedding time: 1.31 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.89 seconds
Batch embedding time: 1.37 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 1.39 seconds
Batch embedding time: 1.07 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 1.40 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 1.14 seconds
Batch embedding time: 0.89 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 1.33 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 1.08 seconds
Batch embedding time

Processing blobs:   9%|███████▋                                                                            | 1/11 [00:56<09:29, 56.92s/it]

Total batch processing time: 54.78 seconds
API calls made: 224
Batch embedding time: 1.16 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 1.02 seconds
Batch embedding time: 1.19 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 1.04 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 1.32 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.86 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 1.00 seconds
Batch embedding time: 1.24 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 1.07 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 1.46 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.92 

Processing blobs:  18%|██████████████▋                                                                  | 2/11 [17:00<1:28:30, 590.01s/it]

Total batch processing time: 1006.42 seconds
API calls made: 3127
Batch embedding time: 7.11 seconds
Batch embedding time: 5.55 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 4.55 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 1.11 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 1.00 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 1.08 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 1.28 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 1.26 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 1.12 seconds
Batch embedding time: 1.12 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 1.01 seconds
Batch embedding time: 1.01 seconds
Batch embedding time: 1.09 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 1.00 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 0.

Processing blobs:  27%|██████████████████████▋                                                            | 3/11 [17:59<46:21, 347.64s/it]

Total batch processing time: 1065.03 seconds
API calls made: 3321
Batch embedding time: 4.22 seconds
Batch embedding time: 4.14 seconds
Batch embedding time: 4.15 seconds
Batch embedding time: 4.94 seconds
Batch embedding time: 1.08 seconds
Batch embedding time: 1.12 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 1.09 seconds
Batch embedding time: 1.01 seconds
Batch embedding time: 1.58 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 1.04 seconds
Batch embedding time: 1.08 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 1.00 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 1.01 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.78 seconds
Batch embedding time: 1.02 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.72 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 1.

Processing blobs:  36%|██████████████████████████████▏                                                    | 4/11 [19:38<29:07, 249.67s/it]

Total batch processing time: 1115.20 seconds
API calls made: 3594
Batch embedding time: 5.19 seconds
Batch embedding time: 4.00 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.88 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 1.03 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 1.02 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.89 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.53 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.

Processing blobs:  45%|█████████████████████████████████████▋                                             | 5/11 [39:21<58:36, 586.08s/it]

Total batch processing time: 2347.25 seconds
API calls made: 6522
Batch embedding time: 8.42 seconds
Batch embedding time: 7.35 seconds
Batch embedding time: 6.62 seconds
Batch embedding time: 1.11 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 2.18 seconds
Batch embedding time: 1.27 seconds
Batch embedding time: 1.02 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 1.11 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 1.12 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 1.18 secondsBatch embedding time: 1.11 seconds

Batch embedding time: 1.03 seconds
Batch embedding time: 1.07 seconds
Batch embedding time: 1.09 seconds
Batch embedding time: 1.11 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 1.04 seconds
Batch embedding time: 4.10 seconds
Batch embedding time: 1.09 seconds
Batch embedding time: 1.22 seconds
Batch embedding time: 1.

Processing blobs:  55%|█████████████████████████████████████████████▎                                     | 6/11 [40:17<33:49, 405.92s/it]

Batch embedding time: 6.04 seconds
Batch embedding time: 5.55 seconds
Batch embedding time: 5.33 seconds
Batch embedding time: 1.09 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 1.29 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.87 seconds
Batch embedding time: 0.85 seconds
Batch embedding time: 1.10 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 1.07 seconds
Batch embedding time: 1.02 seconds
Batch embedding time: 1.11 seconds
Batch embedding time: 1.14 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 1.10 seconds
Batch embedding time: 1.50 seconds
Batch embedding time: 1.08 seconds
Batch embedding time: 1.19 seconds
Batch embedding time: 1.02 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 1.15 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 1.18 seconds
Batch embedding time

Processing blobs:  64%|████████████████████████████████████████████████████▊                              | 7/11 [44:58<24:20, 365.04s/it]

Batch embedding time: 8.04 seconds
Batch embedding time: 6.16 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 1.33 seconds
Batch embedding time: 0.52 seconds
Batch embedding time: 0.43 seconds
Batch embedding time: 1.00 seconds
Batch embedding time: 1.19 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 1.11 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.81 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 1.18 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.85 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 1.02 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 1.01 seconds
Batch embedding time

Processing blobs:  73%|████████████████████████████████████████████████████████████▎                      | 8/11 [53:20<20:25, 408.57s/it]

Total batch processing time: 2115.02 seconds
API calls made: 8200
Batch embedding time: 4.17 seconds
Batch embedding time: 3.92 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.77 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 1.07 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.85 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.89 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 1.10 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.

Processing blobs:  82%|██████████████████████████████████████████████████████████████████▎              | 9/11 [1:02:39<15:11, 455.78s/it]

Total batch processing time: 1392.94 seconds
API calls made: 8956
Batch embedding time: 5.59 seconds
Batch embedding time: 3.77 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.88 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.85 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.83 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.87 seconds
Batch embedding time: 1.04 seconds
Batch embedding time: 0.89 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.53 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 1.22 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 0.94 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 0.

Processing blobs:  91%|████████████████████████████████████████████████████████████████████████▋       | 10/11 [1:03:41<05:34, 334.19s/it]

Batch embedding time: 3.94 seconds
Batch embedding time: 0.97 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 1.06 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.39 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.91 seconds
Batch embedding time: 0.88 seconds
Batch embedding time: 1.05 seconds
Batch embedding time: 0.89 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.84 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.95 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.89 seconds
Batch embedding time: 0.93 seconds
Batch embedding time: 0.92 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 0.90 seconds
Batch embedding time: 0.99 seconds
Batch embedding time: 0.98 seconds
Batch embedding time: 0.96 seconds
Batch embedding time: 1.00 seconds
Batch embedding time: 1.22 seconds
Batch embedding time

Processing blobs: 100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [1:20:55<00:00, 441.42s/it]

Total batch processing time: 2430.03 seconds
API calls made: 9402
Processed 7195 items.





In [5]:
conn = connect_to_db()
try:
    batch_size = 100
    for i in tqdm(range(0, len(processed_data), batch_size), desc="Inserting batches"):
        batch = processed_data[i:i+batch_size]
        insert_batch(conn, batch)
    print("Data insertion complete.")
finally:
    conn.close()
    print("Database connection closed.")

Inserting batches: 100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [05:57<00:00,  4.96s/it]

Data insertion complete.
Database connection closed.





In [None]:
# CHECKS

In [6]:
# Check database record count

import psycopg2
from psycopg2.extras import Json

def check_database_record_count():
    # Connect to the database
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    
    # Create a cursor to execute SQL commands
    cursor = connection.cursor()
    
    try:
        # Execute a simple COUNT query
        cursor.execute("SELECT COUNT(*) FROM talk")
        
        # Fetch the result (it will be a tuple with one item)
        result = cursor.fetchone()
        
        # The count is the first (and only) item in the tuple
        count = result[0]
        
        print(f"Total records in the database: {count}")
        return count
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        # Always close the cursor and connection
        cursor.close()
        connection.close()

# Run the check
total_records = check_database_record_count()

Total records in the database: 7195


In [7]:
# Check for null embeddings

def check_null_embeddings():
    # Connect to the database
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    cursor = connection.cursor()
    
    try:
        # Count records where either question_vector or answers_vector is NULL
        cursor.execute("""
            SELECT COUNT(*) 
            FROM talk 
            WHERE question_vector IS NULL OR answers_vector IS NULL
        """)
        
        # Fetch the result
        result = cursor.fetchone()
        null_count = result[0]
        
        print(f"Records with null embeddings: {null_count}")
        return null_count
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        cursor.close()
        connection.close()

# Run the check
null_embeddings = check_null_embeddings()

Records with null embeddings: 0


In [12]:
# Check embedding dimensions

import psycopg2

def check_vector_dimensions_generic():
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    cursor = connection.cursor()
    
    try:
        # Fetch the first row to examine the structure
        cursor.execute("SELECT question_vector, answers_vector FROM talk LIMIT 1")
        sample_row = cursor.fetchone()
        
        if sample_row is None:
            print("No data found in the table.")
            return None
        
        question_vector, answers_vector = sample_row
        
        print(f"Type of question_vector: {type(question_vector)}")
        print(f"Type of answers_vector: {type(answers_vector)}")
        
        if hasattr(question_vector, '__len__'):
            question_dim = len(question_vector)
            answer_dim = len(answers_vector)
            print(f"Question vector dimension: {question_dim}")
            print(f"Answer vector dimension: {answer_dim}")
        else:
            print("Unable to determine vector dimensions directly.")
            print("Sample question_vector:", question_vector)
            print("Sample answers_vector:", answers_vector)
        
        # Check for any NULL vectors
        cursor.execute("""
            SELECT COUNT(*) 
            FROM talk 
            WHERE question_vector IS NULL OR answers_vector IS NULL
        """)
        null_count = cursor.fetchone()[0]
        print(f"Number of records with NULL vectors: {null_count}")
        
        return sample_row
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        cursor.close()
        connection.close()

# Run the check
vector_sample = check_vector_dimensions_generic()

Type of question_vector: <class 'str'>
Type of answers_vector: <class 'str'>
Question vector dimension: 19389
Answer vector dimension: 19531
Number of records with NULL vectors: 0


In [13]:
import psycopg2
import re

def parse_vector_strings():
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    cursor = connection.cursor()
    
    try:
        # Fetch the first row
        cursor.execute("SELECT question_vector, answers_vector FROM talk LIMIT 1")
        sample_row = cursor.fetchone()
        
        if sample_row is None:
            print("No data found in the table.")
            return None
        
        question_vector_str, answers_vector_str = sample_row
        
        # Function to parse vector string and get dimension
        def get_vector_dimension(vector_str):
            # Remove any surrounding brackets or parentheses
            vector_str = vector_str.strip('[](){}')
            # Split the string by commas and count elements
            elements = [e.strip() for e in re.split(r'[,\s]+', vector_str) if e.strip()]
            return len(elements)
        
        question_dim = get_vector_dimension(question_vector_str)
        answer_dim = get_vector_dimension(answers_vector_str)
        
        print(f"Actual question vector dimension: {question_dim}")
        print(f"Actual answer vector dimension: {answer_dim}")
        
        # Print first few elements of each vector
        print("\nFirst 5 elements of question vector:")
        print(', '.join(question_vector_str.strip('[](){}').split(',')[:5]))
        
        print("\nFirst 5 elements of answer vector:")
        print(', '.join(answers_vector_str.strip('[](){}').split(',')[:5]))
        
        return question_dim, answer_dim
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        cursor.close()
        connection.close()

# Run the check
actual_dimensions = parse_vector_strings()

Actual question vector dimension: 1536
Actual answer vector dimension: 1536

First 5 elements of question vector:
-0.006378479, 9.1961985e-05, 0.03157249, -0.013749311, -0.015982104

First 5 elements of answer vector:
-0.00027473792, -0.004568908, 0.03342442, -0.033418152, -0.013350848


In [9]:
# Check for duplicate question IDs

def check_duplicate_questions():
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    cursor = connection.cursor()
    
    try:
        # Find question_ids that appear more than once
        cursor.execute("""
            SELECT question_id, COUNT(*)
            FROM talk
            GROUP BY question_id
            HAVING COUNT(*) > 1
        """)
        
        # Fetch all results
        duplicates = cursor.fetchall()
        
        if duplicates:
            print(f"Found {len(duplicates)} duplicate question IDs:")
            for dup in duplicates:
                print(f"Question ID: {dup[0]}, Count: {dup[1]}")
        else:
            print("No duplicate question IDs found.")
        
        return duplicates
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        cursor.close()
        connection.close()

# Run the check
duplicate_questions = check_duplicate_questions()

No duplicate question IDs found.


In [10]:
# Check metadata completeness

def check_metadata_completeness():
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    cursor = connection.cursor()
    
    try:
        # Count records with incomplete metadata
        cursor.execute("""
            SELECT COUNT(*) 
            FROM talk 
            WHERE metadata->>'topic' IS NULL 
               OR metadata->>'question_title' IS NULL 
               OR metadata->>'sources' IS NULL
        """)
        
        incomplete_count = cursor.fetchone()[0]
        
        print(f"Records with incomplete metadata: {incomplete_count}")
        return incomplete_count
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        cursor.close()
        connection.close()

# Run the check
incomplete_metadata = check_metadata_completeness()

Records with incomplete metadata: 0


In [15]:
import psycopg2

def simple_vector_consistency_check():
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    cursor = connection.cursor()
    
    try:
        # Get total count
        cursor.execute("SELECT COUNT(*) FROM talk")
        total_count = cursor.fetchone()[0]
        print(f"Total records: {total_count}")
        
        # Check for NULL vectors
        cursor.execute("""
            SELECT COUNT(*) 
            FROM talk 
            WHERE question_vector IS NULL OR answers_vector IS NULL
        """)
        null_count = cursor.fetchone()[0]
        print(f"Records with NULL vectors: {null_count}")
        
        # Fetch a sample to examine vector format
        cursor.execute("SELECT question_vector, answers_vector FROM talk LIMIT 1")
        sample = cursor.fetchone()
        if sample:
            question_vector, answers_vector = sample
            print("\nSample vector formats:")
            print(f"Question vector (first 100 chars): {str(question_vector)[:100]}")
            print(f"Answer vector (first 100 chars): {str(answers_vector)[:100]}")
            
            # Attempt to count elements (this may not work if the format is unexpected)
            try:
                q_elements = str(question_vector).count(',') + 1
                a_elements = str(answers_vector).count(',') + 1
                print(f"\nApproximate number of elements:")
                print(f"Question vector: {q_elements}")
                print(f"Answer vector: {a_elements}")
            except:
                print("Unable to count elements reliably.")
        
        return total_count, null_count, sample
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        cursor.close()
        connection.close()

# Run the check
consistency_results = simple_vector_consistency_check()

Total records: 7195
Records with NULL vectors: 0

Sample vector formats:
Question vector (first 100 chars): [-0.006378479,9.1961985e-05,0.03157249,-0.013749311,-0.015982104,0.019337825,0.004106513,-6.492934e-
Answer vector (first 100 chars): [-0.00027473792,-0.004568908,0.03342442,-0.033418152,-0.013350848,0.011456053,-0.0006457297,-0.01528

Approximate number of elements:
Question vector: 1536
Answer vector: 1536


In [17]:
import psycopg2
import json

def fetch_random_entry():
    connection = psycopg2.connect(POSTGRESQL_CONNECTION)
    cursor = connection.cursor()
    
    try:
        # Fetch a random entry
        cursor.execute("""
            SELECT question_id, question_full, answers, metadata, 
                   LEFT(question_vector::text, 50) as question_vector_sample, 
                   LEFT(answers_vector::text, 50) as answers_vector_sample
            FROM talk
            ORDER BY RANDOM()
            LIMIT 1
        """)
        
        result = cursor.fetchone()
        
        if result:
            question_id, question_full, answers, metadata, question_vector_sample, answers_vector_sample = result
            
            print("Random entry from the 'talk' table:")
            print(f"Question ID: {question_id}")
            print(f"Question: {question_full}")
            print(f"Answers: {json.dumps(answers, indent=2)}")
            print(f"Metadata: {json.dumps(metadata, indent=2)}")
            print(f"Question Vector (first 50 chars): {question_vector_sample}...")
            print(f"Answers Vector (first 50 chars): {answers_vector_sample}...")
        else:
            print("No entries found in the table.")
        
        return result
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
    finally:
        cursor.close()
        connection.close()

# Fetch and display a random entry
random_entry = fetch_random_entry()

Random entry from the 'talk' table:
Question ID: mh_2458
Question: I feel like I'm suffocating in my own anxieties
Answers: [
  {
    "answer": "It's only natural to feel scared sometimes. Remember, you're not alone.",
    "source": "AskTheraRAGBuddy"
  },
  {
    "answer": "Feeling scared is okay. Take a deep breath and try to focus on something positive.",
    "source": "AskTheraRAGBuddy"
  },
  {
    "answer": "When you're scared, it's important to reach out for support. You don't have to face it alone.",
    "source": "AskTheraRAGBuddy"
  },
  {
    "answer": "Scared feelings are temporary. They will pass, and you'll feel stronger for having faced them.",
    "source": "AskTheraRAGBuddy"
  },
  {
    "answer": "Feeling scared is a normal human emotion. Be gentle with yourself as you navigate through it.",
    "source": "AskTheraRAGBuddy"
  },
  {
    "answer": "Remember, it's okay to feel scared. Allow yourself to acknowledge your emotions without judgment.",
    "source": "AskTher

In [6]:
# Lets try some test data

import psycopg2
import pandas as pd



# Connect to the PostgreSQL database
conn = connect_to_db()
cursor = conn.cursor()

# Example usage with sample data
question_data = {
    "question_id": "cc_0",
    "topic": "depression",
    "question_title": "Do I have too many issues for counseling?",
    "question_full": "I have so many issues to address. I have a history of sexual abuse...",
    "answers": [
        {"answer": "It is very common for people to have multiple issues...", "source": "Jennifer Molinari"},
        {"answer": "I've never heard of someone having 'too many issues'...", "source": "Jason Lynch"}
    ]
}

insert_data(question_data)

# Query to retrieve data from the table
query = "SELECT question_id, question_full, answers, question_vector, answers_vector, metadata FROM talk"
cursor.execute(query)

# Fetch all rows from the query
rows = cursor.fetchall()

# Process the rows into a list format
result_list = []
for row in rows:
    result_list.append({
        "question_id": row[0],
        "question_full": row[1],
        "answers": row[2],
        "question_vector": row[3],
        "answers_vector": row[4],
        "metadata": row[5]
    })

# Print the list of results
for item in result_list:
    print("Question ID:", item['question_id'])
    print("Question Full:", item['question_full'])
    print("Answers:", item['answers'])
    print("Question Vector:", item['question_vector'][:5], "...")  # Print first 5 vector values for brevity
    print("Answers Vector:", item['answers_vector'][:5], "...")  # Print first 5 vector values for brevity
    print("Metadata:", item['metadata'])  # This will include 'topic', 'question_title', and 'sources'
    print("="*50)

# Close the cursor and connection
cursor.close()
conn.close()

Question ID: cc_0
Question Full: I have so many issues to address. I have a history of sexual abuse...
Answers: [{'answer': 'It is very common for people to have multiple issues...', 'source': 'Jennifer Molinari'}, {'answer': "I've never heard of someone having 'too many issues'...", 'source': 'Jason Lynch'}]
Question Vector: [-0.0 ...
Answers Vector: [0.00 ...
Metadata: {'topic': 'depression', 'sources': ['Jennifer Molinari', 'Jason Lynch'], 'question_title': 'Do I have too many issues for counseling?'}
