# Embedding Creation

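This notebook demonstrates the process of creating embeddings for extracted text using Amazon SageMaker. In the cell below, the text is read from Amazon S3 and embedded with a local `sentence-transformers` model (the SageMaker runtime client and endpoint name are defined but not called). The normalized embeddings are saved to `embeddings.csv`, from which they can be stored in a vector database for efficient retrieval during the Q&A process.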

In [8]:
import boto3
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
import json

# Notebook-wide settings
BUCKET_NAME = 'genaiprojectawsbucket'  # S3 bucket the extracted text is read from
EMBEDDING_MODEL_ENDPOINT = 'document-qa-embedding-model-endpoint'  # defined here; not referenced by the code in this cell

# AWS service handles (created through boto3's default session)
s3_client = boto3.client(service_name='s3')
sagemaker_runtime_client = boto3.client(service_name='sagemaker-runtime')

# Create embeddings locally with a sentence-transformers model
from sentence_transformers import SentenceTransformer

# Loaded models keyed by model name, so repeated calls reuse one instance
# instead of re-instantiating the model every time.
_MODEL_CACHE = {}


def create_embeddings_local(texts, model_name='all-MiniLM-L6-v2'):
    """Encode texts into embeddings with a locally loaded SentenceTransformer.

    The model for a given ``model_name`` is instantiated on the first call and
    reused on later calls, so embedding several batches does not rebuild the
    model each time.

    Args:
        texts: The texts to embed; passed unchanged to ``model.encode``.
        model_name: Identifier handed to ``SentenceTransformer``. Defaults to
            ``'all-MiniLM-L6-v2'``, the model this notebook originally used.

    Returns:
        The value returned by ``model.encode(texts)``.
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        # First request for this model name: build it once and remember it.
        model = SentenceTransformer(model_name)
        _MODEL_CACHE[model_name] = model
    return model.encode(texts)


# Load extracted text from S3
def load_extracted_text(s3_key, bucket=None):
    """Read a UTF-8 text object from S3 and return its non-blank lines.

    Each line of the object is treated as one text entry. Lines that are empty
    or whitespace-only are dropped, and a trailing carriage return (from CRLF
    line endings) is removed, so a file that ends with a newline or contains
    blank separator lines does not produce empty entries to embed.

    Args:
        s3_key: Key of the object to read.
        bucket: Bucket to read from. When ``None`` (the default), the
            module-level ``BUCKET_NAME`` is used.

    Returns:
        list[str]: The non-blank lines of the object, in file order.
    """
    if bucket is None:
        bucket = BUCKET_NAME
    response = s3_client.get_object(Bucket=bucket, Key=s3_key)
    text_data = response['Body'].read().decode('utf-8')
    # Split on '\n' and strip the '\r' left behind by Windows line endings.
    lines = (line.rstrip('\r') for line in text_data.split('\n'))
    # Blank lines carry no text to embed, so they are skipped.
    return [line for line in lines if line.strip()]


# Example run: fetch the text, embed it, and persist the vectors
s3_key = 'doc1_text.txt'  # Update with your S3 key
extracted_texts = load_extracted_text(s3_key=s3_key)
embeddings = create_embeddings_local(texts=extracted_texts)

# Scale each embedding row to unit L2 norm (normalize's defaults, spelled out)
normalized_embeddings = normalize(embeddings, norm='l2', axis=1)

# Write one CSV row per embedding; the DataFrame index is not written
embeddings_df = pd.DataFrame(data=normalized_embeddings)
embeddings_df.to_csv(path_or_buf='embeddings.csv', index=False)

print('Embeddings created and saved to embeddings.csv')




README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings created and saved to embeddings.csv
