# In [None]:
import os
import subprocess
import sys

# Third-party distributions that the imports further down depend on.
required_packages = ["google-cloud-aiplatform", "PyMuPDF"]

# Install one package per pip invocation, using the interpreter that is
# running this script so the packages land in the same environment.
# check_call raises CalledProcessError when pip exits with a non-zero status.
for package in required_packages:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", package],
    )

# Now import the libraries after installing
import fitz  # PyMuPDF
from google.cloud import aiplatform

# Google Cloud identifiers and the local input directory used by the functions below.
# PROJECT_ID is passed to aiplatform.init and embedded in the resource paths built below.
PROJECT_ID = '625525962479'  # Update with your project ID
# NOTE(review): INDEX_ID is not referenced anywhere else in the visible code.
INDEX_ID = 'gymbeam_advisor_1729933910721'  # Update with the correct index ID
# NOTE(review): this single ID is used both for the prediction endpoint path and
# for the index endpoint path in the functions below — confirm that is intended.
INDEX_ENDPOINT_ID = '2200078786714664960'
# Relative path, so it is resolved against the current working directory.
LOCAL_PDF_DIR = 'in/files/'  # Local directory containing PDFs

# Set the path to your service account key JSON file.
# The path is relative (resolved against the current working directory) and is
# assigned before aiplatform.init and before any client is constructed.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'datahackaton-projekt-2-5e5b5288a5fd.json'

# Initialize Vertex AI for the project and region.
# The same region string is hard-coded again in the functions below; keep them in sync.
aiplatform.init(project=PROJECT_ID, location='europe-north1')

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the file handle is never leaked on error.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)

# Function to create embeddings
def create_embeddings(text):
    """Request a prediction for ``text`` and return the first result.

    Args:
        text: The document text, sent as the ``content`` field of a single
            prediction instance.

    Returns:
        The first element of the response's ``predictions``, or ``None`` when
        the response contains no predictions.

    Raises:
        Any exception raised by the prediction client propagates to the
        caller unchanged.
    """
    location = "europe-north1"

    # The endpoint lives in a specific region, so the client has to talk to
    # that region's API host rather than the client's default host.
    prediction_client = aiplatform.gapic.PredictionServiceClient(
        client_options={"api_endpoint": f"{location}-aiplatform.googleapis.com"}
    )

    # NOTE(review): INDEX_ENDPOINT_ID is used here as a *prediction* endpoint
    # ID; the same value is also used as an index endpoint elsewhere in this
    # file. Confirm that a deployed model endpoint with this ID exists.
    endpoint = prediction_client.endpoint_path(
        project=PROJECT_ID, location=location, endpoint=INDEX_ENDPOINT_ID
    )
    print(f"Using endpoint: {endpoint}")

    # One instance per call: the whole document text is sent as a single item.
    response = prediction_client.predict(
        endpoint=endpoint,
        instances=[{'content': text}],
    )

    embeddings = response.predictions
    return embeddings[0] if embeddings else None

# Function to upload embeddings to the vector database
def upload_embeddings_to_index(embeddings, metadata):
    """Send embeddings, each with its own metadata, to the index endpoint.

    Args:
        embeddings: Iterable of embeddings to upload.
        metadata: Either a list/tuple with one metadata entry per embedding
            (paired by position), or a single object that is attached to
            every embedding.

    Returns:
        None. When ``embeddings`` is empty no client is created and no
        request is sent.

    Raises:
        ValueError: If ``metadata`` is a list/tuple whose length differs
            from the number of embeddings.
    """
    # Build and validate the payload first so bad input fails before any
    # client is constructed.
    instances = _build_index_instances(embeddings, metadata)
    if not instances:
        return

    # NOTE(review): confirm that IndexEndpointServiceClient exposes `upsert`;
    # the documented streaming-update call appears to be
    # IndexServiceClient.upsert_datapoints on the index resource — verify
    # against the client library before relying on this call.
    index_endpoint = aiplatform.gapic.IndexEndpointServiceClient()
    index_endpoint.upsert(
        index_endpoint=f'projects/{PROJECT_ID}/locations/europe-north1/indexEndpoints/{INDEX_ENDPOINT_ID}',
        instances=instances,
    )


def _build_index_instances(embeddings, metadata):
    """Pair every embedding with the metadata entry that belongs to it."""
    embeddings = list(embeddings)
    if isinstance(metadata, (list, tuple)):
        # Parallel sequences: entry i describes embedding i.
        if len(metadata) != len(embeddings):
            raise ValueError(
                f"Got {len(embeddings)} embeddings but {len(metadata)} metadata entries"
            )
        per_embedding = metadata
    else:
        # A single metadata object is shared by all embeddings.
        per_embedding = [metadata] * len(embeddings)

    return [
        {'embedding': embedding, 'metadata': entry}
        for embedding, entry in zip(embeddings, per_embedding)
    ]

def main():
    """Embed every PDF in ``LOCAL_PDF_DIR`` and upload the results.

    For each file whose name ends in ``.pdf`` (case-insensitive), the text is
    extracted and passed to ``create_embeddings``. Files for which no
    embedding is returned are reported and skipped, so the embeddings and
    metadata lists always stay aligned. Nothing is uploaded when no
    embedding was produced.
    """
    embeddings = []
    metadata_list = []

    # Process each PDF in the local directory
    for filename in os.listdir(LOCAL_PDF_DIR):
        # Lower-casing the name also picks up files such as "REPORT.PDF".
        if not filename.lower().endswith('.pdf'):
            continue

        local_pdf_path = os.path.join(LOCAL_PDF_DIR, filename)

        # Extract text from the PDF
        text = extract_text_from_pdf(local_pdf_path)
        print(f'Extracted text from {filename}')

        # create_embeddings returns None when the response has no
        # predictions; uploading a None embedding would be meaningless.
        embedding = create_embeddings(text)
        if embedding is None:
            print(f'No embedding returned for {filename}; skipping.')
            continue

        # Append to both lists together so index i of one matches index i
        # of the other.
        embeddings.append(embedding)
        metadata_list.append({'document_title': filename})

        # Optionally, delete the local file after processing by calling
        # os.remove(local_pdf_path) here.

    if not embeddings:
        print('No embeddings were created; nothing to upload.')
        return

    # Upload all embeddings to the vector index
    upload_embeddings_to_index(embeddings, metadata_list)
    print('All embeddings uploaded to the vector index.')


if __name__ == "__main__":
    main()
