In [6]:
# Install the packages listed in ./requirements.txt via the %pip magic.
# --force-reinstall reinstalls each package even when it is already present;
# --no-deps skips installing the packages' own dependencies.
%pip install --force-reinstall --no-deps -r ./requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Collecting awscli (from -r ./requirements.txt (line 1))
  Using cached awscli-1.44.41-py3-none-any.whl.metadata (11 kB)
Collecting boto3 (from -r ./requirements.txt (line 2))
  Using cached boto3-1.42.51-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore (from -r ./requirements.txt (line 3))
  Using cached botocore-1.42.51-py3-none-any.whl.metadata (5.9 kB)
Collecting opensearch-py (from -r ./requirements.txt (line 4))
  Using cached opensearch_py-3.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting requests-aws4auth (from -r ./requirements.txt (line 5))
  Using cached requests_aws4auth-1.3.1-py3-none-any.whl.metadata (18 kB)
Using cached awscli-1.44.41-py3-none-any.whl (4.6 MB)
Using cached boto3-1.42.51-py3-none-any.whl (140 kB)
Using cached botocore-1.42.51-py3-none-any.whl (14.6 MB)
Using cached opensearch_py-3.1.0-py3-none-any.whl (385 kB)
Using cached requests_aws4auth-1.3.1-py3-none-any.whl (24 kB)


In [7]:
# Restart kernel
from IPython.core.display import HTML

# The cell's displayed value is an HTML snippet whose <script> calls the
# front end's `Jupyter.notebook.kernel.restart()`.
# NOTE(review): presumably only works where the `Jupyter` JS object exists
# (classic Notebook UI) — confirm for other front ends.
kernel_restart_js = "<script>Jupyter.notebook.kernel.restart()</script>"
HTML(kernel_restart_js)

In [8]:
# Standard library imports
import os
import sys
import json
import time
import random

# Third-party imports
import boto3
from botocore.exceptions import ClientError

# Read each AWS credential from its local secrets file (surrounding whitespace
# stripped) and export it through the matching environment variable.
for env_name, secret_path in (
    ('AWS_ACCESS_KEY_ID', '../secrets/aws_access_key_id.txt'),
    ('AWS_SECRET_ACCESS_KEY', '../secrets/aws_secret_access_key.txt'),
):
    with open(secret_path) as secret_file:
        os.environ[env_name] = secret_file.read().strip()

# Region used by the rest of the notebook (read back via os.environ later)
os.environ['AWS_DEFAULT_REGION'] = 'eu-west-3'

# Local imports
import utility


# Report the interpreter and SDK versions in use
python_version = sys.version.split()[0]  # first token of sys.version is the version number
print(f"Python version: {python_version}")
print(f"Boto3 SDK version: {boto3.__version__}")

Python version: 3.10.18
Boto3 SDK version: 1.42.51


In [9]:
# A single boto3 session is the source of both the service clients below and
# the credentials that are read from `boto_session` when signing OpenSearch
# requests, so everything uses the same credential/region configuration.
boto_session = boto3.Session()
aws_region = os.environ['AWS_DEFAULT_REGION']

# Service clients, all created from the shared session with an explicit region
aoss_client = boto_session.client('opensearchserverless', region_name=aws_region)
bedrock_agent_client = boto_session.client('bedrock-agent', region_name=aws_region)
s3_client = boto_session.client('s3', region_name=aws_region)

# Random numeric suffix used in every resource name created by this notebook.
# NOTE: re-running this cell draws a new suffix and therefore new resource names.
resource_suffix = random.randrange(100, 99999)
s3_bucket_name = f"bedrock-kb-{aws_region}-{resource_suffix}"
aoss_collection_name = f"bedrock-kb-collection-{resource_suffix}"
aoss_index_name = f"bedrock-kb-index-{resource_suffix}"
bedrock_kb_name = f"bedrock-kb-{resource_suffix}"

# Embedding model; `embedding_model_dim` is used as the vector dimension of the index
embedding_model_id = 'amazon.titan-embed-text-v2:0'
embedding_model_arn = f'arn:aws:bedrock:{aws_region}::foundation-model/{embedding_model_id}'
embedding_model_dim = 1024

# Local directory the source documents are downloaded into
local_data_dir = 'data'

print("AWS Region:", aws_region)
print("S3 Bucket:", s3_bucket_name)
print("AOSS Collection Name:", aoss_collection_name)
print("Bedrock Knowledge Base Name:", bedrock_kb_name)

AWS Region: eu-west-3
S3 Bucket: bedrock-kb-eu-west-3-58790
AOSS Collection Name: bedrock-kb-collection-58790
Bedrock Knowledge Base Name: bedrock-kb-58790


In [10]:
# Create the bucket only when it is confirmed missing.
try:
    s3_client.head_bucket(Bucket=s3_bucket_name)
    print(f"Bucket '{s3_bucket_name}' already exists")
except ClientError as e:
    # head_bucket fails for reasons other than "not found" as well (e.g. 403
    # when the name is owned by another account, or invalid credentials).
    # Only a not-found response means the bucket should be created; anything
    # else is re-raised instead of being masked by a create_bucket attempt.
    error_code = str(e.response.get('Error', {}).get('Code', ''))
    if error_code not in ('404', 'NoSuchBucket', 'NotFound'):
        raise
    print(f"Creating bucket: '{s3_bucket_name}'... ")
    if aws_region == 'us-east-1':
        # us-east-1 is created without a LocationConstraint
        s3_client.create_bucket(Bucket=s3_bucket_name)
    else:
        s3_client.create_bucket(
            Bucket=s3_bucket_name,
            CreateBucketConfiguration={'LocationConstraint' : aws_region}
        )



Creating bucket: 'bedrock-kb-eu-west-3-58790'... 


In [11]:
from urllib.request import urlretrieve

# Remote locations of the shareholder letters
urls = [
    'https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/2022-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/2021-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/ar/Amazon-2020-Shareholder-Letter-and-1997-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/ar/2019-Shareholder-Letter.pdf',
]

# Local file names, matched to `urls` by position
filenames = [
    'AMZN-2022-Shareholder-Letter.pdf',
    'AMZN-2021-Shareholder-Letter.pdf',
    'AMZN-2020-Shareholder-Letter.pdf',
    'AMZN-2019-Shareholder-Letter.pdf',
]

# The target directory must exist before files are written into it
os.makedirs(local_data_dir, exist_ok=True)

# Fetch every letter into the local data directory under its local name
for filename, url in zip(filenames, urls):
    urlretrieve(url, os.path.join(local_data_dir, filename))
    print(f"Downloaded: '{filename}' to '{local_data_dir}'...")


Downloaded: 'AMZN-2022-Shareholder-Letter.pdf' to 'data'...
Downloaded: 'AMZN-2021-Shareholder-Letter.pdf' to 'data'...
Downloaded: 'AMZN-2020-Shareholder-Letter.pdf' to 'data'...
Downloaded: 'AMZN-2019-Shareholder-Letter.pdf' to 'data'...


In [12]:
# Upload every file found under the local data directory to the S3 bucket.
for root, _, files in os.walk(local_data_dir):
    for file in files:
        full_path = os.path.join(root, file)
        # Use the path relative to the data directory as the object key so that
        # same-named files in different subdirectories do not overwrite each
        # other. For files directly in the data directory the key is simply the
        # file name. S3 keys use '/' regardless of the local path separator.
        s3_key = os.path.relpath(full_path, local_data_dir).replace(os.sep, '/')
        s3_client.upload_file(full_path, s3_bucket_name, s3_key)
        print(f"Uploaded: '{s3_key}' to 's3://{s3_bucket_name}'..")

Uploaded: 'AMZN-2022-Shareholder-Letter.pdf' to 's3://bedrock-kb-eu-west-3-58790'..
Uploaded: 'AMZN-2021-Shareholder-Letter.pdf' to 's3://bedrock-kb-eu-west-3-58790'..
Uploaded: 'AMZN-2020-Shareholder-Letter.pdf' to 's3://bedrock-kb-eu-west-3-58790'..
Uploaded: 'AMZN-2019-Shareholder-Letter.pdf' to 's3://bedrock-kb-eu-west-3-58790'..


In [13]:
# Create the Knowledge Base execution role through the project helper, passing
# the notebook's bucket name, and keep the ARN from the returned role description.
bedrock_kb_execution_role = utility.create_bedrock_execution_role(
    bucket_name=s3_bucket_name,
)
bedrock_kb_execution_role_arn = bedrock_kb_execution_role['Role']['Arn']

print("Created KB execution role with ARN:", bedrock_kb_execution_role_arn)

Created KB execution role with ARN: arn:aws:iam::597423010488:role/AmazonBedrockExecutionRoleForKnowledgeBase_218


In [14]:
# The helper returns the encryption, network and access policies, in that order
aoss_policies = utility.create_policies_in_oss(
    vector_store_name=aoss_collection_name,
    aoss_client=aoss_client,
    bedrock_kb_execution_role_arn=bedrock_kb_execution_role_arn,
)
aoss_encryption_policy, aoss_network_policy, aoss_access_policy = aoss_policies

# Security policies keep their details under 'securityPolicyDetail',
# the access policy under 'accessPolicyDetail'.
for policy_kind, policy_detail in (
    ("encryption", aoss_encryption_policy['securityPolicyDetail']),
    ("network", aoss_network_policy['securityPolicyDetail']),
    ("access", aoss_access_policy['accessPolicyDetail']),
):
    print(f"Created {policy_kind} policy with name:", policy_detail['name'])

Created encryption policy with name: bedrock-sample-rag-sp-218
Created network policy with name: bedrock-sample-rag-np-218
Created access policy with name: bedrock-sample-rag-ap-218


In [15]:
# Request to create AOSS collection
aoss_collection = aoss_client.create_collection(name=aoss_collection_name, type='VECTORSEARCH')

# Poll every 5 seconds until the collection becomes active. A FAILED status
# aborts the cell instead of being reported as success, because later cells
# need a working collection.
print("Waiting until AOSS collection becomes active: ", end='')
while True:
    response = aoss_client.list_collections(collectionFilters={'name': aoss_collection_name})
    summaries = response['collectionSummaries']
    # An empty listing is treated as "not visible yet" rather than indexing
    # into an empty list.
    status = summaries[0]['status'] if summaries else None
    if status == 'ACTIVE':
        print(" done.")
        break
    if status == 'FAILED':
        print(" failed.")
        raise RuntimeError(
            f"AOSS collection '{aoss_collection_name}' creation failed: "
            f"{json.dumps(summaries, indent=2, default=str)}"
        )
    print('█', end='', flush=True)
    time.sleep(5)

print("An AOSS collection created:", json.dumps(response['collectionSummaries'], indent=2))

Waiting until AOSS collection becomes active: █████ done.
An AOSS collection created: [
  {
    "id": "c3qhy9lruvy9tw7wvvm3",
    "name": "bedrock-kb-collection-58790",
    "status": "ACTIVE",
    "arn": "arn:aws:aoss:eu-west-3:597423010488:collection/c3qhy9lruvy9tw7wvvm3",
    "kmsKeyArn": "auto"
  }
]


In [16]:
# Create the OpenSearch Serverless policy for this collection and attach it to
# the Knowledge Base execution role via the project helper.
collection_id = aoss_collection['createCollectionDetail']['id']
aoss_policy_arn = utility.create_oss_policy_attach_bedrock_execution_role(
    collection_id=collection_id,
    bedrock_kb_execution_role=bedrock_kb_execution_role,
)

# Fixed pause with a progress bar: 12 ticks of 5 seconds each = 60 seconds
wait_ticks, tick_seconds = 12, 5
print("Waiting 60 sec for data access rules to be enforced: ", end='')
for _ in range(wait_ticks):
    print('█', end='', flush=True)
    time.sleep(tick_seconds)
print(" done.")

print("Created and attached policy with ARN:", aoss_policy_arn)

Waiting 60 sec for data access rules to be enforced: ████████████ done.
Created and attached policy with ARN: arn:aws:iam::597423010488:policy/AmazonBedrockOSSPolicyForKnowledgeBase_218


In [17]:
from requests_aws4auth import AWS4Auth
from opensearchpy import OpenSearch, RequestsHttpConnection

# Request signing for the 'aoss' service, using the credentials resolved by the
# boto3 session (including the session token, if any)
credentials = boto_session.get_credentials()
awsauth = AWS4Auth(
    credentials.access_key, credentials.secret_key, aws_region, 'aoss',
    session_token=credentials.token,
)

# The endpoint host is derived from the collection id and the region
collection_id = aoss_collection['createCollectionDetail']['id']
host = f"{collection_id}.{aws_region}.aoss.amazonaws.com"

# OpenSearch client talking HTTPS (port 443) to the collection endpoint
aoss_endpoint = {'host': host, 'port': 443}
os_client = OpenSearch(
    hosts=[aoss_endpoint],
    connection_class=RequestsHttpConnection,
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    timeout=300,
)

In [18]:
# Mapping of the embedding field: k-NN vector sized to the embedding model,
# indexed with HNSW on the faiss engine using L2 distance
vector_field_mapping = {
    "type": "knn_vector",
    "dimension": embedding_model_dim,
    "method": {
        "name": "hnsw",
        "engine": "faiss",
        "space_type": "l2",
    },
}

# Index-level settings (k-NN enabled)
index_settings = {
    "index.knn": "true",
    "number_of_shards": 1,
    "knn.algo_param.ef_search": 512,
    "number_of_replicas": 0,
}

# Full definition of the AOSS vector index: one vector field plus two text fields
index_definition = {
    "settings": index_settings,
    "mappings": {
        "properties": {
            "vector": vector_field_mapping,
            "text": {"type": "text"},
            "text-metadata": {"type": "text"},
        }
    },
}

# Create the index in the collection
response = os_client.indices.create(index=aoss_index_name, body=index_definition)

# Fixed pause with a progress bar: 6 ticks of 5 seconds each = 30 seconds
propagation_ticks, tick_seconds = 6, 5
print("Waiting 30 sec for index update to propagate: ", end='')
for _ in range(propagation_ticks):
    print('█', end='', flush=True)
    time.sleep(tick_seconds)
print(" done.")

print("A new AOSS index created:", json.dumps(response, indent=2))

Waiting 30 sec for index update to propagate: ██████ done.
A new AOSS index created: {
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "bedrock-kb-index-58790"
}


In [19]:
# Vector Storage Configuration: the AOSS collection and index created earlier,
# with the field names defined in the index mapping
storage_config = {
    "type": "OPENSEARCH_SERVERLESS",
    "opensearchServerlessConfiguration": {
        "collectionArn": aoss_collection["createCollectionDetail"]['arn'],
        "vectorIndexName": aoss_index_name,
        "fieldMapping": {
            "vectorField": "vector",
            "textField": "text",
            "metadataField": "text-metadata"
        }
    }
}

# Knowledge Base Configuration: vector KB using the selected embedding model
knowledge_base_config = {
    "type": "VECTOR",
    "vectorKnowledgeBaseConfiguration": {
        "embeddingModelArn": embedding_model_arn
    }
}

response = bedrock_agent_client.create_knowledge_base(
    name=bedrock_kb_name,
    description="Amazon shareholder letter knowledge base.",
    roleArn=bedrock_kb_execution_role_arn,
    knowledgeBaseConfiguration=knowledge_base_config,
    storageConfiguration=storage_config)

bedrock_kb_id = response['knowledgeBase']['knowledgeBaseId']

# Poll every 5 seconds until the KB is ACTIVE. Statuses from which the KB can
# no longer become active abort the cell instead of polling forever.
kb_failure_statuses = ('FAILED', 'DELETING', 'DELETE_UNSUCCESSFUL')
print("Waiting until BKB becomes active: ", end='')
while True:
    response = bedrock_agent_client.get_knowledge_base(knowledgeBaseId=bedrock_kb_id)
    kb_status = response['knowledgeBase']['status']
    if kb_status == 'ACTIVE':
        print(" done.")
        break
    if kb_status in kb_failure_statuses:
        print(" failed.")
        raise RuntimeError(
            f"Bedrock Knowledge Base '{bedrock_kb_id}' ended in status {kb_status}: "
            f"{response['knowledgeBase'].get('failureReasons')}"
        )
    print('█', end='', flush=True)
    time.sleep(5)

print("A new Bedrock Knowledge Base created with ID:", bedrock_kb_id)

Waiting until BKB becomes active: █ done.
A new Bedrock Knowledge Base created with ID: XBBEI68SYF


In [20]:
# Fetch the current Knowledge Base description and pretty-print it;
# default=str stringifies values json cannot serialize natively.
response = bedrock_agent_client.get_knowledge_base(knowledgeBaseId=bedrock_kb_id)
kb_description = response['knowledgeBase']

print(json.dumps(kb_description, indent=2, default=str))

{
  "knowledgeBaseId": "XBBEI68SYF",
  "name": "bedrock-kb-58790",
  "knowledgeBaseArn": "arn:aws:bedrock:eu-west-3:597423010488:knowledge-base/XBBEI68SYF",
  "description": "Amazon shareholder letter knowledge base.",
  "roleArn": "arn:aws:iam::597423010488:role/AmazonBedrockExecutionRoleForKnowledgeBase_218",
  "knowledgeBaseConfiguration": {
    "type": "VECTOR",
    "vectorKnowledgeBaseConfiguration": {
      "embeddingModelArn": "arn:aws:bedrock:eu-west-3::foundation-model/amazon.titan-embed-text-v2:0"
    }
  },
  "storageConfiguration": {
    "type": "OPENSEARCH_SERVERLESS",
    "opensearchServerlessConfiguration": {
      "collectionArn": "arn:aws:aoss:eu-west-3:597423010488:collection/c3qhy9lruvy9tw7wvvm3",
      "vectorIndexName": "bedrock-kb-index-58790",
      "fieldMapping": {
        "vectorField": "vector",
        "textField": "text",
        "metadataField": "text-metadata"
      }
    }
  },
  "status": "ACTIVE",
  "createdAt": "2026-02-18 16:33:35.784960+00:00",
  "u

In [21]:
# Data Source Configuration: the S3 bucket holding the uploaded documents
s3_source_config = {
    "bucketArn": f"arn:aws:s3:::{s3_bucket_name}",
    # Optional: add "inclusionPrefixes" here to build the KB only from data
    # within specific S3 prefixes.
}
data_source_config = {
    "type": "S3",
    "s3Configuration": s3_source_config,
}

# Vector Ingestion Configuration: fixed-size chunks of at most 512 tokens
# with 20 percent overlap
fixed_size_chunking = {
    "maxTokens": 512,
    "overlapPercentage": 20,
}
vector_ingestion_config = {
    "chunkingConfiguration": {
        "chunkingStrategy": "FIXED_SIZE",
        "fixedSizeChunkingConfiguration": fixed_size_chunking,
    }
}

# Register the data source with the Knowledge Base (it reuses the KB's name)
response = bedrock_agent_client.create_data_source(
    knowledgeBaseId=bedrock_kb_id,
    name=bedrock_kb_name,
    description="Amazon shareholder letter knowledge base.",
    dataSourceConfiguration=data_source_config,
    vectorIngestionConfiguration=vector_ingestion_config,
)

bedrock_ds_id = response['dataSource']['dataSourceId']

print("A new BKB data source created with ID:", bedrock_ds_id)

A new BKB data source created with ID: NMUTPUEJPR


In [22]:
# Fetch the data source description and pretty-print it;
# default=str stringifies values json cannot serialize natively.
response = bedrock_agent_client.get_data_source(
    knowledgeBaseId=bedrock_kb_id,
    dataSourceId=bedrock_ds_id,
)
ds_description = response['dataSource']

print(json.dumps(ds_description, indent=2, default=str))

{
  "knowledgeBaseId": "XBBEI68SYF",
  "dataSourceId": "NMUTPUEJPR",
  "name": "bedrock-kb-58790",
  "status": "AVAILABLE",
  "description": "Amazon shareholder letter knowledge base.",
  "dataSourceConfiguration": {
    "type": "S3",
    "s3Configuration": {
      "bucketArn": "arn:aws:s3:::bedrock-kb-eu-west-3-58790"
    }
  },
  "vectorIngestionConfiguration": {
    "chunkingConfiguration": {
      "chunkingStrategy": "FIXED_SIZE",
      "fixedSizeChunkingConfiguration": {
        "maxTokens": 512,
        "overlapPercentage": 20
      }
    }
  },
  "dataDeletionPolicy": "DELETE",
  "createdAt": "2026-02-18 16:38:26.198080+00:00",
  "updatedAt": "2026-02-18 16:38:26.198080+00:00"
}


In [23]:
# Kick off ingestion of the data source into the Knowledge Base and remember
# the job id so its progress can be polled.
response = bedrock_agent_client.start_ingestion_job(
    knowledgeBaseId=bedrock_kb_id,
    dataSourceId=bedrock_ds_id,
)
ingestion_job = response['ingestionJob']
bedrock_job_id = ingestion_job['ingestionJobId']

print("A new BKB ingestion job started with ID:", bedrock_job_id)

A new BKB ingestion job started with ID: 7BFKNKISUJ


In [24]:
# Poll every 5 seconds until the ingestion job completes. A job that ends in
# FAILED or STOPPED never reaches COMPLETE, so those statuses abort the cell
# instead of polling forever.
ingestion_failure_statuses = ('FAILED', 'STOPPED')
print("Waiting until BKB ingestion job completes: ", end='')
while True:
    response = bedrock_agent_client.get_ingestion_job(
        knowledgeBaseId = bedrock_kb_id,
        dataSourceId = bedrock_ds_id,
        ingestionJobId = bedrock_job_id)
    job_status = response['ingestionJob']['status']
    if job_status == 'COMPLETE':
        print(" done.")
        break
    if job_status in ingestion_failure_statuses:
        print(" failed.")
        raise RuntimeError(
            f"Ingestion job '{bedrock_job_id}' ended in status {job_status}: "
            f"{response['ingestionJob'].get('failureReasons')}"
        )
    print('█', end='', flush=True)
    time.sleep(5)

print("The BKB ingestion job finished:", json.dumps(response['ingestionJob'], indent=2, default=str))

Waiting until BKB ingestion job completes: ██ done.
The BKB ingestion job finished: {
  "knowledgeBaseId": "XBBEI68SYF",
  "dataSourceId": "NMUTPUEJPR",
  "ingestionJobId": "7BFKNKISUJ",
  "status": "COMPLETE",
  "statistics": {
    "numberOfDocumentsScanned": 4,
    "numberOfMetadataDocumentsScanned": 0,
    "numberOfNewDocumentsIndexed": 4,
    "numberOfModifiedDocumentsIndexed": 0,
    "numberOfMetadataDocumentsModified": 0,
    "numberOfDocumentsDeleted": 0,
    "numberOfDocumentsFailed": 0
  },
  "startedAt": "2026-02-18 16:39:34.059435+00:00",
  "updatedAt": "2026-02-18 16:39:53.643782+00:00"
}


In [25]:
# Persist the names/ids of the created resources with IPython's %store magic.
# NOTE(review): presumably so a follow-up or clean-up notebook can restore them
# with `%store -r` — confirm against the consuming notebook.
%store s3_bucket_name aoss_encryption_policy aoss_network_policy aoss_access_policy aoss_collection bedrock_kb_id

Stored 's3_bucket_name' (str)
Stored 'aoss_encryption_policy' (dict)
Stored 'aoss_network_policy' (dict)
Stored 'aoss_access_policy' (dict)
Stored 'aoss_collection' (dict)
Stored 'bedrock_kb_id' (str)


  db[ 'autorestore/' + arg ] = obj
  db[ 'autorestore/' + arg ] = obj
  db[ 'autorestore/' + arg ] = obj
  db[ 'autorestore/' + arg ] = obj
  db[ 'autorestore/' + arg ] = obj
  db[ 'autorestore/' + arg ] = obj
