In [None]:
# Install the required libraries
!pip install PyMuPDF faiss-cpu sentence-transformers --quiet

import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ======================= Step 1: Load Multiple PDFs =========================
def extract_text_from_pdfs(pdf_folder):
    """Extract the full text of every PDF located directly in ``pdf_folder``.

    The folder is scanned non-recursively. Files are processed in sorted
    name order so that the returned lists (and anything built from them,
    such as a vector index) are reproducible between runs.

    Args:
        pdf_folder: Path of the directory that contains the PDF files.

    Returns:
        A ``(texts, filenames)`` tuple of parallel lists, where ``texts[i]``
        is the concatenated text of all pages of ``filenames[i]``.
    """
    texts = []
    filenames = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(pdf_folder)):
        # Case-insensitive match so that e.g. "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        doc = fitz.open(os.path.join(pdf_folder, filename))
        try:
            # join() avoids repeated string reallocation on large documents.
            text = "".join(page.get_text() for page in doc)
        finally:
            # Release the file handle even when a page fails to extract.
            doc.close()
        texts.append(text)
        filenames.append(filename)
    return texts, filenames

# ======================= Step 2: Preprocessing + Chunking =========================
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces, so the original whitespace
    and line breaks are not preserved. Chunks do not overlap; the last chunk
    may be shorter than ``chunk_size``.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole document, so reject it explicitly.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# ======================= Step 3: Embedding =========================
def embed_texts(chunks, model=None):
    """Encode text chunks into embedding vectors.

    Args:
        chunks: List of strings to embed.
        model: Optional object exposing ``encode(list_of_str)``, e.g. an
            already-loaded ``SentenceTransformer``. Passing one avoids
            loading the model again on every call. When omitted, a new
            ``'all-MiniLM-L6-v2'`` model is created, as before.

    Returns:
        Whatever ``model.encode(chunks)`` returns (one embedding per chunk).
    """
    if model is None:
        # Small and fast default model.
        model = SentenceTransformer('all-MiniLM-L6-v2')
    return model.encode(chunks)

# ======================= Step 4: Indexing into FAISS =========================
def build_faiss_index(embeddings):
    """Build a flat (exact, brute-force) L2 FAISS index over ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing all the given vectors, in order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, when it is
            empty because no chunks were produced).
    """
    # FAISS only accepts C-contiguous float32 matrices; coerce up front so
    # lists or float64 arrays do not fail inside the native call.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

# ======================= Step 5: Retrieval =========================
def search(index, query, model, chunks, top_k=5):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        index: Index exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair, e.g. a FAISS index.
        query: The query string.
        model: Object exposing ``encode(list_of_str)``; it should be the
            same model that produced the indexed embeddings.
        chunks: The chunk strings, in the same order as the indexed vectors.
        top_k: Maximum number of results to return.

    Returns:
        A list of up to ``top_k`` chunk strings, nearest first. Fewer are
        returned when the index holds fewer than ``top_k`` vectors.
    """
    # FAISS expects float32 query vectors.
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing neighbours with -1; without this filter chunks[-1]
    # would silently return the last chunk as a bogus hit.
    return [chunks[i] for i in indices[0] if i >= 0]

# ======================= MAIN =========================
# End-to-end run of the pipeline: extract -> chunk -> embed -> index -> query.
# The names defined here (faiss_index, embedding_model, all_chunks, ...) stay
# in the notebook namespace and are reused by the following cell.

# 1. Extract the text from all PDFs in the source folder
pdf_folder = '/content/AWS'
texts, filenames = extract_text_from_pdfs(pdf_folder)

# 2. Chunk every document and pool the chunks into one flat list
#    (the link between a chunk and its source file is not kept)
all_chunks = []
for text in texts:
    chunks = chunk_text(text, chunk_size=500)
    all_chunks.extend(chunks)

# 3. Embed all chunks
# embedding_model is kept for encoding queries in search().
# NOTE: embed_texts(), called with only the chunk list, creates its own
# SentenceTransformer, so the model is instantiated twice in this cell.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_texts(all_chunks)

# 4. Build the FAISS index over the chunk embeddings
faiss_index = build_faiss_index(np.array(embeddings))

# 5. Example query
query = "Layanan cloud apa yang cocok untuk mengamankan website, apakah WAF atau Shield?"
results = search(faiss_index, query, embedding_model, all_chunks)

# 6. Display the retrieval results (first 300 characters of each chunk);
#    the "Top-5" label matches the default top_k=5 of search()
print("Top-5 hasil retrieval untuk query:")
for i, res in enumerate(results):
    print(f"\n[{i+1}] {res[:300]}...")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Top-5 hasil retrieval untuk query:

[1] service .................................................................... 1139 Data protection ................................................................................................................................. 1140 Identity and Access Management .......................................

[2] This platform is widely adopted by a huge customer community and a good enough ﬁt for most use cases. It is tried and tested with a vibrant support community for any issues that might be encountered. It is a low-risk decision with enough training available for the development resources. • Best ﬁt fo...

[3] on-premises environment. Availability Zones are designed to mitigate against the risk of natural disaster and other disruptions that may occur. Availability Zones are physically separated within a metropolitan region and are in different flood plains. Each Availability Zone is also designed as an in...

[4] the ISO 27002 and ISO 27001 stand

In [None]:
# 5. Example query
# Re-runs the retrieval only; relies on faiss_index, embedding_model and
# all_chunks already being defined by an earlier cell.
query = "Layanan cloud apa yang cocok untuk mengamankan website, apakah WAF atau Shield?"
results = search(faiss_index, query, embedding_model, all_chunks)

# 6. Display the retrieval results (first 300 characters of each chunk)
print("Top-5 hasil retrieval untuk query:")
for i, res in enumerate(results):
    print(f"\n[{i+1}] {res[:300]}...")

Top-5 hasil retrieval untuk query:

[1] AWS Shield Advanced Developer Guide Shield Advanced automatic application layer DDoS mitigation Shield Advanced now maintains a rate-based rule in the automatic mitigation rule group that limits the volume of requests from IP addresses known to be sources of DDoS attacks. October 31, 2023 Updated AW...

[2] DDoS attacks at the network and transport layers. Explore the guide • Getting started with AWS Shield Advanced Get started with AWS Shield Advanced by using the AWS Shield Advanced console. Explore the guide • AWS Shield Advanced workshop Protect internet-exposed resources against DDoS attacks, moni...

[3] Shield Advanced Developer Guide AWS WAF AWS WAF is a web application ﬁrewall that lets you monitor the HTTP(S) requests that are forwarded to your protected web application resources. You can protect the following resource types: • Amazon CloudFront distribution • Amazon API Gateway REST API • Appli...

[4] Developer Guide AWS WAF, AWS Fire