In [17]:
import os
import json
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer
import faiss

# Read PDFs and Chunk

In [3]:
# Read PDF file and return full text
def read_pdf(filepath):
    """Return the text of every page of a PDF as a single string.

    Pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) are skipped. The text of each remaining page is followed
    by a newline.

    Args:
        filepath: Path of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The concatenated page texts, or ``""`` if no page yields any text.
    """
    with pdfplumber.open(filepath) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Consume the generator while the document is still open.
        return "".join(content + "\n" for content in extracted if content)

In [8]:
# Chunk text with RecursiveCharacterTextSplitter
def chunk_text(text, chunk_size=600, chunk_overlap=100):
    """Split ``text`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded as the splitter's ``chunk_size`` argument.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap`` argument.

    Returns:
        Whatever the splitter's ``split_text(text)`` returns
        (presumably a list of strings; see the langchain documentation).
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [13]:
# Save chunks as JSON to chunks/ directory
def save_chunks_to_json(chunks, filename, output_dir="chunks"):
    """Write ``chunks`` to ``output_dir/filename`` as a JSON array of records.

    Each chunk becomes ``{"id": <position>, "text": <chunk>}``, where the
    position is the chunk's zero-based index in ``chunks``. The output
    directory is created if it does not exist, the file is written as UTF-8
    without ASCII-escaping, and a one-line summary is printed.

    Args:
        chunks: Iterable of chunk values to store.
        filename: Name of the JSON file to create inside ``output_dir``.
        output_dir: Target directory.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)

    # One record per chunk; the id is simply the chunk's position.
    records = []
    for position, chunk in enumerate(chunks):
        records.append({"id": position, "text": chunk})

    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print("Saved {} chunks to {}".format(len(records), output_path))

In [None]:
# Process every PDF in the data/ directory (relative to the working directory):
# extract its text, chunk it, and save the chunks as <pdf stem>.json.
data_dir = "data"
# os.listdir() returns entries in arbitrary order; sort so every run processes
# (and logs) the files in the same sequence.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".pdf"):
        continue
    pdf_path = os.path.join(data_dir, file)
    print(f"Processing: {pdf_path}")
    text = read_pdf(pdf_path)
    chunks = chunk_text(text, chunk_size=600, chunk_overlap=100)

    # Reuse the PDF's stem so each source file maps to exactly one JSON file.
    base_filename = os.path.splitext(file)[0] + ".json"
    save_chunks_to_json(chunks, base_filename)

📚 Processing: data/catan_knights_3to4p.pdf
✅ Saved 140 chunks to chunks/catan_knights_3to4p.json
📚 Processing: data/catan_barbarians_3to4p.pdf
✅ Saved 167 chunks to chunks/catan_barbarians_3to4p.json
📚 Processing: data/catan_seafarers_3to4p.pdf
✅ Saved 126 chunks to chunks/catan_seafarers_3to4p.json
📚 Processing: data/catan_knights_5to6p.pdf
✅ Saved 12 chunks to chunks/catan_knights_5to6p.json
📚 Processing: data/catan_barbarians_5to6p.pdf
✅ Saved 30 chunks to chunks/catan_barbarians_5to6p.json
📚 Processing: data/catan_pirates_3to4p.pdf
✅ Saved 112 chunks to chunks/catan_pirates_3to4p.json
📚 Processing: data/catan_pirates_5to4p.pdf
✅ Saved 23 chunks to chunks/catan_pirates_5to4p.json
📚 Processing: data/catan_base_5to6p.pdf
✅ Saved 22 chunks to chunks/catan_base_5to6p.json
📚 Processing: data/catan_base_3to4p.pdf
✅ Saved 111 chunks to chunks/catan_base_3to4p.json
📚 Processing: data/catan_seafarers_5to6p.pdf
✅ Saved 41 chunks to chunks/catan_seafarers_5to6p.json


# Tokenize and Index

In [18]:
def load_chunks_from_folder(folder_path):
    """Collect the ``text`` field of every record in a folder's JSON files.

    Every file in ``folder_path`` whose name ends with ``.json`` is parsed as
    a list of dicts, and each dict's ``"text"`` value is appended to the
    result. Files are read in sorted filename order: ``os.listdir()`` gives
    no ordering guarantee, and the position of each text in the returned
    list later determines its id in the FAISS index, so an unsorted listing
    would make those ids differ between machines or runs.

    Args:
        folder_path: Directory containing the chunk JSON files.

    Returns:
        A list of the ``text`` values, grouped by file (sorted by filename)
        and in record order within each file.

    Raises:
        OSError: If the folder or a file cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    all_texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)
        all_texts.extend(item['text'] for item in data)
    return all_texts

In [19]:
def split_texts(texts, model_name="sentence-transformers/all-MiniLM-L6-v2", chunk_size=256, chunk_overlap=20):
    """Re-split each text with a ``SentenceTransformersTokenTextSplitter``.

    A single splitter is built once and applied to every element of
    ``texts``; the pieces are returned as one flat list in input order.

    Args:
        texts: Iterable of strings to split.
        model_name: Forwarded as the splitter's ``model_name``.
        chunk_size: Forwarded as the splitter's ``tokens_per_chunk``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        A flat list containing the pieces produced for every input text.
    """
    token_splitter = SentenceTransformersTokenTextSplitter(
        model_name=model_name,
        tokens_per_chunk=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [piece for text in texts for piece in token_splitter.split_text(text)]

In [20]:
def embed_and_index(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    """Embed ``chunks`` and add the vectors to a new ``faiss.IndexFlatL2``.

    Args:
        chunks: Sequence of strings to embed. Must not be empty.
        model_name: Name passed to ``SentenceTransformer``.
        batch_size: Forwarded to ``model.encode``.

    Returns:
        A ``(index, chunks)`` tuple. ``chunks`` is the same object that was
        passed in; vectors are added in input order, so position ``i`` in
        ``chunks`` corresponds to the ``i``-th vector added to the index.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail before loading the model: with no chunks there are no embeddings
    # to read a dimension from, and the failure would otherwise only surface
    # later at ``embeddings.shape[1]`` with a less helpful error.
    if len(chunks) == 0:
        raise ValueError("embed_and_index() needs at least one chunk to build an index")

    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)

    # The index dimension must match the embedding width produced by the model.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, chunks

In [22]:
# Build the search index from the chunk files written by the chunking step
# ("chunks" is the default output_dir of save_chunks_to_json).
folder_path = "chunks/"
texts = load_chunks_from_folder(folder_path)
print(f"Loaded {len(texts)} documents.")

# Second split pass, token-based, using split_texts' default model and limits.
split_chunks = split_texts(texts)
print(f"Split into {len(split_chunks)} token-aware chunks.")

# embed_and_index returns the list it was given, so chunk_lookup is
# split_chunks; later cells index into it with the ids FAISS returns.
index, chunk_lookup = embed_and_index(split_chunks)
print("FAISS index built.")

Loaded 784 documents.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Split into 824 token-aware chunks.


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

FAISS index built.


In [24]:
# Persist the FAISS index together with the chunk list its ids point into.
# Neither faiss.write_index() nor open() creates a missing parent directory,
# so make sure index/ exists before writing (a fresh checkout would fail here).
os.makedirs("index", exist_ok=True)
faiss.write_index(index, "index/my_index.faiss")
# Explicit encoding, consistent with the other JSON reads/writes in this notebook.
with open("index/chunk_lookup.json", "w", encoding="utf-8") as f:
    json.dump(chunk_lookup, f)

# Retrieval

In [25]:
def search_faiss(query, k=3):
    """Print the stored chunks closest to ``query``.

    Reads three names from module scope, which must be defined before the
    call: ``model`` (used to embed the query), ``index`` (searched with the
    embedding) and ``chunk_lookup`` (indexed by the ids the search returns).

    Args:
        query: The query string to embed and search for.
        k: Number of neighbours requested from the index.

    Returns:
        None. A header is printed, followed by one entry per hit (rank,
        distance, chunk text and a separator line). Fewer than ``k`` entries
        are printed when the index returns fewer real hits.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)

    # FAISS fills unused result slots with id -1 when it finds fewer than k
    # neighbours. chunk_lookup[-1] would silently print the *last* chunk as
    # if it were a match, so drop those slots before printing.
    hits = [(int(idx), dist) for idx, dist in zip(indices[0], distances[0]) if idx >= 0]

    print(f"\nTop {k} results for: \"{query}\"\n")
    for rank, (idx, dist) in enumerate(hits, start=1):
        print(f"Result #{rank} (Distance: {dist:.4f})")
        print(chunk_lookup[idx])
        print("-" * 80)

In [26]:
# search_faiss() reads `model` from module scope, so it has to be created
# before the call below. The name is the same one embed_and_index uses by
# default, so the query is embedded with the model that built the index.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Example question; the three nearest chunks are printed with their distances.
query = "I have all the necessary resources and it is my turn. Where can I place a city? Anywhere connected to a road, right?"
search_faiss(query, k=3)


Top 3 results for: "I have all the necessary resources and it is my turn. Where can I place a city? Anywhere connected to a road, right?"

Result #1 (Distance: 0.9690)
production, you do not receive any resources or player ’ s metropolis away ), you may place another metropolis commodities, you may take any one resource of your choice gate on one of your cities. but, you may not purchase any from the bank. you many not, however, use this ability when improvements beyond the third level of a given color unless you a “ 7 ” is rolled. have a city where you could build a metropolis. if you do not have such a city on the board, you must wait until you have built at least one more city. illustration l 3 city improvements 88
--------------------------------------------------------------------------------
Result #2 (Distance: 1.0133)
player places 1 city and 1 road. ( the first player to – build roads, settlements, cities, place a settlement will be the last to place a city ). beginners shoul