In [34]:
import os
import json
import re
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer
import faiss

# Read PDFs and Chunk

In [29]:
# Read PDF file and return full text
def read_pdf(filepath):
    text = ""
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

In [53]:
def read_pdf_smart(filepath): 
    full_text = ""
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = page.extract_text(x_tolerance=1.5, y_tolerance=1.5, layout=True)
            if text:
                full_text += text + "\n"
    return full_text

In [54]:
# Chunk text with RecursiveCharacterTextSplitter
def chunk_text(text, chunk_size=600, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)

In [55]:
def chunk_text_smart(text):
    lines = text.splitlines()
    sections = []
    current_heading = ""
    buffer = []

    for line in lines:
        stripped = line.strip()
        if is_heading(stripped):
            # Save previous section
            if buffer:
                sections.append(current_heading + "Content: " + "\n".join(buffer).strip())
                buffer = []
            current_heading = stripped
        else:
            buffer.append(stripped)

    return sections

def is_heading(line):
    line = line.strip()
    # Good heuristics for heading lines:
    return (
        len(line) < 100 and (
            re.match(r"^[A-Z][A-Z\s]{3,}$", line) or  # ALL CAPS
            re.match(r"^[A-Z][a-z]{1,15}(\s+[A-Z][a-z]{1,15}){0,4}$", line) or  # Title Case
            re.match(r"^[a-z]?\)?\s?[A-Z].{0,50}$", line)  # a) Heading Y style
        )
    )

In [13]:
# Save chunks as JSON to chunks/ directory
def save_chunks_to_json(chunks, filename, output_dir="chunks"):
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)

    # Wrap each chunk as a dict for easier future use
    data = [{"id": i, "text": chunk} for i, chunk in enumerate(chunks)]

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(data)} chunks to {output_path}")

In [None]:
# Process all PDFs in /data
data_dir = "data"
for file in os.listdir(data_dir):
    if file.endswith(".pdf"):
        pdf_path = os.path.join(data_dir, file)
        print(f"Processing: {pdf_path}")
        text = read_pdf(pdf_path)
        chunks = chunk_text(text, chunk_size=600, chunk_overlap=100)

        base_filename = os.path.splitext(file)[0] + ".json"
        save_chunks_to_json(chunks, base_filename)

📚 Processing: data/catan_knights_3to4p.pdf
✅ Saved 140 chunks to chunks/catan_knights_3to4p.json
📚 Processing: data/catan_barbarians_3to4p.pdf
✅ Saved 167 chunks to chunks/catan_barbarians_3to4p.json
📚 Processing: data/catan_seafarers_3to4p.pdf
✅ Saved 126 chunks to chunks/catan_seafarers_3to4p.json
📚 Processing: data/catan_knights_5to6p.pdf
✅ Saved 12 chunks to chunks/catan_knights_5to6p.json
📚 Processing: data/catan_barbarians_5to6p.pdf
✅ Saved 30 chunks to chunks/catan_barbarians_5to6p.json
📚 Processing: data/catan_pirates_3to4p.pdf
✅ Saved 112 chunks to chunks/catan_pirates_3to4p.json
📚 Processing: data/catan_pirates_5to4p.pdf
✅ Saved 23 chunks to chunks/catan_pirates_5to4p.json
📚 Processing: data/catan_base_5to6p.pdf
✅ Saved 22 chunks to chunks/catan_base_5to6p.json
📚 Processing: data/catan_base_3to4p.pdf
✅ Saved 111 chunks to chunks/catan_base_3to4p.json
📚 Processing: data/catan_seafarers_5to6p.pdf
✅ Saved 41 chunks to chunks/catan_seafarers_5to6p.json


In [56]:
# Process all PDFs smartly in /data
data_dir = "data"
for file in os.listdir(data_dir):
    if file.endswith(".pdf"):
        pdf_path = os.path.join(data_dir, file)
        print(f"Processing: {pdf_path}")
        text = read_pdf_smart(pdf_path)
        chunks = chunk_text_smart(text)

        base_filename = os.path.splitext(file)[0] + ".json"
        save_chunks_to_json(chunks, base_filename)

Processing: data/catan_knights_3to4p.pdf
✅ Saved 296 chunks to chunks/catan_knights_3to4p.json
Processing: data/catan_barbarians_3to4p.pdf
✅ Saved 254 chunks to chunks/catan_barbarians_3to4p.json
Processing: data/catan_seafarers_3to4p.pdf
✅ Saved 228 chunks to chunks/catan_seafarers_3to4p.json
Processing: data/catan_knights_5to6p.pdf
✅ Saved 25 chunks to chunks/catan_knights_5to6p.json
Processing: data/catan_barbarians_5to6p.pdf
✅ Saved 74 chunks to chunks/catan_barbarians_5to6p.json
Processing: data/catan_pirates_3to4p.pdf
✅ Saved 209 chunks to chunks/catan_pirates_3to4p.json
Processing: data/catan_pirates_5to4p.pdf
✅ Saved 25 chunks to chunks/catan_pirates_5to4p.json
Processing: data/catan_base_5to6p.pdf
✅ Saved 46 chunks to chunks/catan_base_5to6p.json
Processing: data/catan_base_3to4p.pdf
✅ Saved 238 chunks to chunks/catan_base_3to4p.json
Processing: data/catan_seafarers_5to6p.pdf
✅ Saved 53 chunks to chunks/catan_seafarers_5to6p.json


# Tokenize and Index

In [57]:
def load_chunks_from_folder(folder_path):
    all_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)
                all_texts.extend([item['text'] for item in data])
    return all_texts

In [58]:
def split_texts(texts, model_name="sentence-transformers/all-MiniLM-L6-v2", chunk_size=256, chunk_overlap=20):
    splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=chunk_overlap,
        tokens_per_chunk=chunk_size,
        model_name=model_name
    )
    split_chunks = []
    for text in texts:
        split_chunks.extend(splitter.split_text(text))
    return split_chunks

In [59]:
def embed_and_index(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    
    return index, chunks

In [60]:
folder_path = "chunks/"
texts = load_chunks_from_folder(folder_path)
print(f"Loaded {len(texts)} documents.")

split_chunks = split_texts(texts)
print(f"Split into {len(split_chunks)} token-aware chunks.")

index, chunk_lookup = embed_and_index(split_chunks)
print("FAISS index built.")

Loaded 1448 documents.
Split into 1542 token-aware chunks.


Batches:   0%|          | 0/49 [00:00<?, ?it/s]

FAISS index built.


In [61]:
faiss.write_index(index, "index/my_index.faiss")
with open("index/chunk_lookup.json", "w") as f:
    json.dump(chunk_lookup, f)

# Retrieval

In [62]:
def search_faiss(query, k=3):
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    
    print(f"\nTop {k} results for: \"{query}\"\n")
    for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
        print(f"Result #{rank} (Distance: {dist:.4f})")
        print(chunk_lookup[idx])
        print("-" * 80)

In [63]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

query = "I have all the necessary resources and it is my turn. Where can I place a city? Anywhere connected to a road, right?"
search_faiss(query, k=3)


Top 3 results for: "I have all the necessary resources and it is my turn. Where can I place a city? Anywhere connected to a road, right?"

Result #1 (Distance: 0.9765)
you cannot build a city directly. you can onlycontent : to make the sequence easier to learn for beginners. we upgrade an existing settlement to a city. you pay recommend experienced players ignore this separation. the required resources, return the settlement to after rolling for resource production, you can trade and build your supply, and replace the settlement with a city on the same intersection y. each city is worth 2 victory points. you receive in any order ( you can trade, build, trade again and build again, etc. ). you can even use a harbor on the same turn you build a double resource production ( 2 resource cards ) from the settlement there. using this method speeds up the game a lot. adjacent terrain hexes whenever those numbers are rolled. 66
------------------------------------------------------------------