In [2]:
import os
import json
import re
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer
import faiss

# Read PDFs and Chunk

In [3]:
def read_pdf(filepath):
    """Read a PDF file and return the text of all its pages as one string.

    Pages for which ``extract_text()`` returns nothing (``None`` or ``""``)
    are skipped. Every remaining page's text is followed by a newline.

    Args:
        filepath: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page texts, or ``""`` if no page yields text.
    """
    with pdfplumber.open(filepath) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Joined while the document is still open, since extraction is lazy.
        return "".join(page_text + "\n" for page_text in extracted if page_text)

In [4]:
def read_pdf_smart(filepath):
    """Read a PDF page by page using layout-aware text extraction.

    Each page is extracted with ``layout=True`` and x/y tolerances of 1.5.
    Pages that yield no text are skipped; every kept page is followed by a
    newline.

    Args:
        filepath: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page texts, or ``""`` if no page yields text.
    """
    extract_options = {"x_tolerance": 1.5, "y_tolerance": 1.5, "layout": True}
    page_texts = []
    with pdfplumber.open(filepath) as document:
        for page in document.pages:
            rendered = page.extract_text(**extract_options)
            if not rendered:
                continue
            page_texts.append(rendered + "\n")
    return "".join(page_texts)

In [5]:
def chunk_text(text, chunk_size=600, chunk_overlap=100):
    """Chunk ``text`` with LangChain's ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` returns for ``text`` (a list of chunk strings).
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [6]:
def chunk_text_smart(text):
    """Split ``text`` into sections delimited by heading-like lines.

    Every line is stripped and classified with ``is_heading``. Non-heading
    lines accumulate in a buffer; when a heading is met, or the text ends,
    the buffered lines are emitted as one section of the form
    ``<heading>Content: <lines joined by newlines>``. The heading is
    concatenated directly with ``"Content: "``, with no separator.

    Notes:
        * Text that precedes the first heading is emitted with an empty
          heading.
        * A heading followed immediately by another heading, with no lines
          in between, produces no section of its own.
        * The final section is flushed when the input ends, so trailing
          content and heading-less documents are not lost.

    Args:
        text: The full document text.

    Returns:
        list[str]: One string per section, in document order.
    """
    sections = []
    current_heading = ""
    buffer = []

    def flush():
        # Emit the buffered lines, if any, under the heading they followed.
        if buffer:
            sections.append(current_heading + "Content: " + "\n".join(buffer).strip())
            buffer.clear()

    for line in text.splitlines():
        stripped = line.strip()
        if is_heading(stripped):
            flush()
            current_heading = stripped
        else:
            buffer.append(stripped)

    # Nothing else triggers a flush for the last section.
    flush()
    return sections

# Heuristic patterns for heading lines, compiled once instead of on every call.
_HEADING_PATTERNS = (
    re.compile(r"^[A-Z][A-Z\s]{3,}$"),  # ALL CAPS
    re.compile(r"^[A-Z][a-z]{1,15}(\s+[A-Z][a-z]{1,15}){0,4}$"),  # Title Case
    re.compile(r"^[a-z]?\)?\s?[A-Z].{0,50}$"),  # a) Heading Y style
)


def is_heading(line):
    """Return True when ``line`` looks like a section heading.

    The line is stripped first. It counts as a heading when it is shorter
    than 100 characters and matches at least one heuristic pattern:

    * ALL CAPS: an uppercase letter followed by three or more uppercase
      letters or whitespace characters.
    * Title Case: one to five capitalised words of 2-16 letters each.
    * ``a) Heading`` style: an optional lowercase letter, optional ``)``
      and optional space, then an uppercase letter and up to 50 more
      characters.

    Note that the last pattern's prefix is fully optional, so any line of
    at most 51 characters that starts with an uppercase letter is accepted,
    short sentences included.

    Args:
        line: A single line of text.

    Returns:
        bool: True if the line matches a heading heuristic, else False.
    """
    line = line.strip()
    if len(line) >= 100:
        return False
    # any() gives a real bool rather than leaking a Match object or None.
    return any(pattern.match(line) for pattern in _HEADING_PATTERNS)

In [11]:
def chunk_text_modelsized(text, expansion_name, model_name="sentence-transformers/all-MiniLM-L6-v2", chunk_size=256, chunk_overlap=20):
    """Split ``text`` by token count and label each chunk with its expansion.

    Splitting is done by ``SentenceTransformersTokenTextSplitter``. Each
    resulting chunk is prefixed with the line
    ``These rules apply to the <expansion_name> game``. The prefix is added
    after splitting, so it is not counted against ``chunk_size``.

    Args:
        text: The document text to split.
        expansion_name: Human-readable rulebook name put into the prefix.
        model_name: Forwarded to the splitter as ``model_name``.
        chunk_size: Forwarded to the splitter as ``tokens_per_chunk``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        list[str]: The prefixed chunks, in the order the splitter produced them.
    """
    token_splitter = SentenceTransformersTokenTextSplitter(
        model_name=model_name,
        tokens_per_chunk=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    labelled = []
    for chunk in token_splitter.split_text(text):
        labelled.append(f"These rules apply to the {expansion_name} game\n{chunk}")
    return labelled

In [8]:
def save_chunks_to_json(chunks, filename, output_dir="chunks"):
    """Write ``chunks`` to ``<output_dir>/<filename>`` as a JSON array.

    Each chunk becomes an object ``{"id": <position>, "text": <chunk>}``,
    with ``id`` counting from 0. The output directory is created if it is
    missing. The file is UTF-8, indented by two spaces, and non-ASCII
    characters are written unescaped. A one-line summary is printed.

    Args:
        chunks: Iterable of chunk values (strings in this notebook).
        filename: Name of the JSON file to create inside ``output_dir``.
        output_dir: Directory to write into; defaults to ``"chunks"``.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, filename)

    # One record per chunk, so an id travels with every text.
    records = []
    for position, chunk in enumerate(chunks):
        records.append({"id": position, "text": chunk})

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"Saved {len(records)} chunks to {destination}")

In [13]:
# Process every rulebook PDF in data/ into labelled, token-sized chunks.

# Maps a PDF's base filename (without ".pdf") to the human-readable name that
# chunk_text_modelsized prepends to every chunk of that rulebook.
rulebook_translations = {
    "catan_barbarians_3to4p": "Traders and Barbarians 3 to 4 players",
    "catan_barbarians_5to6p": "Traders and Barbarians 5 to 6 players",
    "catan_base_3to4p": "Base 3 to 4 players",
    "catan_base_5to6p": "Base 5 to 6 players",
    "catan_knights_3to4p": "Cities and Knights 3 to 4 players",
    "catan_knights_5to6p": "Cities and Knights 5 to 6 players",
    "catan_pirates_3to4p": "Explorers and Pirates 3 to 4 players",
    # Key must equal the PDF's base filename, which is spelled "5to4p".
    "catan_pirates_5to4p": "Explorers and Pirates 5 to 6 players",
    "catan_seafarers_3to4p": "Seafarers 3 to 4 players",
    "catan_seafarers_5to6p": "Seafarers 5 to 6 players"
}

# Chunking settings shared by every rulebook in this run.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
TOKENS_PER_CHUNK = 112
TOKEN_OVERLAP = 22

data_dir = "data"

# os.listdir returns entries in arbitrary order; sort so runs are reproducible.
pdf_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".pdf"))

# Check the mapping before doing any work, so an unmapped PDF is reported
# clearly instead of raising a KeyError halfway through the loop.
unmapped = [file for file in pdf_files if os.path.splitext(file)[0] not in rulebook_translations]
if unmapped:
    raise ValueError(
        "No entry in rulebook_translations for: " + ", ".join(unmapped)
    )

for file in pdf_files:
    filename = os.path.splitext(file)[0]
    pdf_path = os.path.join(data_dir, file)
    chunk_path = filename + ".json"

    print(f"Processing: {pdf_path}")

    text = read_pdf_smart(pdf_path)
    chunks = chunk_text_modelsized(
        text,
        rulebook_translations[filename],
        EMBEDDING_MODEL_NAME,
        TOKENS_PER_CHUNK,
        TOKEN_OVERLAP,
    )

    save_chunks_to_json(chunks, chunk_path)

Processing: data/catan_barbarians_3to4p.pdf
Saved 232 chunks to chunks/catan_barbarians_3to4p.json
Processing: data/catan_barbarians_5to6p.pdf
Saved 46 chunks to chunks/catan_barbarians_5to6p.json
Processing: data/catan_base_3to4p.pdf
Saved 157 chunks to chunks/catan_base_3to4p.json
Processing: data/catan_base_5to6p.pdf
Saved 32 chunks to chunks/catan_base_5to6p.json
Processing: data/catan_knights_3to4p.pdf
Saved 185 chunks to chunks/catan_knights_3to4p.json
Processing: data/catan_knights_5to6p.pdf
Saved 16 chunks to chunks/catan_knights_5to6p.json
Processing: data/catan_pirates_3to4p.pdf
Saved 153 chunks to chunks/catan_pirates_3to4p.json
Processing: data/catan_pirates_5to4p.pdf
Saved 33 chunks to chunks/catan_pirates_5to4p.json
Processing: data/catan_seafarers_3to4p.pdf
Saved 204 chunks to chunks/catan_seafarers_3to4p.json
Processing: data/catan_seafarers_5to6p.pdf
Saved 94 chunks to chunks/catan_seafarers_5to6p.json


# Tokenize and Index

In [14]:
def load_chunks_from_folder(folder_path):
    """Load the ``text`` field of every chunk stored in a folder of JSON files.

    Each ``*.json`` file in ``folder_path`` must contain a list of objects
    with a ``"text"`` key. Other files are ignored. Files are read in sorted
    filename order, so the returned list has the same order on every run and
    every filesystem. This matters because positions in this list are later
    used as row ids in the FAISS index.

    Args:
        folder_path: Directory containing the chunk JSON files.

    Returns:
        list: All chunk texts, file by file, in each file's stored order.
    """
    all_texts = []
    # os.listdir gives no ordering guarantee, hence the explicit sort.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)
        all_texts.extend(item['text'] for item in data)
    return all_texts

In [15]:
def embed_and_index(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    """Embed ``chunks`` and build a flat L2 FAISS index over the embeddings.

    Args:
        chunks: Iterable of chunk texts. It is materialised into a list so
            that row ``i`` of the index always corresponds to item ``i`` of
            the returned lookup, even when a one-shot iterable is passed.
        model_name: SentenceTransformer model used to encode the chunks.
        batch_size: Batch size passed to ``model.encode``.

    Returns:
        tuple: ``(index, chunks)``, where ``index`` is a ``faiss.IndexFlatL2``
        holding one vector per chunk and ``chunks`` is the list of texts in
        the order they were added.

    Raises:
        ValueError: If ``chunks`` is empty. The index dimension is taken from
            the embedding matrix, which does not exist for empty input.
    """
    chunks = list(chunks)
    if not chunks:
        raise ValueError("embed_and_index needs at least one chunk to embed")

    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)

    # The index dimension comes from the model's output width.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, chunks

In [16]:
# Gather the text of every chunk saved under chunks/ into a single list.
folder_path = "chunks/"
chunks = load_chunks_from_folder(folder_path)
print(f"Loaded {len(chunks)} documents.")

# Embed all chunks and build the FAISS index. chunk_lookup is the list of
# chunk texts that embed_and_index returns alongside the index.
index, chunk_lookup = embed_and_index(chunks)
print("FAISS index built.")

Loaded 1152 documents.


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

FAISS index built.


In [17]:
# Persist the FAISS index and the chunk texts it was built from.
# open() does not create missing directories, so make sure index/ exists
# first, as save_chunks_to_json does for its own output directory.
os.makedirs("index", exist_ok=True)
faiss.write_index(index, "index/my_index.faiss")
with open("index/chunk_lookup.json", "w", encoding="utf-8") as f:
    json.dump(chunk_lookup, f)

# Retrieval

In [18]:
# Load the embedding model that search_faiss uses to encode queries. This is
# the same model name embed_and_index uses by default to embed the chunks.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

def search_faiss(query, k=3):
    """Print the ``k`` chunks closest to ``query`` in the FAISS index.

    Relies on the notebook globals ``model`` (query encoder), ``index``
    (FAISS index) and ``chunk_lookup`` (chunk texts, where position ``i``
    corresponds to index row ``i``).

    When the index holds fewer than ``k`` vectors, FAISS pads the result
    with index ``-1``. Those slots are skipped; otherwise
    ``chunk_lookup[-1]`` would print the last chunk as if it were a hit.

    Args:
        query: The question or search text.
        k: Number of nearest neighbours to request.

    Returns:
        None. Results are printed only.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)

    print(f"\nTop {k} results for: \"{query}\"\n")
    for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
        if idx < 0:
            # Padding slot: no neighbour exists for this rank.
            continue
        print(f"Result #{rank} (Distance: {dist:.4f})")
        print(chunk_lookup[idx])
        print("-" * 80)

In [19]:
# Spot-check retrieval with questions whose answers are known. The comment
# above each question records where the answer is in the source PDFs.
for query in (
    # Answer in catan_base_3to4p.pdf at page 11 (you roll the numbers where your settlements are)
    "How do you acquire resources during the game?",
    # Answer in catan_base_3to4p.pdf at page 5 (5 continuous road segments (and longest))
    "How do you get the Longest Road special card and what happens if another player builds a longer road?",
    # Answer in catan_seafarers_5to6p.pdf at page 2 (you need Catan & Catan 5&6p, and the Seafarers game)
    "What do you need to play a Seafarers 5-6 Player scenario?",
    # Answer in catan_seafarers_5to6p.pdf at page 2 (assemble the frame as in the photo and place tiles ...)
    "How should you assemble the game board for a Seafarers scenario?",
    # Answer in catan_barbarians_3to4p.pdf at page 7 (must compare knight strength to barbarian strength)
    "What happens when the barbarian ship reaches Catan?",
    # Answer in catan_barbarians_3to4p.pdf at page 6 (must be activated by paying 1 grain, then he can be used)
    "How are knights used in the game, and what actions can they perform?",
):
    search_faiss(query, k=2)


Top 2 results for: "How do you acquire resources during the game?"

Result #1 (Distance: 0.9400)
These rules apply to the Base 3 to 4 players game
with the game board laid out randomly. the board changes each game. sort the resource cards into 5 stacks and put them face up if you would like to use the variable set - up, you can find the next to the game board. illustration b guidelines in the almanac under set - up, variable y. also look shuffle the development for useful tips under set - up phase y and tactics y. cards and place them face down by the board. s u g etting p the ame you receive resources y select a color and take your 5 settlements
--------------------------------------------------------------------------------
Result #2 (Distance: 0.9671)
These rules apply to the Cities and Knights 3 to 4 players game
2 : 1 rate. you can make as many such trades as you wish. as usual, you may exchange commodities for resources and vice versa. resource monopoly ( 4 ) resource monopoly n