In [7]:
import re
import nltk
import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

# Fetch the NLTK 'punkt' tokenizer data when this module/cell runs.
# NOTE(review): the code visible here segments sentences with a regex
# (see clean_and_segment) and never calls the Punkt tokenizer, so this
# download may be unnecessary -- confirm before removing.
nltk.download('punkt')

# === CONFIG ===
# Raw text file read by main().
INPUT_PATH = './data/data.txt'
# Destination file for the deduplicated chunks written by main().
OUTPUT_PATH = './data/curated.txt'
# Default per-chunk budget for chunk_sentences(); counted in
# whitespace-separated words, not model word-pieces.
MAX_TOKENS = 200
# Model name handed to SentenceTransformer(...) in main().
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Default DBSCAN `eps` in cluster_embeddings(); applied to the distance
# 1 - cosine_similarity, so 0.15 corresponds to similarity >= 0.85.
CLUSTER_EPS = 0.15
# Default DBSCAN `min_samples` in cluster_embeddings().
MIN_SAMPLES = 2

# === STEP 1: Load and Clean ===
def load_text(filepath):
    """Return the full contents of the UTF-8 text file at ``filepath``.

    Args:
        filepath: Path of the file to read.

    Returns:
        The entire file as a single string.
    """
    with open(filepath, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def clean_and_segment(text):
    """Normalise whitespace in ``text`` and split it into sentences.

    Every run of whitespace (including newlines) is collapsed to a single
    space. The text is then split wherever sentence-ending punctuation
    (``.``, ``!`` or ``?``) is followed by whitespace and an uppercase
    ASCII letter; the punctuation stays attached to the preceding sentence.

    Args:
        text: Raw input text.

    Returns:
        A list of stripped, non-empty sentence strings in document order.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z])', collapsed)
    # Strip each piece once, then drop the ones that end up empty.
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]

# === STEP 2: Chunking ===
def chunk_sentences(sentences, max_tokens=MAX_TOKENS):
    """Pack consecutive sentences into chunks of about ``max_tokens`` words.

    "Tokens" here are whitespace-separated words (``str.split``). Sentences
    are never split: a sentence joins the current chunk while the running
    word count stays within ``max_tokens``; otherwise the current chunk is
    closed and the sentence starts a new one. A single sentence longer than
    ``max_tokens`` therefore becomes one oversized chunk of its own.

    Args:
        sentences: Iterable of sentence strings, in document order.
        max_tokens: Maximum number of words per chunk.

    Returns:
        A list of chunk strings (words joined by single spaces), in input
        order. Empty chunks are never emitted.
    """
    chunks, current_chunk, token_count = [], [], 0
    for sentence in sentences:
        tokens = sentence.split()
        if token_count + len(tokens) > max_tokens:
            # Flush only a chunk that has content. Without this guard an
            # over-long first sentence would flush the still-empty chunk
            # and emit "" as a chunk.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = tokens
            token_count = len(tokens)
        else:
            current_chunk.extend(tokens)
            token_count += len(tokens)
    # Emit whatever is left over after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# === STEP 3: Embedding ===
def embed_chunks(chunks, model):
    """Encode ``chunks`` with ``model`` and return the resulting embeddings.

    Args:
        chunks: Sequence of text chunks to embed.
        model: Object exposing ``encode(texts, convert_to_tensor=...)``;
            main() passes a ``SentenceTransformer`` instance.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=False`` (presumably a NumPy array for
        SentenceTransformer -- confirm against the library docs).
    """
    vectors = model.encode(chunks, convert_to_tensor=False)
    return vectors

# === STEP 4: Clustering Similar Chunks ===
def cluster_embeddings(embeddings, eps=CLUSTER_EPS, min_samples=MIN_SAMPLES):
    """Group near-duplicate embeddings with DBSCAN over cosine distance.

    The full pairwise cosine-similarity matrix is computed, clipped to
    ``[0, 1]`` and converted to a distance matrix ``1 - similarity`` that
    is handed to DBSCAN as a precomputed metric. Note that the clip does
    two things: it removes floating-point overshoot above 1, and it maps
    every negative similarity to 0 (distance 1).

    Args:
        embeddings: 2-D array-like with one embedding per row.
        eps: DBSCAN neighbourhood radius, in cosine-distance units.
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        Array with one integer cluster label per row of ``embeddings``.
        Per the scikit-learn docs, DBSCAN marks unclustered (noise)
        samples with ``-1``. An empty input yields an empty label array.
    """
    # scikit-learn's input validation is expected to reject zero-sample
    # input, so short-circuit here to let an empty corpus flow through the
    # pipeline instead of crashing.
    if len(embeddings) == 0:
        return np.empty(0, dtype=np.intp)

    # NOTE: this materialises an n x n matrix, so memory grows
    # quadratically with the number of chunks.
    sim_matrix = np.clip(cosine_similarity(embeddings), 0, 1)
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    return clustering.fit_predict(1 - sim_matrix)

# === STEP 5: Deduplicate: Keep Longest Chunk per Cluster ===
def filter_redundant_chunks(chunks, labels):
    """Keep one representative chunk per cluster, plus every unclustered one.

    Chunks whose label is ``-1`` are each kept as their own group. Chunks
    sharing any other label form one group, from which only the chunk with
    the most whitespace-separated words survives (the earliest one on ties).

    Args:
        chunks: Sequence of chunk strings.
        labels: Cluster label for each chunk, aligned by index.

    Returns:
        The surviving chunks, ordered by the first appearance of their group.
    """
    groups = {}
    for position, cluster_id in enumerate(labels):
        if cluster_id == -1:
            group_key = f'unique_{position}'
        else:
            group_key = f'cluster_{cluster_id}'
        if group_key not in groups:
            groups[group_key] = []
        groups[group_key].append(chunks[position])

    # max() returns the first maximal element, matching a stable
    # descending sort followed by taking index 0.
    return [
        max(members, key=lambda text: len(text.split()))
        for members in groups.values()
    ]

# === STEP 6: Save ===
def write_chunks(chunks, out_path):
    """Write ``chunks`` to ``out_path`` as UTF-8, one blank line after each.

    Each chunk is stripped of surrounding whitespace and followed by two
    newline characters. An existing file at ``out_path`` is overwritten.

    Args:
        chunks: Iterable of chunk strings.
        out_path: Destination file path.
    """
    with open(out_path, mode='w', encoding='utf-8') as sink:
        sink.writelines(f"{piece.strip()}\n\n" for piece in chunks)

# === MAIN PIPELINE ===
def main():
    """Run the curation pipeline end to end, printing progress as it goes.

    Steps: read INPUT_PATH, split into sentences, pack sentences into
    chunks, embed the chunks with the EMBEDDING_MODEL sentence transformer,
    cluster the embeddings, keep one chunk per cluster, and write the
    result to OUTPUT_PATH.
    """
    print("Loading and cleaning...")
    sentence_list = clean_and_segment(load_text(INPUT_PATH))

    print("Chunking...")
    chunks = chunk_sentences(sentence_list)
    print(f"Generated {len(chunks)} chunks.")

    print("Embedding...")
    encoder = SentenceTransformer(EMBEDDING_MODEL)
    chunk_vectors = embed_chunks(chunks, encoder)

    print("Clustering...")
    cluster_ids = cluster_embeddings(chunk_vectors)

    print("Filtering redundant chunks...")
    deduped_chunks = filter_redundant_chunks(chunks, cluster_ids)
    print(f"{len(deduped_chunks)} chunks after deduplication.")

    print("Saving...")
    write_chunks(deduped_chunks, OUTPUT_PATH)
    print(f"Done. Output saved to {OUTPUT_PATH}")

# Run the pipeline only when this module is the entry point
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /home/amon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading and cleaning...
Chunking...
Generated 8221 chunks.
Embedding...
Clustering...
Filtering redundant chunks...
1225 chunks after deduplication.
Saving...
Done. Output saved to ./data/curated.txt
