In [1]:
import pickle
import numpy as np
import pandas as pd
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

dataset = pd.read_csv('dataset/Natural-Questions-Filtered.csv')[:1000]
dataset.head(5)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,question,long_answers,short_answers
0,which is the most common use of opt-in e-mail ...,A common example of permission marketing is a ...,A newsletter sent to an advertising firm's cus...
1,how i.met your mother who is the mother,"Tracy McConnell, better known as `` The Mother...",Tracy McConnell
2,who had the most wins in the nfl,Active quarterback Tom Brady holds the records...,Tom Brady
3,who played mantis guardians of the galaxy 2,Pom Klementieff (born May 1986) is a French ac...,Pom Klementieff
4,the nashville sound brought a polished and cos...,"In the early 1960s, the Nashville sound began ...",The use of lush string arrangements with a rea...


In [2]:
documents = list(dataset['long_answers'])
num_documents = len(documents)

In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model.encode(documents, show_progress_bar=True, convert_to_numpy=True).astype('float32')
dim = doc_embeddings.shape[1]

candidate_ks = list(range(5, min(50, num_documents // 10 + 1)))
best_k = candidate_ks[0]
best_score = -1
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(doc_embeddings)
    if k > 1:
        score = silhouette_score(doc_embeddings, cluster_labels)
        print(f"k={k}, silhouette score: {score:.4f}")
        if score > best_score:
            best_score = score
            best_k = k

print("Best k based on silhouette score:", best_k)

Batches: 100%|██████████| 32/32 [00:02<00:00, 14.83it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


k=5, silhouette score: 0.0286
k=6, silhouette score: 0.0294
k=7, silhouette score: 0.0318
k=8, silhouette score: 0.0278
k=9, silhouette score: 0.0300
k=10, silhouette score: 0.0303
k=11, silhouette score: 0.0310
k=12, silhouette score: 0.0285
k=13, silhouette score: 0.0283
k=14, silhouette score: 0.0254
k=15, silhouette score: 0.0289
k=16, silhouette score: 0.0292
k=17, silhouette score: 0.0287
k=18, silhouette score: 0.0280
k=19, silhouette score: 0.0266
k=20, silhouette score: 0.0251
k=21, silhouette score: 0.0235
k=22, silhouette score: 0.0217
k=23, silhouette score: 0.0241
k=24, silhouette score: 0.0233
k=25, silhouette score: 0.0241
k=26, silhouette score: 0.0246
k=27, silhouette score: 0.0215
k=28, silhouette score: 0.0213
k=29, silhouette score: 0.0217
k=30, silhouette score: 0.0217
k=31, silhouette score: 0.0219
k=32, silhouette score: 0.0203
k=33, silhouette score: 0.0212
k=34, silhouette score: 0.0210
k=35, silhouette score: 0.0213
k=36, silhouette score: 0.0219
k=37, silhoue

In [4]:
# FAISS를 이용해 최적의 클러스터 개수(best_k)로 클러스터링
index_flat = faiss.IndexFlatL2(dim)
clustering = faiss.Clustering(dim, best_k)
clustering.niter = 20  # 클러스터링 반복 횟수
clustering.train(doc_embeddings, index_flat)
centroids = clustering.centroids  # shape: [best_k, dim]

In [None]:
# 각 문서를 가장 가까운 클러스터(centroid)에 할당
centroid_index = faiss.IndexFlatL2(dim)
centroid_index.add(centroids)
_, cluster_assignments = centroid_index.search(doc_embeddings, 1)
cluster_assignments = cluster_assignments.squeeze()

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# 클러스터별 문서 인덱스 매핑 생성
clusters = {i: [] for i in range(best_k)}
for doc_idx, cluster_id in enumerate(cluster_assignments):
    clusters[int(cluster_id)].append(doc_idx)

In [None]:
# 각 클러스터별로 별도의 FAISS 인덱스 구축 (클러스터 내 빠른 검색을 위해)
cluster_indexes = {}
for cluster_id, doc_idxs in clusters.items():
    if len(doc_idxs) > 0:
        cluster_embeddings = doc_embeddings[doc_idxs]
        index_cluster = faiss.IndexFlatL2(dim)
        index_cluster.add(cluster_embeddings)
        cluster_indexes[cluster_id] = index_cluster

In [None]:
# FAISS centroid index 저장
faiss.write_index(centroid_index, "centroid_index.index")

# 각 클러스터별 인덱스 저장 (파일 이름에 cluster_id 포함)
for cluster_id, index_cluster in cluster_indexes.items():
    faiss.write_index(index_cluster, f"cluster_index_{cluster_id}.index")

# 클러스터 매핑 정보, 문서 목록, 메타 정보 저장 (pickle 사용)
with open("clusters_mapping.pkl", "wb") as f:
    pickle.dump(clusters, f)

with open("documents.pkl", "wb") as f:
    pickle.dump(documents, f)

with open("meta_info.pkl", "wb") as f:
    meta_info = {"best_k": best_k, "dim": dim}
    pickle.dump(meta_info, f)

print("FAISS 인덱스와 관련 데이터가 저장되었습니다.")