# Download the datasets

### 21 Mil WikiDPR

In [None]:
from datasets import load_dataset

# Open the WikiDPR "psgs_w100.multiset.no_index" configuration with streaming enabled.
dataset = load_dataset(
    "wiki_dpr",
    "psgs_w100.multiset.no_index",
    streaming=True,
)

In [None]:
import numpy as np
# print(next(iter(dataset['train'])))
# embeds = np.array([doc['emb'] for doc in dataset['train']])
# print("Embeddings:")
# print(embeds)

# get the document of id 500
# temp = dataset['train'].filter(lambda x: x['id'] == 0)
# print("Document of id 500:")
# print(next(iter(temp)))

# dataset_head = dataset['train'].skip(10000)
# dataset_head = dataset_head.take(1)
# print("Head:")
# print(next(iter(dataset_head)))



### Cohere wiki 485k dataset

In [None]:
import pandas
from datasets import load_dataset

# Load the "train" split of the Cohere simple-Wikipedia embeddings dataset.
docs = load_dataset(
    "Cohere/wikipedia-22-12-simple-embeddings",
    split="train",
)

In [None]:
# Dump the embedding of every document to embeds2.txt, one document per line.
# The file is opened once for the whole loop instead of being reopened for
# every single document; the bytes appended are the same.
with open("embeds2.txt", "a") as f:
    for doc in docs:
        f.write(str(doc['emb']))
        f.write('\n')

# Clustering Using Faiss KMeans

### Reading the dataset from the pre-downloaded data

In [None]:
# Read the embedding batches stored in ./Data/dataset into one big numpy array
import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate

embeds = []
for i in range(5):

    # the `with` block closes the file on exit, so no explicit close() is needed
    with open(f"./Data/dataset/{i}-embeds-batch.txt", "r") as f:
        # one space-separated embedding per line; blank lines are skipped so
        # they cannot turn into empty vectors
        lines = [np.fromstring(line, sep=" ") for line in f if line.strip()]

    # normalize the embeddings (each vector is divided by its own norm)
    lines = [line / np.linalg.norm(line) for line in lines]
    if lines:
        embeds.append(lines)

    # # write the normalized embeddings to a file line by line
    # with open(f"./Data/normalizedDataset/{i}-embeds-batch.txt", "a") as f:
    #     for line in lines:
    #         line = np.array(line)
    #         f.write(str(line).replace('[', '').replace(']', '').replace('  ',' '))
    #         f.write('\n')


# Concatenate the per-batch lists into a single 2-D array
embeds = np.concatenate(embeds, axis=0)

# print the norm of the first 10 embeddings
for i in range(10):
    print(np.linalg.norm(embeds[i]))


### Constructing the clusters using Faiss

In [None]:
!pip install faiss-gpu

In [None]:
# Cluster the embeddings with FAISS k-means and save the resulting index to disk.
import faiss
import numpy as np

# clustering configuration
dim = 768
ncentroids = 100
niter = 300
verbose = True
gpu = True

kmeans = faiss.Kmeans(
    dim,
    ncentroids,
    niter=niter,
    verbose=verbose,
    gpu=gpu,
    min_points_per_centroid=100,
    max_points_per_centroid=100000,
    nredo=2,
    spherical=True,
)

# train on a float32 copy of the embeddings
train_vectors = embeds.astype(np.float32)
kmeans.train(train_vectors)

faiss.write_index(kmeans.index, "./Data/KMeansFAISS/index.bin")

#### Mapping each point to its nearest centroid

In [None]:
import faiss
import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate


# read the faiss index
index = faiss.read_index("./Data/KMeansFAISS/index.bin")

# mapping each point to its 2 nearest centroids
# the vectors are cast to contiguous float32 explicitly, as the training cell does
_, I = index.search(np.ascontiguousarray(embeds, dtype=np.float32), 2)

# flatten the (n_points, 2) result into one flat list of plain Python ints
I_flat = I.ravel().tolist()

# write the labels to a file where I_flat holds the centroid indices for each embedding
# "w" keeps the file to exactly one run: the labels are written without a trailing
# separator, so appending a second run would fuse the last and first numbers together
with open("./Data/labels/faiss_labels.txt", "w") as f:
    # join the numbers directly instead of cleaning up the string form of a list
    f.write(" ".join(map(str, I_flat)))

### Write the centroids to a file

For each centroid file, we need to manually remove square brackets "[" or "]"

In [None]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate
centroids_filename = "D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/centroids/final-centroids.txt"
# "w" so that re-running the cell does not append a second copy of the centroids
with open(centroids_filename, "w") as f:
    # write the centroids one per line: no square brackets, values separated by
    # exactly one space, no leading or trailing whitespace

    for i in range(ncentroids):
        row_text = str(kmeans.centroids[i]).replace("[", "").replace("]", "")
        # split()/join collapses whitespace runs of any length, which a single
        # replace("  ", " ") pass does not
        f.write(" ".join(row_text.split()))
        f.write("\n")

### Create the labels file, where each item from dataset (in batches) is mapped to its closest two centroids

### Getting the nearest clusters to the given query

In [None]:
# read queries and compute their nearest clusters
import faiss
import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate

# read the faiss index
index = faiss.read_index("./Data/KMeansFAISS/index.bin")

# one space-separated query vector per line; blank lines are ignored
with open("./Data/query.txt", "r") as f:
    lines = [np.fromstring(line, sep=" ") for line in f if line.strip()]
queries = np.array(lines)

# Normalize every query by its own norm. Dividing by np.linalg.norm(queries)
# would use the norm of the whole matrix, which is only right for a single query.
queries = queries / np.linalg.norm(queries, axis=1, keepdims=True)

# the 10 nearest centroids per query; the vectors are passed to FAISS as float32
_, I = index.search(np.ascontiguousarray(queries, dtype=np.float32), 10)
I_flat = I.ravel().tolist()

# write the centroid indices as space-separated integers
# (str() of a list of arrays would write "array(...)" text into the file)
with open("./Data/labels/query_faiss_labels.txt", "w") as f:
    f.write(" ".join(map(str, I_flat)))

For each label file, we need to manually remove double spaces "  " and any square brackets "[" or "]"

In [None]:
def assign_labels(X, centroids, assigned_clusters=2):
    """Assign every row of ``X`` to its most similar centroids by inner product.

    Parameters
    ----------
    X : 2-D array, one point per row.
    centroids : 2-D array, one centroid per row, same width as ``X``.
    assigned_clusters : how many centroids to keep per point (default 2).

    Returns
    -------
    1-D ``int64`` array of length ``X.shape[0] * assigned_clusters``. The
    entries for point ``i`` are stored contiguously starting at
    ``i * assigned_clusters``, ordered from most to least similar.

    Raises
    ------
    ValueError
        If there are fewer centroids than ``assigned_clusters``.
    """
    X = np.asarray(X)
    centroids = np.asarray(centroids)
    if centroids.shape[0] < assigned_clusters:
        raise ValueError(
            f"need at least {assigned_clusters} centroids, got {centroids.shape[0]}"
        )

    # Inner product is a similarity: larger means closer. The scores are negated
    # so that the ascending argsort ranks the most similar centroid first
    # (sorting the raw scores ascending would pick the least similar ones).
    similarities = X @ centroids.T
    ranked = np.argsort(-similarities, axis=1, kind="stable")
    return ranked[:, :assigned_clusters].astype(np.int64).ravel()

import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate
centroids_filename = "D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/centroids/final-centroids.txt"

# #print the norm of the centroids
# centroids = np.loadtxt(centroids_filename)
# for i in range(100):
#     print(np.linalg.norm(centroids[i]))


# the centroids are identical for every batch, so they are loaded once up front
centroids = np.loadtxt(centroids_filename)
# #normalize the centroids
# centroids = centroids / np.linalg.norm(centroids, axis=1)[:, np.newaxis]

for i in range(5):
    with open(f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/dataset/{i}-embeds-batch.txt", "r") as f:
        # read each line and convert to numpy array
        lines = [np.fromstring(line, sep=" ") for line in f]
    # convert to numpy array
    embeds = np.array(lines)
    # print(embeds.shape)
    # #normalize the embeddings
    # embeds = embeds / np.linalg.norm(embeds, axis=1)[:, np.newaxis]
    labels = assign_labels(embeds, centroids)
    # a separate handle name avoids shadowing the input file `f`;
    # "w" because the labels are written as a single line without a trailing
    # separator, so an appended second run would fuse numbers together
    with open(f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/labels/labels{i}.txt", "w") as labels_file:
        # space-separated integers, no brackets, no repeated spaces
        labels_file.write(" ".join(map(str, labels)))

# Attempts


## KD tree

### Constructing KD tree

In [None]:
from sklearn.neighbors import KDTree
from scipy.spatial import cKDTree
# Build a KD-tree over whatever array is currently bound to `embeds`.
# NOTE(review): no leaf-size argument is passed in this call, so the
# "leafSize = 16" remark below is not enforced here — confirm the intended value.
tree = KDTree(embeds) # leafSize = 16
# Alternative implementation from scipy, left disabled:
# tree = cKDTree(embeds)

In [None]:
# Unpack the four arrays returned by tree.get_arrays() and display the node array.
# NOTE(review): this rebinds the global name `index`, which other cells use for
# the FAISS index; re-create the FAISS index before using `index` again.
tree_data, index, tree_nodes, node_bounds = tree.get_arrays()
tree_nodes

## K means

### Running K means on each batch
- 100k batches
- each batch 100 clusters

In [None]:
# Run scikit-learn k-means on a single batch file and store its 100 centroids
import os
import numpy as np
from sklearn.cluster import KMeans

np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate

with open("./cmake-build-debug/batches/1-embeds-batch.txt", "r") as f:
    # read each line and convert to numpy array
    lines = [np.fromstring(line, sep=" ") for line in f]
# convert to numpy array
embeds = np.array(lines)

# apply kmeans on embeds and get the centroids
# NOTE(review): this rebinds the global `kmeans` also used by the FAISS cells
kmeans = KMeans(n_clusters = 100).fit(embeds)

# One centroid per line, space separated and without square brackets, so the
# file can be read back with np.loadtxt without manual clean-up.
# A separate handle name avoids shadowing the input file `f`.
with open("./cmake-build-debug/batches/centroids1.txt", "w") as centroids_file:
    for center in kmeans.cluster_centers_:
        center_text = str(center).replace("[", "").replace("]", "")
        centroids_file.write(" ".join(center_text.split()))
        centroids_file.write("\n")


Collecting all centroid files into one array
* Make sure that the centroid files don't contain any square brackets (`[` or `]`), otherwise `np.loadtxt` cannot parse them

In [None]:
# Stack the per-batch centroid files (centroids0.txt .. centroids4.txt) row-wise, in order.
per_batch_centroids = [
    np.loadtxt(f"./cmake-build-debug/batches/centroids{batch}.txt")
    for batch in range(5)
]
centroids = np.concatenate(per_batch_centroids, axis=0)

Applying K-means on all the centroids

In [None]:
# apply kmeans on the collected per-batch centroids to get the 100 final centroids
kmeans = KMeans(n_clusters = 100).fit(centroids)

# One centroid per line, space separated and without square brackets, so the
# file can be read back with np.loadtxt without manual clean-up.
# NOTE(review): the labelling cell reads
# ./cmake-build-debug/batches/centroids/final-centroids.txt, which is a different
# folder from the one written here — confirm which path is intended.
with open("./cmake-build-debug/batches/final-centroids.txt", "w") as f:
    for center in kmeans.cluster_centers_:
        center_text = str(center).replace("[", "").replace("]", "")
        f.write(" ".join(center_text.split()))
        f.write("\n")

Assigning labels to each batch according to the final centroids

In [None]:
def assign_labels(X, centroids):
    """Return, for every row of ``X``, the indices of its two closest centroids.

    Closeness is the Euclidean (L2) distance. The result is a flat ``int64``
    array of length ``2 * X.shape[0]``: entry ``2*i`` is the nearest centroid
    of point ``i`` and entry ``2*i + 1`` is the second nearest.
    """
    per_point = 2
    labels = np.empty(X.shape[0] * per_point, dtype=np.int64)
    for row, point in enumerate(X):
        # rank the centroids from closest to farthest for this point
        ranking = np.argsort(np.linalg.norm(point - centroids, axis=1))
        nearest, runner_up = ranking[0], ranking[1]
        base = 2 * row
        labels[base] = nearest
        labels[base + 1] = runner_up
    return labels

import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate


# the final centroids are the same for every batch, so they are loaded once up front
centroids = np.loadtxt("./cmake-build-debug/batches/centroids/final-centroids.txt")

for i in range(5):
    with open(f"./cmake-build-debug/batches/dataset/{i}-embeds-batch.txt", "r") as f:
        # read each line and convert to numpy array
        lines = [np.fromstring(line, sep=" ") for line in f]
    # convert to numpy array
    embeds = np.array(lines)
    # print(embeds.shape)
    labels = assign_labels(embeds, centroids)
    # Write the labels as space-separated integers on one line, without brackets:
    # the cluster-construction cell parses this file with int(x) for every token,
    # which fails on the "[" and "]" produced by str(labels).
    # A separate handle name avoids shadowing the input file `f`.
    with open(f"./cmake-build-debug/batches/labels/labels{i}.txt", "w") as labels_file:
        labels_file.write(" ".join(map(str, labels)))

Constructing clusters

In [None]:
# load the embeddings of batch 1, one vector per line
with open("./cmake-build-debug/batches/dataset/1-embeds-batch.txt", "r") as f:
    lines = [np.fromstring(row, sep=" ") for row in f]
    embeds = np.array(lines)

# load the labels of batch 1 from the first line of the label file
with open("./cmake-build-debug/batches/labels/labels1.txt", "r") as f:
    labels = list(map(int, next(f).split()))

# The labels hold two entries per embedding, so label position p belongs to
# embedding p // 2. For every cluster id, gather its embeddings and append
# them to that cluster's file.
for i in range(100):
    indices = []
    for position, label in enumerate(labels):
        if label == i:
            indices.append(position // 2)
    current_embeds = embeds[indices]
    with open(f"./cmake-build-debug/batches/clusters/{i}.txt", "a") as f:
        f.write(str(current_embeds))

In [None]:
# Append a single newline to each of the 100 cluster files.
# NOTE(review): the files touched here are named cluster{i}.txt, whereas the
# cell that builds the clusters writes {i}.txt — confirm the intended name.
for i in range(100):
    cluster_path = f"./cmake-build-debug/batches/clusters/cluster{i}.txt"
    with open(cluster_path, "a") as f:
        f.write("\n")

## Clustering using Faiss IVF

### Reading the dataset from the pre-downloaded data

In [None]:
# Read the dataset from the files in /batches/dataset in one big numpy array
import numpy as np

# Collect the five batch files as 2-D arrays (normalisation stays disabled here).
embeds = []
for i in range(5):
    batch_path = f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/dataset/{i}-embeds-batch.txt"
    with open(batch_path, "r") as f:
        # one space-separated embedding per line
        lines = [np.fromstring(row, sep=" ") for row in f]
        # lines = [line / np.linalg.norm(line) for line in lines]
        batch = np.array(lines)
    embeds.append(batch)

# Stack the per-batch arrays into a single array
embeds = np.concatenate(embeds, axis=0)

# print the norm of the first 100 embeddings
for i in range(100):
    norm_i = np.linalg.norm(embeds[i])
    print(norm_i)


### Constructing the clusters using Faiss

In [None]:
!pip install faiss-cpu

In [None]:
# use faiss to cluster the embeddings
import faiss

# create an inverted index
# nlist is passed as the third argument of IndexIVFPQ; the cells that dump the
# inverted lists iterate over range(256), matching this value
nlist = 256
# passed as the fourth argument of IndexIVFPQ
# (presumably the number of PQ sub-quantizers — TODO confirm against the FAISS docs)
m = 1
# NOTE(review): k is not used by any of the visible cells
k = 5
# vector dimensionality handed to both the coarse quantizer and the IVFPQ index
d = 768
coarse_quantizer = faiss.IndexFlatL2(d)
# the trailing 8 is the fifth constructor argument
# (presumably bits per PQ code — TODO confirm against the FAISS docs)
index = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, m, 8)


In [None]:
# FAISS operates on float32 vectors, while the embeddings parsed from text are
# float64. Cast once to a contiguous float32 array (as the k-means training
# cell does) and reuse it for both calls.
embeds_f32 = np.ascontiguousarray(embeds, dtype=np.float32)
index.train(embeds_f32)
index.add(embeds_f32)

In [None]:
# Keep a handle on the index's inverted lists so they can be passed to get_invlist().
# if there is a pre-transform, you can also use
# invlists = faiss.extract_index_ivf(index).invlists
invlists = index.invlists

In [None]:
def get_invlist(invlists, l):
    """ returns the inverted lists content as a pair of (list_ids, list_codes).
    The codes are reshaped to a proper size

    Parameters:
        invlists: inverted-lists object; it is downcast with
            faiss.downcast_InvertedLists before use.
        l: number of the inverted list to read.

    Returns:
        (list_ids, list_codes): list_ids is an int64 array with one entry per
        element of the list; list_codes is a uint8 array whose shape depends
        on the storage layout (see the branches below).
    """
    # work on the concrete inverted-lists type
    invlists = faiss.downcast_InvertedLists(invlists)
    # number of entries stored in list l
    ls = invlists.list_size(l)
    list_ids = np.zeros(ls, dtype='int64')
    # initialised to None so the finally block only releases what was acquired
    ids = codes = None
    try:
        ids = invlists.get_ids(l)
        if ls > 0:
            # copy the ids into the numpy buffer allocated above
            faiss.memcpy(faiss.swig_ptr(list_ids), ids, list_ids.nbytes)
        codes = invlists.get_codes(l)
        if invlists.code_size != faiss.InvertedLists.INVALID_CODE_SIZE:
            # regular layout: one row of code_size bytes per entry
            list_codes = np.zeros((ls, invlists.code_size), dtype='uint8')
        else:
            # it's a BlockInvertedLists
            npb = invlists.n_per_block
            bs = invlists.block_size
            # number of blocks needed for ls entries, rounded up
            ls_round = (ls + npb - 1) // npb
            list_codes = np.zeros((ls_round, bs // npb, npb), dtype='uint8')
        if ls > 0:
            # copy the codes into the numpy buffer allocated above
            faiss.memcpy(faiss.swig_ptr(list_codes), codes, list_codes.nbytes)
    finally:
        # hand the ids/codes obtained above back to the inverted lists,
        # even if one of the copies raised
        if ids is not None:
            invlists.release_ids(l, ids)
        if codes is not None:
            invlists.release_codes(l, codes)
    return list_ids, list_codes


In [None]:
# get content of inverted list #255
list_ids, list_codes = get_invlist(invlists, 255)

In [None]:
# display the ids currently held in list_ids
list_ids

In [None]:
import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # ensure numpy array won't truncate

# use the get_invlist function to get the list_ids and save them to the labels files, one file per inverted list, indexed from 0 to 255
for i in range(256):
    list_ids, list_codes = get_invlist(invlists, i)
    # "w": labels{i}.txt is also the name used by the k-means labelling cell,
    # so appending would mix these ids with whatever is already in the file
    with open(f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/labels/labels{i}.txt", "w") as f:
        # space-separated ids, no brackets, no repeated or surrounding whitespace
        f.write(" ".join(map(str, list_ids)))

In [None]:
# use the ids stored in each labels file to look up the matching vectors in embeds and output the clusters to the clusters folder
for i in range(256):
    # read the ids first; read().split() also copes with an empty label file,
    # where next(f2) would raise StopIteration
    with open(f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/labels/labels{i}.txt", "r") as f2:
        labels = [int(x) for x in f2.read().split()]

    # "w" so that re-running the cell does not duplicate the cluster members
    with open(f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/clusters/{i}.txt", "w") as f:
        for j in labels:
            # one embedding per line: no square brackets, values separated by
            # exactly one space; split()/join collapses whitespace runs of any
            # length (including line wraps)
            vector_text = str(embeds[j]).replace('[', '').replace(']', '')
            f.write(" ".join(vector_text.split()))
            f.write("\n")

In [None]:
# Read the clusters from the folder file by file, calculate the medoid of each cluster file and output the medoids to one file in the centroids folder
import numpy as np
from scipy.spatial.distance import cdist

# The output file is opened once; "w" keeps it to exactly one medoid per cluster
# even when the cell is re-run.
with open(f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/centroids/test-centroids.txt", "w") as medoids_file:
    for i in range(256):
        with open(f"D:/Boody/GP/Indexer/RAGn-Roll-Indexer/Data/clusters/{i}.txt", "r") as f:
            lines = [np.fromstring(line, sep=" ") for line in f if line.strip()]
        if not lines:
            raise ValueError(f"cluster file {i}.txt contains no embeddings, cannot compute a medoid")

        # a local name, so the global `embeds` array is not overwritten
        cluster_embeds = np.array(lines)

        # Pairwise L1 distances as an (n, n) matrix; cdist avoids materialising
        # the (n, n, d) difference tensor.
        pairwise_l1 = cdist(cluster_embeds, cluster_embeds, metric="cityblock")
        # The medoid is the point with the smallest TOTAL distance to all points.
        # Taking argmin along an axis of the pairwise matrix instead returns each
        # point's own index (distance 0 to itself), i.e. the whole cluster.
        medoid = cluster_embeds[np.argmin(pairwise_l1.sum(axis=1))]

        medoid_text = str(medoid).replace('[', '').replace(']', '')
        medoids_file.write(" ".join(medoid_text.split()))
        medoids_file.write("\n")