In [1]:
import sys
# Extend the import search path with a folder on the mounted Google Drive.
# NOTE(review): presumably this folder holds the `src` package imported by later cells,
# and Drive must already be mounted at /content/drive — confirm before running elsewhere.
sys.path.append("/content/drive/MyDrive/annime")

In [2]:
pip install annoy nmslib

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nmslib
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11<2.6.2 (from nmslib)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
Building wheels for collected packages: annoy, nmslib
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552450 sha256=05e037b852fa9ba97ef4a6f628ae501858e0495378737e04ab50fab1943c6063
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
  Building wheel for nmslib (setup

## Annoy example with interface

In [7]:
from src.annoy_int import AnnoyANN
import numpy as np
import logging


# Function to create and build a new Annoy index
def create_and_build_annoy_index(data_points_ind, dim=100, metric='angular', num_trees=20):
    """Return an ``AnnoyANN`` wrapper that has been fed ``data_points_ind`` and built.

    Args:
        data_points_ind: Vectors to index; passed unchanged to both ``add_items``
            and ``build_index`` of the wrapper.
        dim: Dimensionality forwarded to the ``AnnoyANN`` constructor.
        metric: Metric name forwarded to the ``AnnoyANN`` constructor.
        num_trees: Tree count, forwarded to the constructor and again to ``build_index``.

    Returns:
        The ``AnnoyANN`` instance after ``add_items`` and ``build_index`` were called.
    """
    wrapper = AnnoyANN(num_trees=num_trees, metric=metric, dim=dim)
    wrapper.add_items(data_points_ind)
    # NOTE(review): the same vectors go to add_items and build_index; whether the
    # wrapper needs both is not visible from this notebook.
    wrapper.build_index(data_points_ind, num_trees=num_trees)
    return wrapper


# Build the Annoy interface over seeded random data (helper defaults: angular metric, 20 trees).
# Logging is not configured here; that happens further down in this cell.
np.random.seed(42)  # fixed seed so the random data below is reproducible
data_points = np.random.rand(2000, 100)  # 2000 points in 100 dimensions
annoy_ann = create_and_build_annoy_index(data_points)

# Perform a batch of complex queries
query_points = np.random.rand(50, 100)  # 50 new random points
# For 100-dimensional vectors drawn uniformly from [0, 1) the norm is around 5.8, so this
# predicate effectively accepts every vector; it only demonstrates the callback mechanism.
constraints = lambda x: np.linalg.norm(x) > 0.5  # Constraint: norm should be greater than 0.5
# NOTE(review): what `query_with_constraints` passes to the predicate and what it returns is
# defined by the project interface, not visible here — the recorded output shows a pair of
# (ids, distances) lists per query; confirm.
batch_results_with_constraints = [
    annoy_ann.query_with_constraints(point, constraints, k=10) for point in query_points
]
# Prints the full result for all 50 queries (large output).
print("Batch Query Results with Constraints:", batch_results_with_constraints)

# Simulate item updating by recreating the index
new_vector = np.random.rand(100)
data_points[10] = new_vector  # Update the data point in the array
annoy_ann = create_and_build_annoy_index(data_points)  # Create and build a new index
updated_results = annoy_ann.query(new_vector, k=10)
print("Results after simulated update:", updated_results)

# Benchmark the performance of querying
# The return format of benchmark_performance is defined by the interface (not shown here).
benchmark_results = annoy_ann.benchmark_performance(query_points, k=10, rounds=5)
print("Performance Benchmark:", benchmark_results)

# Enable detailed logging and perform some operations.
# force=True replaces any handlers already attached to the root logger. Without it,
# basicConfig silently does nothing when logging was configured earlier in the kernel
# (or by the hosting environment), and the DEBUG level requested here never takes effect.
logging.basicConfig(level=logging.DEBUG, force=True)
logger = logging.getLogger()
annoy_ann.enable_logging('DEBUG')
annoy_ann.optimize_index()  # Log an operation

# Save and load the index for demonstration of persistence
annoy_ann.save_index('final_annoy_index.ann')
annoy_ann.load_index('final_annoy_index.ann')

logger.info("Finished processing using Annoy interface")


Batch Query Results with Constraints: [[[395, 925, 1085, 384, 1270, 531, 1319, 1783, 859, 693, 1917, 1564, 993, 915, 1629, 428, 591, 1638, 951, 540, 250, 327, 1518, 172, 387, 1748, 845, 225, 1774, 195, 425, 1044, 1797, 786, 1531, 1890, 318, 1493, 1408, 58, 510, 1694, 1746, 536, 1720, 178, 1932, 1256, 601, 1093, 1868, 985, 1024, 161, 963, 1450, 78, 692, 1195, 680, 1102, 655, 804, 1354, 558, 722, 1553, 854, 1400, 111, 49, 362, 986, 141, 430, 11, 200, 1224, 1904, 1691, 520, 1786, 645, 1170, 1839, 1391, 1778, 348, 1356, 1119, 1042, 345, 1076, 508, 1919, 1979, 1810, 1630, 303, 1953], [0.5445383191108704, 0.5614657402038574, 0.5689914226531982, 0.5746710300445557, 0.5761875510215759, 0.58034747838974, 0.5815754532814026, 0.5820507407188416, 0.5841387510299683, 0.5847211480140686, 0.5848752856254578, 0.5878902077674866, 0.5894822478294373, 0.5906403660774231, 0.5914306044578552, 0.5933462381362915, 0.5936980247497559, 0.5937561392784119, 0.5938833951950073, 0.5947101712226868, 0.5954408645629

## Annoy example without interface

In [8]:
from annoy import AnnoyIndex
import numpy as np
import logging
import time

# Setup logging.
# force=True makes this call win even if the root logger was already configured by an
# earlier cell (which set DEBUG); a plain basicConfig would be ignored in that case and
# the INFO level asked for here would not apply.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger()


def create_and_build_annoy_index(data_points_ind, dim=100, metric='angular', num_trees=20):
    """Build an ``AnnoyIndex`` holding every vector of ``data_points_ind``.

    Args:
        data_points_ind: Iterable of vectors; each is stored under its position
            in the iteration (0, 1, 2, ...).
        dim: Vector dimensionality given to ``AnnoyIndex``.
        metric: Metric name given to ``AnnoyIndex``.
        num_trees: Number of trees passed to ``build``.

    Returns:
        The built ``AnnoyIndex``.
    """
    forest = AnnoyIndex(dim, metric)
    for item_id, item_vector in enumerate(data_points_ind):
        forest.add_item(item_id, item_vector)
    forest.build(num_trees)
    return forest


# Initialize data
np.random.seed(42)  # fixed seed so the random data below is reproducible
data_points = np.random.rand(2000, 100)  # 2000 points in 100 dimensions

# Create and build Annoy index
annoy_index = create_and_build_annoy_index(data_points)

# Perform a batch of complex queries with constraints
query_points = np.random.rand(50, 100)  # 50 new random points
# For 100-dimensional vectors drawn uniformly from [0, 1) the norm is around 5.8, so this
# predicate effectively accepts every vector; it only demonstrates the filtering step.
constraints = lambda x: np.linalg.norm(x) > 0.5  # Constraint: norm should be greater than 0.5
batch_results_with_constraints = []
for point in query_points:
    # Distances are requested but only nns[0] (the ids) is used below.
    nns = annoy_index.get_nns_by_vector(point, 10, include_distances=True)
    # The predicate is applied to the stored vectors AFTER the 10 neighbours were fetched,
    # so a stricter predicate would yield fewer than 10 ids per query.
    constrained_results = [idx for idx in nns[0] if constraints(data_points[idx])]
    batch_results_with_constraints.append(constrained_results)
# Prints the id lists for all 50 queries (large output).
print("Batch Query Results with Constraints:", batch_results_with_constraints)

# Simulate item updating by recreating the index
new_vector = np.random.rand(100)
data_points[10] = new_vector  # Update the data point in the array
annoy_index = create_and_build_annoy_index(data_points)  # Create and build a new index
updated_results = annoy_index.get_nns_by_vector(new_vector, 10, include_distances=True)
# Only the ids (first element) are printed; the distances are discarded.
print("Results after simulated update:", updated_results[0])

# Benchmark the performance of querying.
# perf_counter() is a monotonic, high-resolution clock meant for timing short intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
rounds = 5
start_time = time.perf_counter()
for _ in range(rounds):
    for query in query_points:
        annoy_index.get_nns_by_vector(query, 10)
end_time = time.perf_counter()
print("Performance Benchmark:", (end_time - start_time) / (len(query_points) * rounds), "seconds per query")

# Save the index, then load the file into a FRESH AnnoyIndex to demonstrate persistence.
# Annoy's save() already reloads the written file into the same object, so calling load()
# on that object again shows nothing new. The dimension and metric must match the ones the
# index was built with (the helper's default metric is 'angular').
annoy_index.save('final_annoy_index.ann')
reloaded_index = AnnoyIndex(data_points.shape[1], 'angular')
reloaded_index.load('final_annoy_index.ann')
# Sanity check: the file round-trip preserved every item.
assert reloaded_index.get_n_items() == len(data_points)
annoy_index = reloaded_index

logger.info("All operations completed successfully.")


Batch Query Results with Constraints: [[395, 384, 531, 1917, 951, 1518, 172, 387, 195, 1044], [1716, 83, 925, 731, 1513, 1496, 349, 799, 220, 760], [1729, 1870, 1496, 1735, 1738, 900, 1051, 1376, 1594, 161], [671, 1296, 1634, 900, 105, 166, 626, 384, 522, 580], [1447, 1110, 1122, 1998, 275, 1049, 1034, 709, 1811, 1263], [1062, 1592, 1645, 1304, 44, 765, 1756, 1870, 339, 827], [1966, 1590, 1174, 515, 1128, 1675, 828, 382, 846, 633], [591, 674, 1865, 1474, 1081, 766, 592, 1250, 412, 692], [1111, 550, 750, 451, 422, 1996, 1880, 854, 631, 505], [1183, 1533, 1327, 271, 1982, 1876, 893, 2, 685, 1494], [207, 1462, 1961, 1335, 1385, 1723, 1324, 403, 1439, 91], [833, 789, 1769, 1004, 704, 1977, 186, 731, 1413, 864], [1288, 1841, 765, 63, 1594, 189, 1246, 1322, 553, 1284], [1361, 1452, 103, 741, 696, 1536, 1084, 1354, 1633, 161], [1551, 845, 349, 1833, 1258, 1756, 1564, 1069, 33, 1058], [1462, 847, 1634, 403, 1651, 1801, 1794, 1288, 275, 1919], [1519, 567, 1536, 1327, 1594, 451, 1209, 760, 637, 

## Nmslib example with interface

In [11]:
from src.nmslib_int import NmslibANN
import numpy as np
import logging
import nmslib

# Setup logging.
# force=True makes this call win even if the root logger was already configured by an
# earlier cell; a plain basicConfig would be ignored in that case and the DEBUG level
# asked for here would not apply.
logging.basicConfig(level=logging.DEBUG, force=True)
logger = logging.getLogger("NmslibComplexExample")


def create_and_configure_nmslib_ann(space='cosinesimil', method='hnsw', dtype=nmslib.DistType.FLOAT):
    """Instantiate the project's ``NmslibANN`` wrapper and log the chosen settings.

    Args:
        space: Similarity space name forwarded to ``NmslibANN``.
        method: Index method name forwarded to ``NmslibANN``.
        dtype: NMSLIB distance type forwarded to ``NmslibANN``.

    Returns:
        The freshly constructed ``NmslibANN`` (no data added, nothing built yet).
    """
    wrapper = NmslibANN(dtype=dtype, method=method, space=space)
    logger.info(f"Initialized NMSLIB ANN with space {space}, method {method}, and data type {dtype}.")
    return wrapper


# Generate random high-dimensional data points
np.random.seed(42)  # fixed seed so the random data below is reproducible
data_points = np.random.rand(1000, 40).astype(np.float32)  # 1000 points in 40 dimensions
query_points = np.random.rand(10, 40).astype(np.float32)  # 10 query points

# Initialize NMSLIB interface
nmslib_ann = create_and_configure_nmslib_ann()

# Add data to the index
nmslib_ann.add_items(data_points)
logger.info("Data points added to the index.")

# Build the index with specific parameters
# NOTE(review): the same array was already passed to add_items above; whether build_index
# adds it a second time depends on the project interface — confirm.
nmslib_ann.build_index(data_points, index_params={'M': 30, 'post': 0, 'efConstruction': 100})
logger.info("Index built with custom parameters.")

# Perform a batch of complex queries
# NOTE(review): the recorded output shows a single [id, distance] pair per query even though
# k=5 is requested — confirm what batch_query is supposed to return.
complex_results = nmslib_ann.batch_query(query_points, k=5)
logger.info("Performed batch querying.")

# Simulate an update by adding new data points and rebuilding the index
new_data_points = np.random.rand(100, 40).astype(np.float32)  # 100 new points
nmslib_ann.add_items(new_data_points)
# This rebuild uses no index_params, unlike the first build above.
# NOTE(review): presumably the interface falls back to its own defaults — verify.
nmslib_ann.build_index(np.vstack([data_points, new_data_points]))
logger.info("Index updated with new data points and rebuilt.")

# Benchmark the performance of querying
performance = nmslib_ann.benchmark_performance(query_points, k=5, rounds=3)
# benchmark_performance returns an already formatted summary (the recorded output reads
# "Average query time: 0.00002 seconds"), so log it as-is; appending another
# "seconds per query on average" produced a garbled message.
logger.info(f"Performance benchmark completed: {performance}")

# Save and load the index
nmslib_ann.save_index('nmslib_index.bin')
nmslib_ann.load_index('nmslib_index.bin')
logger.info("Index saved to 'nmslib_index.bin' and reloaded.")

# Print some complex query results (from the batch query run before the rebuild)
print("Sample Complex Query Results:", complex_results[:2])
print("Performance Benchmark:", performance)

# Finalize logging
logger.info("All operations completed successfully.")


Sample Complex Query Results: [[868, 0.11269915], [6, 0.08588064]]
Performance Benchmark: Average query time: 0.00002 seconds


## Nmslib example without interface

In [12]:
import nmslib
import numpy as np
import logging

# Setup logging.
# force=True makes this call win even if the root logger was already configured by an
# earlier cell; a plain basicConfig would be ignored in that case.
logging.basicConfig(level=logging.DEBUG, force=True)
logger = logging.getLogger("NmslibDirectExample")


# Initialize and build an NMSLIB index
def create_and_build_nmslib_index(space='cosinesimil', method='hnsw', dtype=nmslib.DistType.FLOAT, data_points_ind=None,
                                  index_params=None):
    """Create an NMSLIB index and, when data is supplied, populate and build it.

    Args:
        space: Similarity space name passed to ``nmslib.init``.
        method: Index method name passed to ``nmslib.init``.
        dtype: NMSLIB distance type passed to ``nmslib.init``.
        data_points_ind: Optional iterable of vectors. Each vector is added under its
            position in the iteration (0, 1, 2, ...). When ``None`` the index is
            returned empty and unbuilt.
        index_params: Optional dict of construction parameters for ``createIndex``.
            Defaults to ``{'M': 30, 'post': 0, 'efConstruction': 100}``, the values
            this helper previously hard-coded.

    Returns:
        The NMSLIB index object (built only if ``data_points_ind`` was given).
    """
    if index_params is None:
        index_params = {'M': 30, 'post': 0, 'efConstruction': 100}

    ind = nmslib.init(method=method, space=space, dtype=dtype)

    if data_points_ind is None:
        # Nothing was added or built, so do not claim otherwise in the log.
        logger.info(f"Initialized empty NMSLIB index with space {space}, method {method}, and data type {dtype}.")
        return ind

    for i, vector in enumerate(data_points_ind):
        ind.addDataPoint(i, vector)
    ind.createIndex(index_params, print_progress=False)
    logger.info(f"Initialized and built NMSLIB index with space {space}, method {method}, and data type {dtype}.")
    return ind


# Generate random high-dimensional data points
np.random.seed(42)  # fixed seed so the random data below is reproducible
data_points = np.random.rand(1000, 40).astype(np.float32)  # 1000 points in 40 dimensions
query_points = np.random.rand(10, 40).astype(np.float32)  # 10 query points

# Initialize and build the NMSLIB index
index = create_and_build_nmslib_index(data_points_ind=data_points)

# Perform a batch of complex queries
complex_results = []
for point in query_points:
    # knnQuery yields (ids, distances); only the ids are kept, the distances are unused.
    ids, distances = index.knnQuery(point, k=5)
    complex_results.append(ids)
logger.info("Performed batch querying.")

# Simulate an update by creating a new index with additional data points
new_data_points = np.random.rand(100, 40).astype(np.float32)  # 100 new points
all_data_points = np.vstack([data_points, new_data_points])
# A brand-new index is built over old + new points; complex_results above still refers
# to the index built from the original 1000 points.
index = create_and_build_nmslib_index(data_points_ind=all_data_points)
logger.info("Index rebuilt with additional data points.")

# Benchmark the performance of querying
import time

# perf_counter() is a monotonic, high-resolution clock meant for timing short intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
rounds = 3
start_time = time.perf_counter()
for _ in range(rounds):
    for query in query_points:
        index.knnQuery(query, k=5)
end_time = time.perf_counter()
# Average wall time of a single knnQuery call, in seconds.
performance = (end_time - start_time) / (len(query_points) * rounds)
logger.info(f"Performance benchmark completed: {performance} seconds per query on average.")

# Save and load the index (data points are stored in / restored from the file as well)
index.saveIndex('nmslib_index.bin', save_data=True)
index.loadIndex('nmslib_index.bin', load_data=True)
logger.info("Index saved to 'nmslib_index.bin' and reloaded.")

# Print some of the complex query results (from the queries run before the rebuild)
print("Sample Complex Query Results:", complex_results[:2])
print("Performance Benchmark:", performance)

# Finalize logging
logger.info("All operations completed successfully.")


Sample Complex Query Results: [array([868, 466, 782, 936, 920], dtype=int32), array([  6,  73, 950, 989, 208], dtype=int32)]
Performance Benchmark: 2.6154518127441407e-05
