In [2]:
import numpy as np
from algorithms.h2alsh_mips import H2ALSH_MIPS  # Assuming your H2ALSH_MIPS implementation is saved in h2alsh_mips.py

Synthetic test data

In [3]:

# Generate synthetic test data
def generate_test_data(num_atoms, len_signal, seed=42):
    """
    Generate random test data for atoms and a single signal vector.

    Values are drawn from a standard normal distribution using a private
    ``RandomState``, so calling this function does not reseed (or otherwise
    disturb) NumPy's global random generator.

    :param num_atoms: Number of atoms (data points), i.e. rows of ``atoms``
    :param len_signal: Length of each vector, i.e. number of columns
    :param seed: Seed for the private generator; the default of 42 yields the
        same draws as the earlier ``np.random.seed(42)`` implementation
    :return: Tuple ``(atoms, signals)`` with shapes ``(num_atoms, len_signal)``
        and ``(1, len_signal)``
    """
    # A local generator keeps the function reproducible without the hidden
    # side effect of resetting global random state for every later cell.
    rng = np.random.RandomState(seed)
    atoms = rng.randn(num_atoms, len_signal)
    signals = rng.randn(1, len_signal)  # Single signal
    return atoms, signals

# Experiment configuration: problem size and the synthetic dataset
num_atoms, len_signal = 100, 50  # atom count, vector length
atoms, signals = generate_test_data(num_atoms, len_signal)

# Build the H2-ALSH MIPS index over the atoms
h2alsh_config = {
    "delta": 0.1,  # Error probability
    "c0": 2.0,     # Approximation constant c0 (c0-ANN problem)
    "c": 0.9,      # Approximation constant c (c-AMIP problem)
    "N0": 50,      # Threshold for linear scan
}
h2alsh_mips = H2ALSH_MIPS(atoms=atoms, **h2alsh_config)

# Ask the index for the best top_k atoms of every query signal
top_k = 5
candidates, sample_complexity = h2alsh_mips.mip_search_queries(signals, top_k=top_k)

# Report what the approximate search returned
print(f"Top {top_k} candidates for each query (indices):\n{candidates}")
print(f"Sample complexity (total number of operations): {sample_complexity.sum()}")

# Exact reference: rank every atom by its inner product with the single signal
inner_products = atoms.dot(signals[0])
top_k_naive = np.argsort(inner_products)[::-1][:top_k]  # best-first indices
print(f"Top {top_k} candidates using naive approach (indices): {top_k_naive}")

# Fraction of the exact top-k that the first query's candidates recovered
accuracy = np.intersect1d(candidates[0], top_k_naive).size / top_k
print(f"Accuracy compared to naive method: {accuracy * 100:.2f}%")

# Brute-force multiply count divided by the operations H2-ALSH reported
total_naive_computations = signals.shape[0] * num_atoms * len_signal
speedup_ratio = total_naive_computations / sample_complexity.sum()
print(f"Speedup ratio: {speedup_ratio:.2f} times faster than naive computation.")


Top 5 candidates for each query (indices):
[[33 27 22  9  2]]
Sample complexity (total number of operations): 17198
Top 5 candidates using naive approach (indices): [21 33 27 38 45]
Accuracy compared to naive method: 40.00%
Speedup ratio: 0.29 times faster than naive computation.


MovieLens 100K

In [4]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from algorithms.h2alsh_mips import H2ALSH_MIPS  # Assuming H2ALSH_MIPS is implemented

# Load the MovieLens dataset (100k version for simplicity)
# NOTE(review): `reader` is never passed to anything in this cell; the
# built-in dataset below is loaded without it. Confirm whether it can go.
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_builtin('ml-100k')
# Seed the split so the training set, and every number printed below, is the
# same on each Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build a dense user-item matrix; entries with no training rating stay 0
user_item_matrix = np.zeros((trainset.n_users, trainset.n_items))
for uid, iid, rating in trainset.all_ratings():
    user_item_matrix[int(uid), int(iid)] = rating

# Use SVD to reduce dimensionality (seeded: the default solver is randomized)
svd = TruncatedSVD(n_components=50, random_state=42)
atoms = svd.fit_transform(user_item_matrix.T)  # Transpose to get item embeddings

# Create a signal (simulate a user's preference vector)
user_id = 0  # Row index into user_item_matrix
user_ratings = user_item_matrix[user_id]
# Rating-weighted sum of item embeddings (not normalised, so not an average)
signal = np.dot(user_ratings, atoms)

# Initialize H2-ALSH MIPS object
h2alsh_mips = H2ALSH_MIPS(
    atoms=atoms,
    delta=0.1,  # Error probability
    c0=2.0,     # Approximation constant for c0-ANN problem
    c=0.9,      # Approximation constant for c-AMIP problem
    N0=50       # Threshold for linear scan
)

# Run the H2-ALSH algorithm to retrieve top-k candidates
top_k = 10
candidates, sample_complexity = h2alsh_mips.mip_search(signal, top_k=top_k)

# Output results
print(f"Top {top_k} candidates (indices): {candidates}")
print(f"Sample complexity (total number of operations): {sample_complexity}")

# Naive approach for validation: exact inner product against every atom
inner_products = np.dot(atoms, signal)
top_k_naive = np.argsort(inner_products)[-top_k:][::-1]  # Indices of top-k by brute force
print(f"Top {top_k} candidates using naive approach (indices): {top_k_naive}")

# Compare results: fraction of the exact top-k present in the candidates
accuracy = len(np.intersect1d(candidates, top_k_naive)) / top_k
print(f"Accuracy compared to naive method: {accuracy * 100:.2f}%")

# Compute speedup ratio: brute-force multiply count over reported operations
total_naive_computations = atoms.shape[0] * atoms.shape[1]  # Total naive computations
speedup_ratio = total_naive_computations / sample_complexity
print(f"Speedup ratio: {speedup_ratio:.2f} times faster than naive computation.")


Top 10 candidates (indices): [301 286 160 265   1 200  80  98 167  83]
Sample complexity (total number of operations): 45203
Top 10 candidates using naive approach (indices): [301 286 160 265   1 200  80  98 167  83]
Accuracy compared to naive method: 100.00%
Speedup ratio: 1.83 times faster than naive computation.


In [5]:
# Row count of the rating matrix, i.e. the trainset.n_users it was built with
user_item_matrix.shape[0]

943

MovieLens 1M

In [6]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from algorithms.h2alsh_mips import H2ALSH_MIPS  # Assuming H2ALSH_MIPS is implemented

# Load the MovieLens dataset (1M version)
# NOTE(review): `reader` is never passed to anything in this cell; the
# built-in dataset below is loaded without it. Confirm whether it can go.
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_builtin('ml-1m')
# Seed the split so the training set, and every number printed below, is the
# same on each Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build a dense user-item matrix; entries with no training rating stay 0
user_item_matrix = np.zeros((trainset.n_users, trainset.n_items))
for uid, iid, rating in trainset.all_ratings():
    user_item_matrix[int(uid), int(iid)] = rating

# Use SVD to reduce dimensionality (seeded: the default solver is randomized)
svd = TruncatedSVD(n_components=50, random_state=42)
atoms = svd.fit_transform(user_item_matrix.T)  # Transpose to get item embeddings

# Create a signal (simulate a user's preference vector)
user_id = 0  # Row index into user_item_matrix
user_ratings = user_item_matrix[user_id]
# Rating-weighted sum of item embeddings (not normalised, so not an average)
signal = np.dot(user_ratings, atoms)

# Initialize H2-ALSH MIPS object
h2alsh_mips = H2ALSH_MIPS(
    atoms=atoms,
    delta=0.1,  # Error probability
    c0=2.0,     # Approximation constant for c0-ANN problem
    c=0.9,      # Approximation constant for c-AMIP problem
    N0=50       # Threshold for linear scan
)

# Run the H2-ALSH algorithm to retrieve top-k candidates
top_k = 10
candidates, sample_complexity = h2alsh_mips.mip_search(signal, top_k=top_k)

# Output results
print(f"Top {top_k} candidates (indices): {candidates}")
print(f"Sample complexity (total number of operations): {sample_complexity}")

# Naive approach for validation: exact inner product against every atom
inner_products = np.dot(atoms, signal)
top_k_naive = np.argsort(inner_products)[-top_k:][::-1]  # Indices of top-k by brute force
print(f"Top {top_k} candidates using naive approach (indices): {top_k_naive}")

# Compare results: fraction of the exact top-k present in the candidates
accuracy = len(np.intersect1d(candidates, top_k_naive)) / top_k
print(f"Accuracy compared to naive method: {accuracy * 100:.2f}%")

# Compute speedup ratio: brute-force multiply count over reported operations
total_naive_computations = atoms.shape[0] * atoms.shape[1]  # Total naive computations
speedup_ratio = total_naive_computations / sample_complexity
print(f"Speedup ratio: {speedup_ratio:.2f} times faster than naive computation.")


Top 10 candidates (indices): [1136  434  182  435   85  207  118   69   52  303]
Sample complexity (total number of operations): 45203
Top 10 candidates using naive approach (indices): [1136  434  182  435   85  207  118   69   52  303]
Accuracy compared to naive method: 100.00%
Speedup ratio: 4.06 times faster than naive computation.


In [7]:
# Row count of the rating matrix, i.e. the trainset.n_users it was built with
user_item_matrix.shape[0]

6040

Netflix Data

In [10]:
# Read the precomputed factor/bias arrays from disk
movie_factors = np.load("data/netflix/Movie_factors_15_new.npy")
movie_biases = np.load("data/netflix/Movie_biases_15_new.npy")
customer_factors = np.load("data/netflix/Customer_factors_15_new.npy")
customer_biases = np.load("data/netflix/Customer_biases_15_new.npy")
global_mean = np.load("data/netflix/netflix_global_mean.npy")

# The movie factors are the atoms searched by LSH-MIPS
atoms = movie_factors

# Pick one user by 0-based index and pull out that user's factors and bias
user_id = 0
user_factors, user_bias = customer_factors[user_id], customer_biases[user_id]

# The query signal is the raw factor vector; user_bias and global_mean are
# loaded above but deliberately not folded in here
signal = user_factors

# Build the H2-ALSH MIPS index over the atoms
h2alsh_config = {
    "delta": 0.1,  # Error probability
    "c0": 2.0,     # Approximation constant for c0-ANN problem
    "c": 0.9,      # Approximation constant for c-AMIP problem
    "N0": 50,      # Threshold for linear scan
}
h2alsh_mips = H2ALSH_MIPS(atoms=atoms, **h2alsh_config)

# Ask the index for the best top_k atoms for this signal
top_k = 10
candidates, sample_complexity = h2alsh_mips.mip_search(signal, top_k=top_k)

# Report what the approximate search returned
print(f"Top {top_k} candidates (indices): {candidates}")
print(f"Sample complexity (total number of operations): {sample_complexity}")

# Exact reference: rank every atom by its inner product with the signal
inner_products = atoms.dot(signal)
top_k_naive = np.argsort(inner_products)[::-1][:top_k]  # best-first indices
print(f"Top {top_k} candidates using naive approach (indices): {top_k_naive}")

# Fraction of the exact top-k present among the candidates
accuracy = np.intersect1d(candidates, top_k_naive).size / top_k
print(f"Accuracy compared to naive method: {accuracy * 100:.2f}%")

# Brute-force multiply count divided by the operations H2-ALSH reported
atom_count, atom_dim = atoms.shape[:2]
total_naive_computations = atom_count * atom_dim
speedup_ratio = total_naive_computations / sample_complexity
print(f"Speedup ratio: {speedup_ratio:.2f} times faster than naive computation.")


Top 10 candidates (indices): [1274  738   46  559  314 1305  305   24  217 1041]
Sample complexity (total number of operations): 203200
Top 10 candidates using naive approach (indices): [1274  738   46  559  314 1171 1305  305   24  217]
Accuracy compared to naive method: 90.00%
Speedup ratio: 0.66 times faster than naive computation.


Crypto-Pairs

In [11]:
# Load the preprocessed dataset
# Forward slashes resolve on every OS. The previous backslash form only worked
# on Windows and relied on "\c" being an invalid (hence literal) escape.
dataset_path = "data/crypto_pairs/crypto_pairs_1m_dimensions.npy"  # Path to the saved .npy file
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code. Only load this file from a trusted source.
crypto_data = np.load(dataset_path, allow_pickle=True)

# Step 1: Use Truncated SVD to reduce dimensionality (if necessary)
# Seeded because the default solver is randomized; keeps re-runs identical.
svd = TruncatedSVD(n_components=50, random_state=42)  # Reduce to 50 dimensions
atoms = svd.fit_transform(crypto_data)  # The reduced dataset

# Step 2: Create a query signal
# Example: Use the first crypto pair as the query vector
query_index = 0
signal = atoms[query_index]  # A single crypto pair's embedding

# Initialize H2-ALSH MIPS object
h2alsh_mips = H2ALSH_MIPS(
    atoms=atoms,
    delta=0.1,  # Error probability
    c0=2.0,     # Approximation constant for c0-ANN problem
    c=0.9,      # Approximation constant for c-AMIP problem
    N0=50       # Threshold for linear scan
)

# Run the H2-ALSH algorithm to retrieve top-k candidates
top_k = 10
candidates, sample_complexity = h2alsh_mips.mip_search(signal, top_k=top_k)

# Output results
print(f"Top {top_k} candidates (indices): {candidates}")
print(f"Sample complexity (total number of operations): {sample_complexity}")

# Naive approach for validation: exact inner product against every atom
inner_products = np.dot(atoms, signal)
top_k_naive = np.argsort(inner_products)[-top_k:][::-1]  # Indices of top-k by brute force
print(f"Top {top_k} candidates using naive approach (indices): {top_k_naive}")

# Compare results: fraction of the exact top-k present in the candidates
accuracy = len(np.intersect1d(candidates, top_k_naive)) / top_k
print(f"Accuracy compared to naive method: {accuracy * 100:.2f}%")

# Compute speedup ratio: brute-force multiply count over reported operations
total_naive_computations = atoms.shape[0] * atoms.shape[1]  # Total naive computations
speedup_ratio = total_naive_computations / sample_complexity
print(f"Speedup ratio: {speedup_ratio:.2f} times faster than naive computation.")


Top 10 candidates (indices): [ 58  97  35 104  49  77  67  33  19 101]
Sample complexity (total number of operations): 18034
Top 10 candidates using naive approach (indices): [ 13  28  51   2  58  97  35 104  57  49]
Accuracy compared to naive method: 50.00%
Speedup ratio: 0.29 times faster than naive computation.
