<a href="https://colab.research.google.com/github/Dlogical23/capstone/blob/main/automatemodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import tarfile

# Path to your tar.gz file
file_path = '/content/drive/MyDrive/SIFT10M/SIFT10M.tar.gz'

# Path to the directory where you want to extract the files
extract_path = '/content/drive/MyDrive/SIFT10M'

# Open the tar.gz file
with tarfile.open(file_path, 'r:gz') as tar_ref:
    # Extract all files to the specified directory
    tar_ref.extractall(extract_path)

In [2]:
!pip install scipy



In [7]:
import os

# Define the path to the tar.gz file
tar_path = "/content/drive/MyDrive/SIFT10M/SIFT10M.tar.gz"

# Define the extraction path
extract_path = "/content/drive/MyDrive/SIFT10M/extracted/"

# Ensure the extraction path exists
os.makedirs(extract_path, exist_ok=True)

# Extract the tar.gz file
try:
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(path=extract_path)
        print(f"Extracted files to: {extract_path}")
except Exception as e:
    print(f"An error occurred while extracting: {e}")

# List the extracted files
extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)

# Read the first 5 lines from one of the extracted text files (assuming a .txt or .csv file exists)
for file in extracted_files:
    file_path = os.path.join(extract_path, file)

    # Check if it's a text file before reading
    if file.endswith(".txt") or file.endswith(".csv"):
        try:
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                print("\nFirst 5 lines of", file_path)
                for _ in range(5):
                    print(f.readline().strip())
            break  # Stop after reading the first valid file
        except Exception as e:
            print(f"Could not read {file}: {e}")

Extracted files to: /content/drive/MyDrive/SIFT10M/extracted/
Extracted files: ['SIFT10M']


Explanation of the Code
Loading the Dataset:

The .mat file is loaded using h5py, and the feature vectors are extracted.

Similarity Search:

The NearestNeighbors algorithm from scikit-learn is used to find the nearest neighbors for each feature vector.

Validation:

Simulated ground truth labels are used to validate the retrieved neighbors.

A distance threshold is applied to filter out incorrect matches.

In [3]:
import h5py

# Load the .mat file using h5py
file_path = '/content/drive/MyDrive/SIFT10M/extracted/SIFT10M/SIFT10Mfeatures.mat'
with h5py.File(file_path, 'r') as file:
    # Access the correct dataset. 'fea' is a common dataset name in .mat files.
    # If 'fea' doesn't exist, inspect the file's contents to find the correct name
    features = file['fea'][:]

# View the first 5 rows
print(features[:5])

[[ 55  23  21  15  43 100 116  63 101  17   5   1   1  12  60 117  33  80
   19   3   3  15  53  48  46  46   3   0   2   8  35  57  47  39  51  19
   14  19  70  54  73  29   9   2   3  60 117  75 117  48   8   3  14  31
  117 117  21  10   1   2  32  74 117  69  21   2   2  10  72  36  78  52
   72  36  55   8   7  26  57  68 117  61  33   4  24   8   7  26  19   4
    0   2  59  32   8   5   5   0   0   1  27   2   0   7   8   2   1   0
    0   0   0  10   0   1   1   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   2  10  41  22   5   0   0   0  50 123  82  64
   28   0   0   8  53  73  95  77  40   1   0   2   7  16   1   1   2   0
    0   0   0   2 115  51  28   0   0   0   1  37 130 119  41   0   0   0
    0  40  64  85  63  28   2   0   7  23   1   3   3   0   0   0   0   3
   54  65  65   0   0   0   2  57 130  57  11   0   0   0   9 128  98  24
   13  13   1   0  22  79   4   0   0   0   0   0   0   4  73   1   0   0
    0   1   7  82 108   2  

In [8]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [5]:
import faiss
import numpy as np
import h5py

# Step 1: Load the SIFT10M dataset using h5py
file_path = '/content/drive/MyDrive/SIFT10M/extracted/SIFT10M/SIFT10Mfeatures.mat'
with h5py.File(file_path, 'r') as f:
    # Change 'SIFT10Mfeatures' to 'fea'
    features = f['fea'][()]  # 'fea' is the correct key for the features dataset
    print(f"Dataset shape: {features.shape}")

# Step 2: Normalize the feature vectors (required for faiss)
features = features.astype('float32')  # Faiss requires float32
faiss.normalize_L2(features)  # Normalize vectors to unit length

# Step 3: Build the FAISS index
dimension = features.shape[1]  # Dimension of feature vectors (e.g., 128 for SIFT)
nlist = 100  # Number of clusters (adjust based on your dataset)
quantizer = faiss.IndexFlatL2(dimension)  # Quantizer for clustering
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

# Train the index on a subset of the data
index.train(features[:100000])  # Use a subset for training

# Add all vectors to the index
index.add(features)

# Step 4: Perform ANN search
n_neighbors = 5  # Number of neighbors to retrieve
k = n_neighbors
distances, indices = index.search(features[:10], k)  # Search for the first 10 queries

# Display the results
print(f"Indices of nearest neighbors (first 10 queries):\n{indices}")
print(f"Distances to nearest neighbors (first 10 queries):\n{distances}")

Dataset shape: (11164866, 128)
Indices of nearest neighbors (first 10 queries):
[[       0 10666922 10998619  4671894  5452253]
 [       1  3326570  3794060  3934596  9583039]
 [       2  3190175  6666050  3191122   536917]
 [       3 11139920  5040907  6989752   943646]
 [       4 10067049   404232  1326223  9973223]
 [       5  5882074  7785920  4220897  4809564]
 [       6  3957172  3758667  4680228  2096124]
 [       7  9847968  4458487  3486086  3489818]
 [       8  3741451  4623691  1119612  3311879]
 [       9  5170464  8365302  3576684  5508015]]
Distances to nearest neighbors (first 10 queries):
[[0.         0.3163097  0.34529692 0.34696847 0.35441175]
 [0.         0.12889135 0.16931581 0.17161056 0.17416853]
 [0.         0.20816678 0.21986952 0.22561806 0.22668757]
 [0.         0.2300973  0.24736458 0.25816062 0.26168203]
 [0.         0.2472966  0.2522156  0.25333616 0.26297927]
 [0.         0.2606849  0.26170495 0.26498383 0.26905888]
 [0.         0.17823675 0.1866689  0.204

In [10]:
!pip install faiss-cpu

import faiss
import h5py
import numpy as np

# Step 1: Load the SIFT10M dataset using h5py
file_path = '/content/drive/MyDrive/SIFT10M/extracted/SIFT10M/SIFT10Mfeatures.mat'
with h5py.File(file_path, 'r') as f:
    # Change 'SIFT10Mfeatures' to 'fea'
    features = f['fea'][()]  # 'fea' is the correct key for the features dataset
    print(f"Dataset shape: {features.shape}")

# Step 2: Normalize the feature vectors (required for faiss)
features = features.astype('float32')  # Faiss requires float32
faiss.normalize_L2(features)  # Normalize vectors to unit length

# Step 3: Build the FAISS index
dimension = features.shape[1]  # Dimension of feature vectors (e.g., 128 for SIFT)
nlist = 100  # Number of clusters (adjust based on your dataset)
quantizer = faiss.IndexFlatL2(dimension)  # Quantizer for clustering
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

# Train the index on a subset of the data
index.train(features[:100000])  # Use a subset for training

# Add all vectors to the index
index.add(features)

# Step 4: Perform ANN search
n_neighbors = 5  # Number of neighbors to retrieve
k = n_neighbors
distances, indices = index.search(features[:10], k)  # Search for the first 10 queries

# Display the results
print(f"Indices of nearest neighbors (first 10 queries):\n{indices}")
print(f"Distances to nearest neighbors (first 10 queries):\n{distances}")

# Simulate ground truth labels (replace with actual labels if available)
np.random.seed(42)  # For reproducibility
labels = np.random.randint(0, 10, size=features.shape[0])  # 10 classes for demonstration

# Set a distance threshold for validation
distance_threshold = 0.5  # Adjust based on your dataset

# Validate nearest neighbors for the first 5 queries
for i in range(5):  # Check the first 5 queries
    print(f"\nQuery {i}:")
    print(f"  Indices of neighbors: {indices[i]}")
    print(f"  Distances to neighbors: {distances[i]}")

    # Validate using ground truth labels
    query_label = labels[i]
    neighbor_labels = labels[indices[i]]
    print(f"  Query label: {query_label}, Neighbor labels: {neighbor_labels}")

    if all(neighbor_labels == query_label):
        print("  Validation: All neighbors are correct!")
    else:
        print("  Validation: Some neighbors are incorrect.")

    # Validate using distance threshold
    if all(d < distance_threshold for d in distances[i]):
        print("  Validation: All neighbors are within the threshold!")
    else:
        print("  Validation: Some neighbors are too far away.")

Dataset shape: (11164866, 128)
Indices of nearest neighbors (first 10 queries):
[[       0 10666922 10998619  4671894  5452253]
 [       1  3326570  3794060  3934596  9583039]
 [       2  3190175  6666050  3191122   536917]
 [       3 11139920  5040907  6989752   943646]
 [       4 10067049   404232  1326223  9973223]
 [       5  5882074  7785920  4220897  4809564]
 [       6  3957172  3758667  4680228  2096124]
 [       7  9847968  4458487  3486086  3489818]
 [       8  3741451  4623691  1119612  3311879]
 [       9  5170464  8365302  3576684  5508015]]
Distances to nearest neighbors (first 10 queries):
[[0.         0.3163097  0.34529692 0.34696847 0.35441175]
 [0.         0.12889135 0.16931581 0.17161056 0.17416853]
 [0.         0.20816678 0.21986952 0.22561806 0.22668757]
 [0.         0.2300973  0.24736458 0.25816062 0.26168203]
 [0.         0.2472966  0.2522156  0.25333616 0.26297927]
 [0.         0.2606849  0.26170495 0.26498383 0.26905888]
 [0.         0.17823675 0.1866689  0.204

In [7]:
from sklearn.neighbors import NearestNeighbors  # Import NearestNeighbors

# Use a subset of the data (e.g., first 100,000 vectors)
subset_size = 100000
subset_features = features[:subset_size]

# Perform nearest neighbor search on the subset
nbrs = NearestNeighbors(n_neighbors=5, algorithm="auto").fit(subset_features)
distances, indices = nbrs.kneighbors(subset_features[:10])  # Search for the first 10 queries

# Display the results
print(f"Indices of nearest neighbors (first 10 queries):\n{indices}")
print(f"Distances to nearest neighbors (first 10 queries):\n{distances}")

Indices of nearest neighbors (first 10 queries):
[[    0 47983 58506 58507 43854]
 [    1 91619 30906 51883 47830]
 [    2 82912 39012 15792 88679]
 [    3 91607 89604 30291 12948]
 [    4 21594 47986 76190 96112]
 [    5   128  5829 96111 85587]
 [    6 32831 28254 29113  7276]
 [    7  1793 44513 56598 23540]
 [    8 56954  8353 38009 69447]
 [    9 27411 34409 20117  2071]]
Distances to nearest neighbors (first 10 queries):
[[4.21468478e-08 6.35944963e-01 6.59065008e-01 6.59065008e-01
  6.70171440e-01]
 [0.00000000e+00 4.31596667e-01 4.71363902e-01 4.74981606e-01
  4.93423849e-01]
 [0.00000000e+00 5.40128052e-01 5.71346819e-01 5.90725958e-01
  5.95478952e-01]
 [0.00000000e+00 5.72786748e-01 5.77502847e-01 5.83306789e-01
  5.84833920e-01]
 [2.98023224e-08 5.47321737e-01 5.58476329e-01 5.75547397e-01
  5.77083886e-01]
 [0.00000000e+00 5.52836061e-01 5.55233181e-01 5.65088391e-01
  5.75627685e-01]
 [0.00000000e+00 5.05108297e-01 5.13195217e-01 5.26694655e-01
  5.37020683e-01]
 [0.00000

Next Steps
Replace Simulated Labels:

If you have actual labels for the SIFT10M dataset, replace the simulated labels with them.

Adjust the Distance Threshold:

Tune the threshold based on your dataset and requirements.

Scale the System:

Apply the validation step to the entire dataset or integrate it into a production pipeline.

replacing labels

In [11]:
!pip install mat

Collecting mat
  Downloading mat-1.0.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mysql-connector-python>=8.0.20 (from mat)
  Downloading mysql_connector_python-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.0 kB)
Collecting tabulate>=0.8.7 (from mat)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading mysql_connector_python-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl (34.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.0/34.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Building wheels for collected packages: mat
  Building wheel for mat (setup.py) ... [?25l[?25hdone
  Created wheel for mat: filename=mat-1.0.2-py3-none-any.whl size=11672 sha256=efd24ca46fdd6209fb15dc6e4f97b2ec339abd9bca337490510726342b143225
  Stored in directory: /root/.cache/pip/wheels/52/d5/74/536e139e8ca764ab351663048fb32cd0486411c6c41dd0c349
Successfully built mat

In [12]:
# Load ground truth labels
with h5py.File('/content/drive/MyDrive/SIFT10M/extracted/SIFT10M/SIFT10Mfeatures.mat', 'r') as f:
    # The key for labels is likely 'gnd', not 'SIFT10Mlabels'
    # Inspect the file using `h5dump -n /content/drive/MyDrive/SIFT10M/extracted/SIFT10M/SIFT10Mfeatures.mat`
    # to confirm the correct key.
    labels = f['fea'][()]
    print(f"Labels shape: {labels.shape}")

# Define the ground_truth_path variable
ground_truth_path = '/content/drive/MyDrive/SIFT10M/extracted/SIFT10M/SIFT10Mfeatures.mat'  # Replace with the actual path

# Load the .mat file
with h5py.File(ground_truth_path, 'r') as f:
    # Extract the ground truth data using the key 'fea'
    ground_truth_data = f['fea'][()]
    print(f"Ground truth data shape: {ground_truth_data.shape}")
    print("Ground truth data:")
    print(ground_truth_data)

Labels shape: (11164866, 128)
Ground truth data shape: (11164866, 128)
Ground truth data:
[[ 55  23  21 ...   0   0   0]
 [  0   0   0 ...   6  23  89]
 [  0   0   0 ... 125   8   3]
 ...
 [ 70   2   0 ...  35   7   7]
 [  0   0   0 ...  86  82 102]
 [  2   0   0 ...   9   2   3]]


adjusting distance

In [10]:
!pip install faiss-cpu

import faiss
import numpy as np
import h5py

# Step 1: Load the SIFT10M dataset using h5py
file_path = '/content/drive/MyDrive/SIFT10M/extracted/SIFT10M/SIFT10Mfeatures.mat'
with h5py.File(file_path, 'r') as f:
    # Change 'SIFT10Mfeatures' to 'fea'
    features = f['fea'][()]  # 'fea' is the correct key for the features dataset
    print(f"Dataset shape: {features.shape}")

# Step 2: Normalize the feature vectors (required for faiss)
features = features.astype('float32')  # Faiss requires float32
faiss.normalize_L2(features)  # Normalize vectors to unit length

# Step 3: Build the FAISS index
dimension = features.shape[1]  # Dimension of feature vectors (e.g., 128 for SIFT)
nlist = 100  # Number of clusters (adjust based on your dataset)
quantizer = faiss.IndexFlatL2(dimension)  # Quantizer for clustering
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

# Train the index on a subset of the data
index.train(features[:100000])  # Use a subset for training

# Add all vectors to the index
index.add(features)

# Step 4: Perform ANN search
n_neighbors = 5  # Number of neighbors to retrieve
k = n_neighbors
distances, indices = index.search(features[:10], k)  # Search for the first 10 queries

# Display the results
print(f"Indices of nearest neighbors (first 10 queries):\n{indices}")
print(f"Distances to nearest neighbors (first 10 queries):\n{distances}")

Dataset shape: (11164866, 128)
Indices of nearest neighbors (first 10 queries):
[[       0 10666922 10998619  4671894  5452253]
 [       1  3326570  3794060  3934596  9583039]
 [       2  3190175  6666050  3191122   536917]
 [       3 11139920  5040907  6989752   943646]
 [       4 10067049   404232  1326223  9973223]
 [       5  5882074  7785920  4220897  4809564]
 [       6  3957172  3758667  4680228  2096124]
 [       7  9847968  4458487  3486086  3489818]
 [       8  3741451  4623691  1119612  3311879]
 [       9  5170464  8365302  3576684  5508015]]
Distances to nearest neighbors (first 10 queries):
[[0.         0.3163097  0.34529692 0.34696847 0.35441175]
 [0.         0.12889135 0.16931581 0.17161056 0.17416853]
 [0.         0.20816678 0.21986952 0.22561806 0.22668757]
 [0.         0.2300973  0.24736458 0.25816062 0.26168203]
 [0.         0.2472966  0.2522156  0.25333616 0.26297927]
 [0.         0.2606849  0.26170495 0.26498383 0.26905888]
 [0.         0.17823675 0.1866689  0.204

In [20]:
import numpy as np

# ... (rest of your code) ...

# Validate nearest neighbors for the first 5 queries
for i in range(5):  # Check the first 5 queries
    print(f"\nQuery {i}:")
    print(f"  Indices of neighbors: {indices[i]}")
    print(f"  Distances to neighbors: {distances[i]}")

    # Validate using ground truth labels
    query_label = labels[i]  # Fix indexing for query label
    neighbor_labels = [labels[j] for j in indices[i]]  # Fix indexing for neighbors
    print(f"  Query label: {query_label}, Neighbor labels: {neighbor_labels}")

    # Validate correct neighbors
    # Use np.array_equal to compare arrays element-wise
    correct_neighbors = sum(1 for nl in neighbor_labels if np.array_equal(nl, query_label))
    print(f"  {correct_neighbors}/{len(neighbor_labels)} neighbors match the query label.")

    if correct_neighbors == len(neighbor_labels):
        print("  ✅ Validation: All neighbors are correct!")
    elif correct_neighbors > 0:
        print("  ⚠️ Partial match: Some neighbors are correct.")
    else:
        print("  ❌ Validation: All neighbors are incorrect.")

    print(f"  Distance range: {min(distances[i]):.2f} - {max(distances[i]):.2f}")


Query 0:
  Indices of neighbors: [       0 10666922 10998619  4671894  5452253]
  Distances to neighbors: [0.         0.3163097  0.34529692 0.34696847 0.35441175]
  Query label: [ 55  23  21  15  43 100 116  63 101  17   5   1   1  12  60 117  33  80
  19   3   3  15  53  48  46  46   3   0   2   8  35  57  47  39  51  19
  14  19  70  54  73  29   9   2   3  60 117  75 117  48   8   3  14  31
 117 117  21  10   1   2  32  74 117  69  21   2   2  10  72  36  78  52
  72  36  55   8   7  26  57  68 117  61  33   4  24   8   7  26  19   4
   0   2  59  32   8   5   5   0   0   1  27   2   0   7   8   2   1   0
   0   0   0  10   0   1   1   0   0   0   0   0   0   0   0   0   0   0
   0   0], Neighbor labels: [array([ 55,  23,  21,  15,  43, 100, 116,  63, 101,  17,   5,   1,   1,
        12,  60, 117,  33,  80,  19,   3,   3,  15,  53,  48,  46,  46,
         3,   0,   2,   8,  35,  57,  47,  39,  51,  19,  14,  19,  70,
        54,  73,  29,   9,   2,   3,  60, 117,  75, 117,  48,   8