# Train (Index) on Your Own Data

I created this notebook so you can easily create a new search engine model based on **your own images**.

### **How it works:**
1. You enter the path to your folder of images.
2. I wrote a script to scan the folder and "learn" (index) every image.
3. Then the resulting model (index) is saved to a folder of your choice as two files: `embeddings.npy` (the image vectors) and `filenames.pkl` (the matching image paths).

Let's get started!

In [None]:
import os
import sys
from pathlib import Path
import numpy as np
import torch
import pickle
from PIL import Image
from transformers import AutoProcessor, AutoModel
from tqdm.notebook import tqdm

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Running on: {DEVICE}")

In [None]:
# --- 1. SETUP MODEL ---
# Checkpoint identifier; the same model as the main project, for consistency.
MODEL_NAME = "google/siglip-base-patch16-224"

print("Loading AI Model... Please wait...")
processor = AutoProcessor.from_pretrained(MODEL_NAME)
# Load the weights, move them to the selected device, then switch to evaluation mode.
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
model = model.eval()
print("Model Loaded!")

### **Step 2: Enter Your Data Path**
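Copy and paste the full path to the folder where your images are stored. Subfolders are scanned too, and files ending in `.jpg`, `.jpeg`, `.png`, `.webp`, or `.bmp` are picked up.
Example: `C:\Users\Name\Pictures\MyVacation` or `D:\Datasets\Cars`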

In [None]:
# Read the dataset location. Surrounding whitespace and quote characters are removed
# because pasted paths often carry them along.
dataset_folder = input("Enter dataset folder path: ").strip().strip('"\'').strip()
dataset_path = Path(dataset_folder).expanduser()

# File extensions (compared in lower case) that count as images during the scan.
valid_exts = {'.jpg', '.jpeg', '.png', '.webp', '.bmp'}

# Always (re)define image_files so later cells never hit a NameError on a fresh
# kernel and never silently reuse the list from a previous run.
image_files = []

# An empty answer would resolve to the current directory, so it is rejected too;
# is_dir() also rejects a path that points at a regular file.
if not dataset_folder or not dataset_path.is_dir():
    print("Error: That folder does not exist! Please check the path.")
else:
    # Recursive scan. is_file() skips directories whose names happen to end in an
    # image extension, and sorting makes the index order reproducible across runs.
    image_files = sorted(
        p for p in dataset_path.rglob("*")
        if p.is_file() and p.suffix.lower() in valid_exts
    )
    print(f"Found {len(image_files)} images in '{dataset_path.name}'")

### **Step 3: Train (Index) the Data**
This step converts each image into a normalized embedding vector using the pretrained model loaded above. No model weights are trained; images that cannot be read or processed are skipped.

In [None]:
def compute_embedding(path):
    """Compute the L2-normalized image embedding for a single image file.

    Uses the notebook-level ``processor``, ``model`` and ``DEVICE``.

    Args:
        path: Location of the image file (anything ``PIL.Image.open`` accepts).

    Returns:
        A flattened (1-D) NumPy array holding the normalized embedding, or
        ``None`` if the image could not be opened or processed. A warning
        naming the file and the error is printed in that case.

    Raises:
        AttributeError: If the loaded model exposes no ``get_image_features``
            method. This is a configuration problem rather than a bad image,
            so it is reported loudly instead of skipping every file.
    """
    # The previous fallback called get_text_features with image inputs, which can
    # never yield an image embedding; fail with a clear message instead.
    if not hasattr(model, 'get_image_features'):
        raise AttributeError(
            f"{type(model).__name__} has no 'get_image_features' method; "
            "choose a vision-language model such as SigLIP or CLIP."
        )

    try:
        # The context manager closes the underlying file once the pixels are
        # converted; convert() returns an independent in-memory image.
        with Image.open(path) as img:
            image = img.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)
            # L2-normalize along the feature axis; F.normalize clamps the norm
            # by a small epsilon, so an all-zero output cannot produce NaNs.
            embedding = torch.nn.functional.normalize(outputs, p=2, dim=-1)
        return embedding.cpu().numpy().flatten()
    except Exception as e:
        # Best effort per image: report which file failed and why, then skip it.
        print(f"Warning: skipping '{path}': {e}")
        return None

# Parallel lists: embeddings[i] is the vector for the image stored at filenames[i].
embeddings = []
filenames = []
# Images for which compute_embedding returned None (could not be read/processed).
skipped = []

print(f"Starting processing of {len(image_files)} images...")

for img_path in tqdm(image_files):
    emb = compute_embedding(img_path)
    if emb is None:
        skipped.append(img_path)
        continue
    embeddings.append(emb)
    # Paths are stored relative to the dataset folder so the index does not
    # depend on where the dataset lives on this machine.
    filenames.append(str(img_path.relative_to(dataset_path)))

print("Processing Complete!")
# Report the outcome so silently dropped images do not go unnoticed.
print(f"Indexed {len(embeddings)} of {len(image_files)} images; skipped {len(skipped)}.")
if skipped:
    preview = ", ".join(str(p) for p in skipped[:5])
    suffix = " ..." if len(skipped) > 5 else ""
    print(f"Skipped files: {preview}{suffix}")

### **Step 4: Save Your New Model**
Where should I save this index? You can choose your Downloads folder or the project folder.

In [None]:
# Ask where to write the index. An empty answer selects the 'custom_model' folder
# in the current directory. Whitespace and surrounding quotes are stripped the
# same way as for the dataset path prompt, so a pasted quoted path works here too.
save_dir = input("Enter folder path to save model (Press Enter for default 'custom_model'): ").strip().strip('"\'').strip()

if not save_dir:
    save_dir = "custom_model"

out_path = Path(save_dir).expanduser()

if len(embeddings) > 0:
    # Create the output folder only when there is something to save, so a failed
    # run does not leave an empty directory behind.
    out_path.mkdir(parents=True, exist_ok=True)

    # Save Embeddings: stack the per-image vectors into one 2-D float32 array.
    emb_array = np.vstack(embeddings).astype('float32')
    emb_file = out_path / "embeddings.npy"
    np.save(emb_file, emb_array)

    # Save Filenames: a pickled list of strings, in the same order as the array
    # rows. NOTE(review): pickle is kept because that is the existing file format;
    # only load filenames.pkl files that you created yourself.
    names_file = out_path / "filenames.pkl"
    with open(names_file, 'wb') as f:
        pickle.dump(filenames, f)

    print(f"\nSUCCESS! Model saved to: {out_path.absolute()}")
    print("\nFILES CREATED:")
    # Absolute paths, so the files can be found regardless of the working directory.
    print(f"1. {emb_file.absolute()}")
    print(f"2. {names_file.absolute()}")

    print("\nTo use this model, point your scripts or API to this folder!")
else:
    print("Warning: No embeddings were generated. Check your images.")