# **Product Matching Model**


# **Import Required Libraries**

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import pickle
import os
# Seed the PyTorch and NumPy global random number generators with a fixed
# value so that repeated runs of the notebook are reproducible
torch.manual_seed(42)
np.random.seed(42)

## **2: Custom Dataset Class**

- Inherits from PyTorch's Dataset class
- Stores the DataFrame and implements required methods `(__len__ and __getitem__)`
- Returns the pre-computed embeddings of both titles together with the match label
- Converts the label to a Python float for PyTorch compatibility

In [3]:
class ProductPairDataset(Dataset):
    """
    Map-style PyTorch Dataset serving product-title pairs as embedding pairs.

    Each item is addressed by position in the pair table and resolved to the
    pre-computed embeddings of its two titles plus the match label.
    """

    def __init__(self, dataframe, embedding_map):
        """
        Store the pair table and the title -> embedding lookup.

        Args:
            dataframe: pandas DataFrame with columns ['title1', 'title2', 'match']
            embedding_map: dictionary mapping titles to pre-computed embeddings
        """
        self.embedding_map = embedding_map
        # Drop whatever index the caller's frame had; rows are read by position
        self.data = dataframe.reset_index(drop=True)

    def __len__(self):
        """Return the number of title pairs in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Fetch one pair by position.

        Args:
            idx: positional row index into the pair table
        Returns:
            tuple: (embedding1, embedding2, label), label being a Python float
        Raises:
            KeyError: if either title has no entry in the embedding map
        """
        record = self.data.iloc[idx]

        # Resolve both titles, in order, through the pre-computed lookup
        first, second = (
            self.embedding_map[record[column]] for column in ('title1', 'title2')
        )

        return first, second, float(record['match'])

## **3: Siamese Network Architecture**

- Takes pre-computed `all-MiniLM-L6-v2` SentenceTransformer embeddings (384-dim) as input
- The SentenceTransformer itself is not part of the network, so its weights are never updated during training
- Adds a trainable projection head `(Linear->ReLU->Linear)` that outputs 128-dim embeddings
- Implements shared weight architecture through forward_one method
- Returns embeddings for both inputs in the pair

In [4]:
class SiameseNetwork(nn.Module):
    """
    Siamese projection network operating on pre-computed SBERT embeddings.

    The module holds a single projection head; both inputs of a pair are
    passed through that same head, which is what makes the weights shared.
    """

    def __init__(self, input_dim=384, embedding_dim=128):
        """
        Build the projection head.

        Args:
            input_dim: dimension of input SBERT embeddings (384 for all-MiniLM-L6-v2)
            embedding_dim: final output embedding dimension
        """
        super().__init__()

        # Linear -> ReLU -> Linear; the only trainable part of the model
        layers = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, embedding_dim),
        ]
        self.projection_head = nn.Sequential(*layers)

    def forward_one(self, embedding):
        """
        Project a single pre-computed embedding (or a batch of them).

        Args:
            embedding: pre-computed SBERT embedding tensor
        Returns:
            torch.Tensor: projected embedding vector
        """
        projected = self.projection_head(embedding)
        return projected

    def forward(self, embedding1, embedding2):
        """
        Project both sides of a pair through the shared head.

        Args:
            embedding1: first pre-computed embedding
            embedding2: second pre-computed embedding
        Returns:
            tuple: (projected_embedding1, projected_embedding2)
        """
        return self.forward_one(embedding1), self.forward_one(embedding2)

## **4: Contrastive Loss Implementation**

- Takes two embeddings and a label (1 for match, 0 for no match)
- Calculates Euclidean distance between embeddings
- Applies contrastive loss formula:

  - For matching pairs (label=1): penalizes large distances
  - For non-matching pairs (label=0): penalizes small distances (below margin)


- Uses margin of 1.0 as specified

In [5]:
class ContrastiveLoss(nn.Module):
    """
    Contrastive loss for Siamese networks.

    Matching pairs (label 1) are penalised by their squared distance;
    non-matching pairs (label 0) are penalised by the squared amount by which
    their distance falls short of the margin.
    """
    def __init__(self, margin=1.0):
        """
        Args:
            margin: distance beyond which non-matching pairs contribute no loss
        """
        super().__init__()
        self.margin = margin

    def forward(self, embedding1, embedding2, label):
        """
        Calculate the mean contrastive loss over a batch.

        Args:
            embedding1: first embedding vector (or batch of vectors)
            embedding2: second embedding vector (or batch of vectors)
            label: 1 for similar pairs (should have small distance)
                  0 for dissimilar pairs (should have large distance);
                  may be float, integer or boolean, shaped like the distances
                  or with the same number of elements (e.g. a column vector)
        Returns:
            torch.Tensor: scalar contrastive loss in the embeddings' dtype
        """
        # Calculate Euclidean distance, one value per pair
        distance = torch.nn.functional.pairwise_distance(embedding1, embedding2)

        # Bring the labels to the distances' dtype/device. Without this,
        # float64 labels promote the whole loss to float64 and boolean labels
        # fail on `1 - label`.
        label = torch.as_tensor(label, dtype=distance.dtype, device=distance.device)

        # A (N, 1) label tensor would otherwise broadcast against the (N,)
        # distances into an (N, N) matrix and silently average the wrong terms.
        if label.shape != distance.shape and label.numel() == distance.numel():
            label = label.reshape(distance.shape)

        # If match (label=1): penalize large distances (distance^2)
        match_term = label * distance.pow(2)
        # If no-match (label=0): penalize small distances (margin-distance)^2
        shortfall = torch.clamp(self.margin - distance, min=0.0)
        mismatch_term = (1 - label) * shortfall.pow(2)

        return torch.mean(match_term + mismatch_term)

In [6]:
def precompute_embeddings(df, model_name='all-MiniLM-L6-v2'):
    """
    Encode every distinct title in the pair table once with SBERT.

    Encoding each title a single time up front avoids re-encoding the same
    text for every pair and every epoch during training.

    Args:
        df: DataFrame with 'title1' and 'title2' columns
        model_name: SentenceTransformer model to load
    Returns:
        dict: title -> embedding tensor, one entry per unique title
    """
    print("Loading SentenceTransformer model...")
    encoder = SentenceTransformer(model_name)

    print("Extracting unique titles...")
    # Distinct titles across both columns
    unique_titles = pd.concat([df['title1'], df['title2']]).unique()
    total = len(unique_titles)
    print(f"Found {total} unique titles")

    print("Pre-computing SBERT embeddings...")
    lookup = {}

    # Encode in fixed-size chunks for memory efficiency
    chunk_size = 32
    for chunk_no, start in enumerate(range(0, total, chunk_size), start=1):
        chunk = unique_titles[start:start + chunk_size]
        vectors = encoder.encode(
            chunk.tolist(),
            convert_to_tensor=True,
            show_progress_bar=False
        )

        # Pair each title with its row of the encoded chunk
        lookup.update(zip(chunk, vectors))

        # Report progress every tenth chunk
        if chunk_no % 10 == 0:
            print(f"Processed {start + len(chunk)}/{total} titles")

    print(f"Successfully computed embeddings for {total} unique titles")
    return lookup

## **5: Data Loading and Preprocessing**

- Reads the CSV file with product pairs
- Creates the custom Dataset instance
- Splits data into 80% training and 20% testing
- Uses fixed random seed for reproducible splits
- Creates DataLoader objects for batch processing
- Prints data distribution statistics

In [7]:
def load_and_prepare_data(csv_file_path, batch_size=32, test_split=0.2):
    """
    Read the pair CSV, pre-compute embeddings and build train/test DataLoaders.

    Args:
        csv_file_path: path of a CSV with 'title1', 'title2' and 'match' columns
        batch_size: batch size used by both loaders
        test_split: fraction of the pairs held out for testing
    Returns:
        tuple: (train_loader, test_loader, embedding_map)
    """
    print(f"Loading data from {csv_file_path}...")
    df = pd.read_csv(csv_file_path)
    pair_count = len(df)
    print(f"Loaded {pair_count} product pairs")

    # Class balance of the labelled pairs
    match_count = df['match'].sum()
    print(f"Matching pairs: {match_count}")
    print(f"Non-matching pairs: {pair_count - match_count}")

    # Encode every unique title once, then wrap the table as a Dataset
    embedding_map = precompute_embeddings(df)
    full_dataset = ProductPairDataset(df, embedding_map)

    # The test share is truncated to an int; the remainder goes to training
    total = len(full_dataset)
    test_size = int(total * test_split)
    split_sizes = [total - test_size, test_size]

    # Dedicated, fixed-seed generator so the split is reproducible
    split_rng = torch.Generator().manual_seed(42)
    train_dataset, test_dataset = random_split(
        full_dataset, split_sizes, generator=split_rng
    )

    print(f"Training samples: {len(train_dataset)}")
    print(f"Testing samples: {len(test_dataset)}")

    # num_workers=0 keeps loading in the main process (avoids multiprocessing issues)
    shared_options = dict(batch_size=batch_size, num_workers=0)
    train_loader = DataLoader(train_dataset, shuffle=True, **shared_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **shared_options)

    return train_loader, test_loader, embedding_map

In [10]:
# Run configuration: location of the labelled pair CSV and the mini-batch size
CSV_FILE = '/content/drive/MyDrive/Product Matching/Data/training_pairs.csv'
BATCH_SIZE = 32

# Build the 80/20 train/test loaders; the title -> embedding map is returned too
train_loader, test_loader, embedding_map = load_and_prepare_data(
    CSV_FILE, batch_size=BATCH_SIZE, test_split=0.2
)

Loading data from /content/drive/MyDrive/Product Matching/Data/training_pairs.csv...
Loaded 102882 product pairs
Matching pairs: 51441
Non-matching pairs: 51441
Loading SentenceTransformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Extracting unique titles...
Found 30800 unique titles
Pre-computing SBERT embeddings...
Processed 320/30800 titles
Processed 640/30800 titles
Processed 960/30800 titles
Processed 1280/30800 titles
Processed 1600/30800 titles
Processed 1920/30800 titles
Processed 2240/30800 titles
Processed 2560/30800 titles
Processed 2880/30800 titles
Processed 3200/30800 titles
Processed 3520/30800 titles
Processed 3840/30800 titles
Processed 4160/30800 titles
Processed 4480/30800 titles
Processed 4800/30800 titles
Processed 5120/30800 titles
Processed 5440/30800 titles
Processed 5760/30800 titles
Processed 6080/30800 titles
Processed 6400/30800 titles
Processed 6720/30800 titles
Processed 7040/30800 titles
Processed 7360/30800 titles
Processed 7680/30800 titles
Processed 8000/30800 titles
Processed 8320/30800 titles
Processed 8640/30800 titles
Processed 8960/30800 titles
Processed 9280/30800 titles
Processed 9600/30800 titles
Processed 9920/30800 titles
Processed 10240/30800 titles
Processed 10560/30

## **6: Training Function**

- Sets up device (GPU if available, otherwise CPU)
- Initializes ContrastiveLoss and Adam optimizer with specified parameters
- Iterates through epochs and batches
- Performs forward pass, loss calculation, and backpropagation
- Prints training progress and loss statistics
- Returns the trained model

In [8]:
def train_model(model, train_loader, num_epochs=10, learning_rate=0.001, weight_decay=1e-5):
    """
    Train the Siamese Network with pre-computed embeddings.

    Args:
        model: network whose forward takes (embedding1, embedding2) and returns
            the two projected embeddings
        train_loader: sized iterable yielding (embedding1, embedding2, label) batches
        num_epochs: number of passes over train_loader
        learning_rate: Adam learning rate
        weight_decay: Adam weight decay
    Returns:
        The trained model (moved to the training device)
    Raises:
        ValueError: if train_loader contains no batches
    """
    # Fail fast with a clear message; otherwise the per-epoch average below
    # would divide by zero after a pointless setup.
    total_batches = len(train_loader)
    if total_batches == 0:
        raise ValueError("train_loader is empty; at least one batch is required for training")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on device: {device}")

    model = model.to(device)

    # Loss function and optimizer
    criterion = ContrastiveLoss(margin=1.0)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0

        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print("-" * 30)

        for batch_idx, (embedding1_batch, embedding2_batch, labels_batch) in enumerate(train_loader):
            # Move tensors to device
            embedding1_batch = embedding1_batch.to(device)
            embedding2_batch = embedding2_batch.to(device)
            # Python-float labels are collated as float64; match the embeddings'
            # dtype so the loss is not silently promoted to double precision.
            labels_batch = labels_batch.to(device=device, dtype=embedding1_batch.dtype)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            output1, output2 = model(embedding1_batch, embedding2_batch)

            # Calculate loss
            loss = criterion(output1, output2, labels_batch)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Read the scalar once; each .item() call forces a device sync
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            # Print progress
            if (batch_idx + 1) % 10 == 0:
                print(f"Batch {batch_idx+1}/{total_batches}, Loss: {batch_loss:.4f}")

        # Print epoch statistics (guarded in case iteration yielded nothing)
        avg_loss = total_loss / max(num_batches, 1)
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

    return model

In [11]:
# Training hyper-parameters
NUM_EPOCHS = 20
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-5

# Projection head sized for 384-dim SBERT inputs and 128-dim outputs
model = SiameseNetwork(input_dim=384, embedding_dim=128)

# Number of trainable weights (computed for reference; not printed in this cell)
trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)

trained_model = train_model(
    model,
    train_loader,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch 2180/2573, Loss: 0.0109
Batch 2190/2573, Loss: 0.0246
Batch 2200/2573, Loss: 0.0322
Batch 2210/2573, Loss: 0.0482
Batch 2220/2573, Loss: 0.0568
Batch 2230/2573, Loss: 0.0297
Batch 2240/2573, Loss: 0.0372
Batch 2250/2573, Loss: 0.0332
Batch 2260/2573, Loss: 0.0279
Batch 2270/2573, Loss: 0.0403
Batch 2280/2573, Loss: 0.0756
Batch 2290/2573, Loss: 0.0566
Batch 2300/2573, Loss: 0.0402
Batch 2310/2573, Loss: 0.0331
Batch 2320/2573, Loss: 0.0204
Batch 2330/2573, Loss: 0.0448
Batch 2340/2573, Loss: 0.0448
Batch 2350/2573, Loss: 0.0286
Batch 2360/2573, Loss: 0.0600
Batch 2370/2573, Loss: 0.0073
Batch 2380/2573, Loss: 0.0424
Batch 2390/2573, Loss: 0.0295
Batch 2400/2573, Loss: 0.0432
Batch 2410/2573, Loss: 0.0358
Batch 2420/2573, Loss: 0.0257
Batch 2430/2573, Loss: 0.0348
Batch 2440/2573, Loss: 0.0276
Batch 2450/2573, Loss: 0.0305
Batch 2460/2573, Loss: 0.0367
Batch 2470/2573, Loss: 0.0556
Batch 2480/2573, Loss: 0.0293
Batch

## **7: Evaluation Function**

- Sets model to evaluation mode
- Iterates through test data without gradient computation
- Calculates Euclidean distances between embeddings
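- Makes binary predictions by thresholding the distance (a pair is a match when its distance is below the threshold; the function defaults to 0.7, and the run below passes 0.6)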
- Computes precision, recall, and F1-score using sklearn
- Prints detailed evaluation results and distance statistics

In [12]:
def evaluate_model(model, test_loader, threshold=0.7):
    """
    Score a trained model on held-out pairs of pre-computed embeddings.

    A pair is predicted as a match when the Euclidean distance between its
    two projected embeddings is strictly below `threshold`.

    Args:
        model: trained network returning the two projected embeddings
        test_loader: iterable yielding (embedding1, embedding2, label) batches
        threshold: distance below which a pair counts as a match
    Returns:
        tuple: (precision, recall, f1, all_predictions, all_labels, all_distances)
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    predicted, actual, measured = [], [], []

    print("Evaluating model on test set...")

    # Inference only: no gradients are needed
    with torch.no_grad():
        for left_batch, right_batch, target_batch in test_loader:
            left_batch = left_batch.to(device)
            right_batch = right_batch.to(device)
            target_batch = target_batch.to(device)

            projected_left, projected_right = model(left_batch, right_batch)

            batch_distances = torch.nn.functional.pairwise_distance(
                projected_left, projected_right
            )

            # 1.0 where the pair is close enough to be called a match, else 0.0
            batch_predictions = (batch_distances < threshold).float()

            # Accumulate per-sample results on the CPU
            predicted.extend(batch_predictions.cpu().numpy())
            actual.extend(target_batch.cpu().numpy())
            measured.extend(batch_distances.cpu().numpy())

    all_predictions = np.array(predicted)
    all_labels = np.array(actual)
    all_distances = np.array(measured)

    # Binary precision / recall / F1 with the match class as positive
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predictions, average='binary', zero_division=0
    )

    is_match = all_labels == 1
    is_non_match = all_labels == 0

    print(f"\nEvaluation Results (threshold = {threshold}):")
    print("-" * 40)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Average Distance (Matches): {all_distances[is_match].mean():.4f}")
    print(f"Average Distance (Non-matches): {all_distances[is_non_match].mean():.4f}")

    return precision, recall, f1, all_predictions, all_labels, all_distances

In [13]:
# Distance below which a pair is predicted to be a match
THRESHOLD = 0.6

# Score the trained model on the held-out split
precision, recall, f1, predictions, labels, distances = evaluate_model(
    trained_model, test_loader, threshold=THRESHOLD
)

Evaluating model on test set...

Evaluation Results (threshold = 0.6):
----------------------------------------
Precision: 0.9661
Recall: 0.9970
F1-Score: 0.9813
Average Distance (Matches): 0.1225
Average Distance (Non-matches): 1.3318


## **8: Model Saving Function**

- Saves only the trainable projection head (the SBERT encoder is not part of the model and is reloaded by name at inference time)
- Stores model metadata for proper reconstruction
- Implements loading function to recreate the model
- Uses CPU mapping to ensure compatibility across devices

In [14]:
def save_model(model, filepath='siamese_matcher.pth'):
    """
    Save the trained model's projection head together with its metadata.

    The layer sizes are read from the model itself so the stored metadata
    always matches the saved weights.

    Args:
        model: object exposing a `projection_head` (an nn.Sequential of layers)
        filepath: destination path of the checkpoint
    """
    head = model.projection_head
    linear_layers = [layer for layer in head if isinstance(layer, nn.Linear)]

    # Fall back to the notebook's defaults only if no Linear layer is found
    input_dim = linear_layers[0].in_features if linear_layers else 384
    embedding_dim = linear_layers[-1].out_features if linear_layers else 128

    # Only the projection head is stored; the SentenceTransformer is recorded
    # by name and is not part of the saved weights
    model_state = {
        'projection_head_state_dict': head.state_dict(),
        'model_architecture': 'SiameseNetwork',
        'sbert_model': 'all-MiniLM-L6-v2',
        'embedding_dim': embedding_dim,
        'input_dim': input_dim
    }

    torch.save(model_state, filepath)
    print(f"Model saved to {filepath}")

def load_model(filepath='siamese_matcher.pth'):
    """
    Load a trained model from a checkpoint written by save_model.

    Args:
        filepath: path of the checkpoint
    Returns:
        SiameseNetwork with the stored projection-head weights, on the CPU
    """
    # The checkpoint holds only tensors and plain Python values, so restrict
    # unpickling to those instead of executing arbitrary pickled objects.
    model_state = torch.load(filepath, map_location='cpu', weights_only=True)
    head_state = model_state['projection_head_state_dict']

    # Start from the stored metadata (older checkpoints carry no 'input_dim')
    input_dim = model_state.get('input_dim', 384)
    embedding_dim = model_state['embedding_dim']

    # Trust the actual weight shapes over the metadata: Linear weights are
    # stored as (out_features, in_features), and the head's Linear layers sit
    # at positions 0 and 2 of the Sequential.
    if '0.weight' in head_state:
        input_dim = head_state['0.weight'].shape[1]
    if '2.weight' in head_state:
        embedding_dim = head_state['2.weight'].shape[0]

    # Create new model instance
    model = SiameseNetwork(input_dim=input_dim, embedding_dim=embedding_dim)

    # Load only the projection head weights
    model.projection_head.load_state_dict(head_state)

    print(f"Model loaded from {filepath}")
    return model


In [17]:
# Persist the trained projection head to the working directory
save_model(trained_model, filepath='siamese_matcher.pth')

Model saved to siamese_matcher.pth


In [24]:
def test_inference(model_path='siamese_matcher.pth', threshold=0.6, test_pairs=None):
    """
    Example of how to use the trained model for inference.

    Loads the saved projection head, encodes each title with the
    SentenceTransformer, projects both embeddings and prints the distance
    and the resulting MATCH / NO MATCH decision for every pair.

    Args:
        model_path: path of the checkpoint written by save_model
        threshold: distance below which a pair is reported as a match
        test_pairs: optional iterable of (title1, title2) tuples; when omitted
            a built-in list of sample product pairs is used
    """
    import torch
    from sentence_transformers import SentenceTransformer

    # Load the trained Siamese model
    model = load_model(model_path)

    # Move model to device (GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    # Load the SentenceTransformer model
    text_encoder = SentenceTransformer('all-MiniLM-L6-v2')

    # Example product pairs for testing
    if test_pairs is None:
        test_pairs = [
            ("iPhone 13 Pro Max 256GB Blue", "Apple iPhone 13 Pro Max 256GB Blue"),
            ("Samsung Galaxy S21", "iPhone 12 Pro"),
            ("Nike Air Force 1 White", "Nike Air Force One White Sneakers"),
            ("Dell Laptop 15 inch", "MacBook Pro 13 inch"),

            # Additional 30 pairs
            ("Sony WH-1000XM4 Headphones", "Sony WH1000XM4 Wireless Noise Cancelling Headphones"),
            ("Canon EOS 5D Mark IV DSLR", "Canon EOS 6D Mark II DSLR Camera"),
            ("Apple Watch Series 7 GPS", "Apple Watch Series 7 Cellular"),
            ("Adidas Ultraboost 21", "Adidas Ultraboost 20 Shoes"),
            ("Lenovo ThinkPad X1 Carbon", "Lenovo ThinkPad X1 Yoga"),
            ("Samsung Galaxy Tab S7", "Samsung Galaxy Tab S8"),
            ("Google Pixel 6 Pro", "Google Pixel 6"),
            ("Bose QuietComfort 35 II", "Bose Noise Cancelling Headphones 700"),
            ("HP Spectre x360 Laptop", "HP Envy x360 Laptop"),
            ("Microsoft Surface Pro 7", "Microsoft Surface Go 3"),
            ("Fitbit Charge 5", "Fitbit Inspire 2"),
            ("Logitech MX Master 3 Mouse", "Logitech MX Anywhere 3 Mouse"),
            ("Razer BlackWidow V3 Keyboard", "Razer Huntsman Mini Keyboard"),
            ("GoPro Hero 9 Black", "GoPro Hero 8 Black"),
            ("Nintendo Switch Console", "Nintendo Switch OLED Console"),
            ("Amazon Echo Dot 4th Gen", "Amazon Echo Dot 3rd Gen"),
            ("JBL Flip 5 Bluetooth Speaker", "JBL Charge 4 Speaker"),
            ("Canon PIXMA MG3620 Printer", "Canon PIXMA TS6320 Printer"),
            ("Samsung Galaxy Buds Pro", "Samsung Galaxy Buds Live"),
            ("Apple AirPods Pro", "Apple AirPods 2nd Gen"),
            ("Sony PlayStation 5 Console", "Sony PlayStation 4 Pro"),
            ("LG 55 inch OLED TV", "LG 65 inch OLED TV"),
            ("Dyson V11 Vacuum Cleaner", "Dyson V10 Vacuum Cleaner"),
            ("Instant Pot Duo 7-in-1", "Instant Pot Duo 6-in-1"),
            ("Nikon D750 DSLR Camera", "Nikon D5600 DSLR Camera"),
            ("Corsair Vengeance LPX 16GB RAM", "Corsair Dominator Platinum 16GB RAM"),
            ("Seagate 2TB External Hard Drive", "Seagate 1TB External Hard Drive"),
            ("Samsung 970 EVO Plus SSD 1TB", "Samsung 980 PRO SSD 1TB"),
            ("Kingston A2000 500GB NVMe SSD", "Crucial P2 500GB NVMe SSD"),
            ("Anker PowerCore 10000mAh", "Crucial P2 500GB NVMe SSD"),
        ]

    print("Testing inference on sample pairs:")
    print("-" * 60)

    with torch.no_grad():
        for title1, title2 in test_pairs:
            # Get embeddings using the SentenceTransformer; unsqueeze adds the
            # batch dimension the projection head is fed with during training
            embedding1 = text_encoder.encode(title1, convert_to_tensor=True).unsqueeze(0).to(device)
            embedding2 = text_encoder.encode(title2, convert_to_tensor=True).unsqueeze(0).to(device)

            # Get projected embeddings from the Siamese network
            output1, output2 = model(embedding1, embedding2)

            # Calculate distance
            distance = torch.nn.functional.pairwise_distance(output1, output2).item()

            # Make prediction: a pair matches when it is closer than the threshold
            is_match = distance < threshold

            print(f"Title 1: {title1}")
            print(f"Title 2: {title2}")
            print(f"Distance: {distance:.4f}")
            print(f"Prediction: {'MATCH' if is_match else 'NO MATCH'}")
            print("-" * 60)

# Uncomment to test inference after training
# test_inference()


In [25]:
# Run the inference demo against the checkpoint saved above
test_inference(model_path='siamese_matcher.pth')

Model loaded from siamese_matcher.pth
Testing inference on sample pairs:
------------------------------------------------------------
Title 1: iPhone 13 Pro Max 256GB Blue
Title 2: Apple iPhone 13 Pro Max 256GB Blue
Distance: 0.0373
Prediction: MATCH
------------------------------------------------------------
Title 1: Samsung Galaxy S21
Title 2: iPhone 12 Pro
Distance: 0.9483
Prediction: NO MATCH
------------------------------------------------------------
Title 1: Nike Air Force 1 White
Title 2: Nike Air Force One White Sneakers
Distance: 0.2780
Prediction: MATCH
------------------------------------------------------------
Title 1: Dell Laptop 15 inch
Title 2: MacBook Pro 13 inch
Distance: 0.5890
Prediction: MATCH
------------------------------------------------------------
Title 1: Sony WH-1000XM4 Headphones
Title 2: Sony WH1000XM4 Wireless Noise Cancelling Headphones
Distance: 0.3523
Prediction: MATCH
------------------------------------------------------------
Title 1: Canon EOS 5