# Siamese Network for Signature Verification

This notebook implements a **Siamese Network** to learn a similarity metric between signatures. 
Goal: Distinguish between **Genuine-Genuine** pairs (same person) and **Genuine-Forged** pairs.

In [None]:
import os
import random
import glob
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras import backend as K

# Set Random Seeds
# Seed Python's, NumPy's and TensorFlow's random generators with one fixed
# value so that code drawing from them (e.g. random.shuffle) repeats between runs.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Check GPU
# Ask TensorFlow which GPUs it can see; an empty list takes the CPU branch.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Switch on memory growth for every visible GPU.
        # NOTE(review): presumably this makes TensorFlow allocate GPU memory
        # on demand instead of reserving it all up front -- confirm in TF docs.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU Detected: {gpus}")
    except RuntimeError as e:
        # Best effort: report the problem and continue rather than abort.
        print(e)
else:
    print("No GPU detected. Running on CPU.")

In [None]:
# Configuration
# Target size handed to tf.image.resize; also used (plus 3 channels) as the
# model input shape.
IMG_SIZE = (224, 224)
# Batch size passed to create_dataset for both the train and test datasets.
BATCH_SIZE = 32
# Root directory scanned by load_data for the per-person signature folders.
SOURCE_DIR = "signatures"

## 1. Data Preparation
We load data directly from the `signatures` source directory and organize it by person ID. This facilitates easy pair generation.

In [None]:
def load_data(source_dir):
    """
    Load genuine and forged signature paths, grouped by person ID.

    Every sub-directory of ``source_dir`` is treated as one folder of
    signatures. A folder whose name contains ``_forg`` (e.g. ``001_forg``)
    holds forgeries for the person named by the rest of the folder name; any
    other folder (e.g. ``001``) holds that person's genuine signatures. Only
    files ending in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are
    collected.

    Folders and files are visited in sorted order so the returned structure,
    and therefore any seeded shuffle or split built on top of it, is the same
    on every run and every file system.

    Args:
        source_dir: Path of the directory that contains the per-person folders.

    Returns:
        dict { person_id: {'genuine': [paths], 'forged': [paths]} }.
        An empty dict is returned (after printing an error) when
        ``source_dir`` is missing or is not a directory.
    """
    data = {}
    if not os.path.isdir(source_dir):
        print(f"Error: Source directory '{source_dir}' not found.")
        return data

    image_exts = ('.png', '.jpg', '.jpeg')

    # os.listdir makes no ordering promise; sort so results are reproducible.
    for folder in sorted(os.listdir(source_dir)):
        path = os.path.join(source_dir, folder)
        if not os.path.isdir(path):
            continue

        # Parse Person ID from folder name (e.g., '001', '001_forg')
        if '_forg' in folder:
            person_id = folder.replace('_forg', '')
            cat = 'forged'
        else:
            person_id = folder
            cat = 'genuine'

        entry = data.setdefault(person_id, {'genuine': [], 'forged': []})

        # Escape the folder path so characters such as '[' are matched
        # literally, then sort because glob order is unspecified as well.
        pattern = os.path.join(glob.escape(path), "*")
        files = sorted(
            f for f in glob.glob(pattern)
            if f.lower().endswith(image_exts)
        )
        entry[cat].extend(files)

    return data

# Index every signature under SOURCE_DIR by person and report how many
# distinct person IDs were found.
data_dict = load_data(SOURCE_DIR)
print(f"Loaded {len(data_dict)} people.")

## 2. Pair Generation
We define a function to generate pairs:
- **Positive (Label 1)**: Two Genuine signatures of the same person.
- **Negative (Label 0)**: One Genuine and one Forged signature of the same person.

We also split the data into **Train** and **Test** sets based on Person IDs (80/20).

In [None]:
def make_pairs(data_dict, person_ids):
    """
    Generate pairs of paths (image_A, image_B) and their labels.

    Label convention: 1.0 = same writer (genuine + genuine),
    0.0 = different (genuine + forged).

    For every person in ``person_ids`` that has at least two genuine
    signatures:
      * each unordered combination of two genuine signatures becomes a
        positive pair;
      * each genuine signature is paired with each forged signature of that
        same person as a negative pair.
    People with fewer than two genuine signatures are skipped entirely, so
    their forgeries contribute no pairs either.

    Args:
        data_dict: { person_id: {'genuine': [paths], 'forged': [paths]} }.
        person_ids: Iterable of ``data_dict`` keys to build pairs for.

    Returns:
        Tuple ``(paths_a, paths_b, labels)`` of equally long 1-D NumPy arrays:
        two string arrays of paths and one float32 array of labels. The path
        arrays keep a string dtype even when no pair was produced.
    """
    from itertools import combinations, product

    pairs_1 = []
    pairs_2 = []
    labels = []

    for pid in person_ids:
        gens = data_dict[pid]['genuine']
        forgs = data_dict[pid]['forged']

        if len(gens) < 2:
            continue

        # --- Positive Pairs (Genuine + Genuine): every i < j combination ---
        positives = list(combinations(gens, 2))
        # --- Negative Pairs (Genuine + Forged): full cross product ---
        negatives = list(product(gens, forgs))

        # Positives first, then negatives, per person.
        person_pairs = positives + negatives
        pairs_1.extend(first for first, _ in person_pairs)
        pairs_2.extend(second for _, second in person_pairs)
        labels.extend([1.0] * len(positives))
        labels.extend([0.0] * len(negatives))

    # An explicit string dtype stops empty results from defaulting to float64.
    return (
        np.array(pairs_1, dtype=str),
        np.array(pairs_2, dtype=str),
        np.array(labels, dtype='float32'),
    )

# Split IDs
# Sort before shuffling: the dict's key order follows the order in which the
# folders were listed on disk, which is not guaranteed to be stable, so the
# seeded shuffle alone would not give a reproducible train/test split.
all_ids = sorted(data_dict)
random.shuffle(all_ids)
split_idx = int(0.8 * len(all_ids))  # first 80% of the people -> training
train_ids = all_ids[:split_idx]
test_ids = all_ids[split_idx:]       # remaining people are held out

print(f"Train People: {len(train_ids)}, Test People: {len(test_ids)}")

# Generate Pairs
# The split is by person, so no writer appears in both sets.
tr_p1, tr_p2, tr_y = make_pairs(data_dict, train_ids)
te_p1, te_p2, te_y = make_pairs(data_dict, test_ids)

print(f"Train Pairs: {len(tr_y)} (Positive: {np.sum(tr_y==1)}, Negative: {np.sum(tr_y==0)})")
print(f"Test Pairs: {len(te_y)} (Positive: {np.sum(te_y==1)}, Negative: {np.sum(te_y==0)})")

In [None]:
# TF Data Pipeline
def preprocess_image(path):
    """
    Read one image file and turn it into a model-ready tensor.

    The file contents are decoded to 3 channels, resized to IMG_SIZE and
    divided by 255.0.

    Args:
        path: Path of the image file to read.

    Returns:
        The resized image tensor after division by 255.0.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, IMG_SIZE)
    return resized / 255.0

def preprocess_pair(path1, path2, label):
    """
    Load both images of a pair and pass the label through unchanged.

    Returns:
        ``((image_1, image_2), label)`` -- the (inputs, target) layout that
        the rest of the notebook unpacks from each dataset element.
    """
    first_image = preprocess_image(path1)
    second_image = preprocess_image(path2)
    return (first_image, second_image), label

def create_dataset(p1, p2, y, batch_size=32, shuffle=True):
    """
    Build a batched, prefetched tf.data pipeline of image pairs.

    Args:
        p1: Sequence of paths for the first image of each pair.
        p2: Sequence of paths for the second image of each pair.
        y: Sequence of labels, one per pair.
        batch_size: Number of pairs per batch.
        shuffle: When True, shuffle the pairs before loading and batching.

    Returns:
        A tf.data.Dataset yielding ``((images_1, images_2), labels)`` batches.
    """
    ds = tf.data.Dataset.from_tensor_slices((p1, p2, y))
    if shuffle:
        # The pairs arrive grouped by person, positives before negatives, so a
        # small buffer would only mix neighbouring pairs. At this stage each
        # element is just two paths and a label, so buffering the whole
        # dataset is cheap and gives a uniform shuffle. buffer_size must be
        # positive, hence the max().
        ds = ds.shuffle(buffer_size=max(len(y), 1))
    # Decode images only after shuffling so the buffer never holds pixel data.
    ds = ds.map(preprocess_pair, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# Training pairs are shuffled; test pairs keep their original order so that
# predictions made on test_ds can be compared position-by-position with te_y.
train_ds = create_dataset(tr_p1, tr_p2, tr_y, BATCH_SIZE)
test_ds = create_dataset(te_p1, te_p2, te_y, BATCH_SIZE, shuffle=False)

In [None]:
# Visualization
def visualize_pairs(dataset, num_pairs=5):
    """
    Plot image pairs taken from the first batch of ``dataset``.

    The top row shows the first image of each pair; the bottom row shows the
    second image, titled "Genuine" when the pair's label equals 1 and
    "Forged Pair" otherwise.

    Args:
        dataset: Iterable yielding ``((images_1, images_2), labels)`` batches.
        num_pairs: Maximum number of pairs to draw. Clamped to the size of
            the first batch so a short batch cannot cause an index error.
    """
    (img1_b, img2_b), label_b = next(iter(dataset))

    # Never index past the end of the batch.
    n_show = min(num_pairs, len(label_b))

    plt.figure(figsize=(15, 6))
    for i in range(n_show):
        plt.subplot(2, n_show, i + 1)
        plt.imshow(img1_b[i])
        plt.title("Signature 1")
        plt.axis("off")

        plt.subplot(2, n_show, i + 1 + n_show)
        plt.imshow(img2_b[i])
        lbl_text = "Genuine" if label_b[i] == 1 else "Forged Pair"
        plt.title(f"Sig 2: {lbl_text}")
        plt.axis("off")
    plt.show()

# Sanity check: draw a few pairs from the first training batch.
visualize_pairs(train_ds)

## 3. Model Architecture
We define the Siamese Network:
1.  **Embedding Network**: A CNN that maps images to a 128-dimensional vector.
2.  **Distance Layer**: Computes Euclidean distance between two embeddings.
3.  **Siamese Model**: Wraps the above.

In [None]:
def build_embedding_network(input_shape):
    """
    Build the CNN that maps one image to a 128-dimensional embedding.

    Three Conv2D(3x3, relu) -> MaxPooling2D(2x2) -> BatchNormalization stages
    with 32, 64 and 128 filters are followed by Flatten, a 256-unit relu
    Dense layer and a final 128-unit Dense layer with no activation.

    Args:
        input_shape: Shape of a single input image, without the batch axis.

    Returns:
        A Keras Model named "embedding_network".
    """
    image_in = layers.Input(shape=input_shape)

    # Convolutional stages, identical except for the filter count.
    features = image_in
    for n_filters in (32, 64, 128):
        features = layers.Conv2D(n_filters, (3, 3), activation='relu')(features)
        features = layers.MaxPooling2D((2, 2))(features)
        features = layers.BatchNormalization()(features)

    flattened = layers.Flatten()(features)
    hidden = layers.Dense(256, activation='relu')(flattened)
    embedding = layers.Dense(128)(hidden)  # Embedding vector

    return models.Model(image_in, embedding, name="embedding_network")

def euclidean_distance(vectors):
    """
    Row-wise Euclidean distance between two batches of embeddings.

    Args:
        vectors: A pair ``(featA, featB)`` of equally shaped 2-D tensors.

    Returns:
        Tensor with one distance per row; axis 1 is kept with size 1.
    """
    emb_a, emb_b = vectors
    difference = emb_a - emb_b
    squared_norm = K.sum(K.square(difference), axis=1, keepdims=True)
    # Clamp to epsilon so the square root never receives exactly zero.
    clamped = K.maximum(squared_norm, K.epsilon())
    return K.sqrt(clamped)

In [None]:
# Build Siamese Network
# Input shape is the image size plus a 3-channel axis.
input_shape = IMG_SIZE + (3,)

# One input per image of the pair.
img_a = layers.Input(shape=input_shape)
img_b = layers.Input(shape=input_shape)

# A single embedding network is applied to both inputs, so both branches
# share the same weights.
embedding_net = build_embedding_network(input_shape)
feat_a = embedding_net(img_a)
feat_b = embedding_net(img_b)

# The model's only output is the distance between the two embeddings.
distance = layers.Lambda(euclidean_distance, output_shape=(1,))([feat_a, feat_b])

model = models.Model(inputs=[img_a, img_b], outputs=distance)
model.summary()

## 4. Training Configuration
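We use **Contrastive Loss**: $L = y\,d^2 + (1 - y)\,\max(m - d,\ 0)^2$, averaged over the batch, where $d$ is the predicted distance, $y$ the label and $m$ the margin.
- Label 1 (Similar): Minimize the distance (Distance -> 0).
- Label 0 (Dissimilar): Push the distance up until it reaches the margin (Distance >= Margin); pairs already farther apart than the margin contribute no loss.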


In [None]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """
    Contrastive loss over predicted pair distances.

        loss = mean( y * d^2 + (1 - y) * max(margin - d, 0)^2 )

    With y = 1 (same writer) the loss is the squared distance, so such pairs
    are pulled together. With y = 0 (different) the loss is the squared
    shortfall below ``margin``, so such pairs are pushed apart until their
    distance reaches the margin and contribute nothing beyond it.

    Args:
        y_true: Labels, 1 for same and 0 for different; one per pair.
        y_pred: Tensor of predicted distances, one per pair.
        margin: Distance beyond which a "different" pair incurs no loss.

    Returns:
        Scalar tensor: the mean loss over all pairs.
    """
    # Give the labels the dtype and shape of the distances. Without this,
    # labels of shape (batch,) against distances of shape (batch, 1) would
    # broadcast to a (batch, batch) matrix and silently mix up pairs.
    y_true = K.reshape(K.cast(y_true, y_pred.dtype), K.shape(y_pred))

    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))

    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)

# Train on the contrastive loss with the RMSprop optimizer selected by name.
model.compile(loss=contrastive_loss, optimizer='rmsprop')

In [None]:
# Training
# Stop once val_loss has failed to improve for 3 epochs in a row and roll the
# model back to the weights of the best epoch.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# NOTE(review): test_ds doubles as the validation set here, so early stopping
# and weight restoration are driven by the same data the model is later
# evaluated on -- consider carving a separate validation split out of
# train_ids.
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=10,
    callbacks=[early_stopping]
)

## 5. Evaluation
We check the model's ability to distinguish pairs.
A pair is predicted as *same* when its distance is below a fixed threshold (e.g., 0.5) and as *different* otherwise; accuracy is the fraction of test pairs predicted correctly.

In [None]:
def compute_accuracy(y_true, y_pred, threshold=0.5):
    """
    Fraction of pairs classified correctly by thresholding the distance.

    A pair is predicted "same" (1) when its distance is strictly below
    ``threshold`` and "different" (0) otherwise.

    Args:
        y_true: Array-like of labels, 1 for same and 0 for different.
        y_pred: Array-like of predicted distances, one per label. Any shape
            is accepted (e.g. (N,) or (N, 1)); it is flattened first.
        threshold: Decision boundary applied to the distances.

    Returns:
        The mean of the per-pair correctness as a NumPy float.

    Raises:
        ValueError: If the number of labels and distances differ.
    """
    # Flatten both sides so (N,) labels against (N, 1) distances compare
    # element by element instead of broadcasting to an (N, N) matrix.
    labels = np.asarray(y_true).ravel()
    distances = np.asarray(y_pred).ravel()
    if labels.shape != distances.shape:
        raise ValueError(
            f"y_true has {labels.size} elements but y_pred has {distances.size}"
        )

    # If distance < threshold -> Predicted Same (1)
    # Else -> Predicted Different (0)
    pred_labels = (distances < threshold).astype(int)
    return np.mean(pred_labels == labels)

# Get predictions for Test set
# test_ds was built with shuffle=False, so the predictions come back in the
# same order as te_y and can be compared position by position.
print("Predicting on Test Set...")
predictions = model.predict(test_ds)

# Flatten the predictions to 1-D before comparing them with the labels.
accuracy = compute_accuracy(te_y, predictions.ravel())
print(f"Test Accuracy (Threshold=0.5): {accuracy * 100:.2f}%")

In [None]:
# Visualize Training History
# Plot the per-epoch training and validation loss recorded by model.fit on
# one set of axes.
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend()
plt.title('Training Loss')
plt.show()

In [None]:
# Visualizing Test Results
def show_results(dataset, predictions, num_pairs=5, threshold=0.5):
    """
    Plot pairs from the first batch of ``dataset`` with their predictions.

    The top row shows the first image of each pair, the bottom row the second
    image. Each bottom-row title gives the predicted distance, the true label
    and the predicted label, in green when they agree and red when they do
    not.

    The first ``len(batch)`` predictions are assumed to belong to the first
    batch, which only holds when ``dataset`` is iterated in the same,
    unshuffled order that produced ``predictions``.

    Args:
        dataset: Iterable yielding ``((images_1, images_2), labels)`` batches.
        predictions: Predicted distances for the whole dataset; any shape
            (e.g. (N,) or (N, 1)), flattened before use.
        num_pairs: Maximum number of pairs to draw; clamped to what the first
            batch and ``predictions`` can supply.
        threshold: Distances strictly below this value are shown as "Same".
    """
    (img1_b, img2_b), label_b = next(iter(dataset))

    # Flatten so every entry is a scalar: indexing an (N, 1) array yields a
    # 1-element array, which cannot be formatted with ':.2f'.
    flat_preds = np.asarray(predictions).ravel()
    batch_preds = flat_preds[:len(label_b)]

    # Never index past the end of the batch or of the predictions.
    n_show = min(num_pairs, len(label_b), len(batch_preds))

    plt.figure(figsize=(15, 8))
    for i in range(n_show):
        plt.subplot(2, n_show, i + 1)
        plt.imshow(img1_b[i])
        plt.axis("off")

        plt.subplot(2, n_show, i + 1 + n_show)
        plt.imshow(img2_b[i])

        dist = float(batch_preds[i])
        true_lbl = "Same" if label_b[i] == 1 else "Diff"
        pred_lbl = "Same" if dist < threshold else "Diff"

        color = "green" if true_lbl == pred_lbl else "red"

        plt.title(f"D:{dist:.2f} | T:{true_lbl} P:{pred_lbl}", color=color)
        plt.axis("off")
    plt.show()

# Show pairs from the first test batch next to the model's predictions.
show_results(test_ds, predictions)