In [27]:
import pandas as pd
import numpy as np
from pathlib import Path
import cv2
import sys
import img_preprocessing_util_functions as img_utils

In [28]:
# Manifest of the sampled BHSig260 Bengali signatures (one row per image file).
csv_file_path = '../data-sampling/sample_dataset_BHSig260_Bengali.csv'
df_csv = pd.read_csv(filepath_or_buffer=csv_file_path)

In [29]:
# Render the loaded manifest; the notebook shows the first and last rows only.
df_csv

Unnamed: 0,Data Source,Language,Seed,Person ID/Name,Class,Image ID,Image File
0,BHSig260,Bengali,123,person_41,true,16,./sample_data/BHSig260_Dataset_Bengali/seed_12...
1,BHSig260,Bengali,123,person_41,true,14,./sample_data/BHSig260_Dataset_Bengali/seed_12...
2,BHSig260,Bengali,123,person_41,true,7,./sample_data/BHSig260_Dataset_Bengali/seed_12...
3,BHSig260,Bengali,123,person_41,true,12,./sample_data/BHSig260_Dataset_Bengali/seed_12...
4,BHSig260,Bengali,123,person_41,true,17,./sample_data/BHSig260_Dataset_Bengali/seed_12...
...,...,...,...,...,...,...,...
139,BHSig260,Bengali,123,person_70,forge,3,./sample_data/BHSig260_Dataset_Bengali/seed_12...
140,BHSig260,Bengali,123,person_70,forge,29,./sample_data/BHSig260_Dataset_Bengali/seed_12...
141,BHSig260,Bengali,123,person_70,forge,13,./sample_data/BHSig260_Dataset_Bengali/seed_12...
142,BHSig260,Bengali,123,person_70,forge,17,./sample_data/BHSig260_Dataset_Bengali/seed_12...


In [30]:
def preprocess_image(image_path):
    """
    Load one image from disk and push it through the preprocessing chain.

    The image is read with OpenCV, converted from BGR to RGB when its last
    axis has length 3, and then handed through the ``img_utils`` helpers in
    this order: grayscale conversion, binarization, skeletonization,
    augmentation and EfficientNet input preparation.

    :param image_path: Location of the image file, passed to ``cv2.imread``.
    :return: The value produced by ``img_utils.preprocess_for_efficientnet``.
    :raises ValueError: If ``cv2.imread`` returns ``None`` for the path.
    """
    raw = cv2.imread(image_path)
    if raw is None:
        raise ValueError(f"Failed to load image: {image_path}")

    # Swap the channel order only when the array actually carries 3 channels.
    has_three_channels = raw.shape[-1] == 3
    current = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB) if has_three_channels else raw

    # Each stage consumes the previous stage's output.
    stages = (
        img_utils.rgb_to_grey,
        img_utils.grey_to_binary,
        img_utils.skeletonize_image,
        img_utils.augment_image,
        img_utils.preprocess_for_efficientnet,
    )
    for stage in stages:
        current = stage(current)

    return current

def preprocess_data(df, preprocess_fn=None):
    """
    Preprocess every image listed in a sample-manifest DataFrame.

    :param df: DataFrame with the columns 'Image File' (image path; the
               './sample_data/' part is rewritten to
               '../data-sampling/sample_data/'), 'Class' ('true' for a
               genuine signature, 'forge' for a forged one) and
               'Person ID/Name'.
    :param preprocess_fn: Optional callable that maps an image path to a
               preprocessed image. Defaults to ``preprocess_image``.
    :return: DataFrame with the columns 'person_id', 'image' (one
             preprocessed image per row) and 'label' (0 = genuine,
             1 = forged), in the same row order as ``df``.
    :raises KeyError: If a required column is missing or a 'Class' value is
             neither 'true' nor 'forge'.
    """
    if preprocess_fn is None:
        preprocess_fn = preprocess_image

    label_mapping = {'true': 0, 'forge': 1}  # 'true' = genuine, 'forge' = forged

    person_ids, images, labels = [], [], []
    columns = zip(df['Image File'], df['Class'], df['Person ID/Name'])
    for raw_path, raw_class, person_id in columns:
        # pandas.read_csv parses a column made up only of 'true' values as
        # booleans, so normalise to a lower-case string before the lookup.
        class_name = str(raw_class).strip().lower()
        if class_name not in label_mapping:
            raise KeyError(
                f"Unknown class label {raw_class!r}; expected one of {sorted(label_mapping)}"
            )

        # Manifest paths are relative to the data-sampling directory; rewrite
        # them so they resolve from this notebook's working directory.
        image_path = raw_path.replace('./sample_data/', '../data-sampling/sample_data/')

        images.append(preprocess_fn(image_path))
        labels.append(label_mapping[class_name])
        person_ids.append(person_id)

    # Keep the images as a plain list of arrays: stacking them into one big
    # array first would only add a full copy and break on unequal shapes.
    return pd.DataFrame({'person_id': person_ids, 'image': images, 'label': labels})

In [31]:
project_name = 'EfficientNetb0'

# Artifacts for this model live in a directory named after it, relative to
# the notebook's working directory.
output_dir = Path(project_name)
output_dir.mkdir(parents=True, exist_ok=True)

# Run the preprocessing chain over every image in the manifest loaded earlier
# (df_csv); Path, pd and the manifest are already set up by the cells above.
preprocessed_df = preprocess_data(df_csv)

# Persist the result so later steps can reuse it without re-processing images.
preprocessed_pickle_path = output_dir / 'preprocessed_signature_df.pkl'
preprocessed_df.to_pickle(preprocessed_pickle_path)
print(f'Saved preprocessed_df to {preprocessed_pickle_path}')

Saved preprocessed_df to EfficientNetb0/preprocessed_signature_df.pkl


In [32]:
# Render the preprocessed frame: one row per image with its person and label.
preprocessed_df

Unnamed: 0,person_id,image,label
0,person_41,"[[[-2.0494049148043496, -1.965686274509804, -1...",0
1,person_41,"[[[-2.1007791762993406, -2.018207282913165, -1...",0
2,person_41,"[[[-2.1007791762993406, -2.018207282913165, -1...",0
3,person_41,"[[[-2.1179039301310043, -2.0357142857142856, -...",0
4,person_41,"[[[-2.032280160972686, -1.9481792717086837, -1...",0
...,...,...,...
139,person_70,"[[[-2.083654422467677, -2.000700280112045, -1....",1
140,person_70,"[[[-2.1179039301310043, -2.0357142857142856, -...",1
141,person_70,"[[[-2.032280160972686, -1.9481792717086837, -1...",1
142,person_70,"[[[-2.0665296686360133, -1.9831932773109244, -...",1


In [38]:
# Shape of the first preprocessed image (row label 0 of the 'image' column).
preprocessed_df.loc[0, 'image'].shape

(224, 224, 3)

In [40]:
import random

# Target file for the generated triplets (saving is an optional step); this
# path is interpolated into the status message printed at the end of the cell.
triplets_save_path = './preprocessed_triplets.npy'


def create_triplets(df, num_triplets, seed=None, verbose=True):
    """
    Build (anchor, positive, negative) image triplets.

    For every triplet one person is drawn uniformly at random from the
    persons that have at least two genuine images (label 0) and at least one
    forged image (label 1). The anchor and the positive are two distinct
    genuine images of that person; the negative is one of the same person's
    forged images. Persons that cannot supply a full triplet are never drawn,
    so the result holds exactly ``num_triplets`` entries whenever at least
    one person qualifies.

    :param df: DataFrame with columns 'person_id', 'image' and 'label'
               (0 = genuine, 1 = forged).
    :param num_triplets: Number of triplets to generate.
    :param seed: Optional seed passed to ``numpy.random.default_rng``; the
                 same seed and data give the same triplets. ``None`` draws
                 fresh entropy on every call.
    :param verbose: When true (the default), print the selected person and
                    the three image shapes for every triplet.
    :return: List of (anchor, positive, negative) tuples; empty if no person
             has enough genuine and forged images.
    """
    rng = np.random.default_rng(seed)

    # Split each person's images by label once instead of re-filtering the
    # whole frame on every iteration, and keep only persons that can supply
    # an anchor, a positive and a negative.
    pools = {}
    for person_id, group in df.groupby('person_id', sort=False):
        genuine = group.loc[group['label'] == 0, 'image'].tolist()
        forged = group.loc[group['label'] == 1, 'image'].tolist()
        if len(genuine) >= 2 and len(forged) >= 1:
            pools[person_id] = (genuine, forged)

    if not pools:
        return []

    eligible = list(pools)
    triplets = []
    for _ in range(num_triplets):
        person_id = eligible[rng.integers(len(eligible))]
        genuine, forged = pools[person_id]

        # replace=False guarantees the anchor and the positive are different images.
        anchor_idx, positive_idx = rng.choice(len(genuine), size=2, replace=False)
        negative_idx = rng.integers(len(forged))

        anchor = genuine[anchor_idx]
        positive = genuine[positive_idx]
        negative = forged[negative_idx]

        if verbose:
            print("person_id selected:", person_id)
            print("Anchor:",anchor.shape)
            print("Positive:",positive.shape)
            print("Negative:", negative.shape)

        triplets.append((anchor, positive, negative))

    return triplets

num_triplets = 1000

# Keep the generated triplets; previously the return value was discarded.
triplets = create_triplets(preprocessed_df, num_triplets)

# TODO: persist the triplets once a save utility is available, e.g.
# save_triplets(preprocessed_df, triplets_save_path)
# Until then, report what was actually done instead of claiming a save.
print(f'Created {len(triplets)} triplets (not yet saved to {triplets_save_path})')

person_id selected: person_70
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_41
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_36
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_36
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_36
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_41
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_41
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_36
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_41
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
person_id selected: person_41
Anchor: (224, 224, 3)
Positive: (224, 224, 3)
Negative: (224, 224, 3)
