## Project Overview: Similarity-Based Tamper Detection

This notebook builds a scoring system that quantifies the likelihood of image tampering using similarity metrics. It compares pairs of images by extracting a feature vector for each image with a pretrained CNN and computing the cosine similarity between the two vectors.

Key aspects:
- Extracts deep features from pretrained CNN models.
- Calculates the pairwise cosine similarity between all images in a directory and saves the resulting similarity matrix as a CSV file.

In [None]:
# Install required packages
!/usr/bin/env pip install torch torchvision

In [None]:
# Import libraries
import boto3
import os
import sagemaker
import random
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim
import zipfile
import shutil
import torchvision.models as models
import copy
import pandas as pd
import numpy as np
import math
from sagemaker.pytorch import PyTorch, PyTorchModel
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from torch.optim import Adam, SGD
from tqdm import tqdm
from random import choice
from itertools import product
from scipy.spatial.distance import cosine
from tqdm import tqdm

In [None]:
# Functions
def preprocess_image_similarity(img_path, model_name):
    """Load an image from disk and turn it into a normalized model input batch.

    The image is converted to RGB when it is in another mode, passed through
    ``Resize(256)`` and ``CenterCrop(224)``, converted to a tensor and
    normalized per channel with the mean/std values below (the ImageNet
    statistics used by torchvision's pretrained models).

    :param img_path: Path to the image file to load.
    :param model_name: Key into the module-level ``models_dict``. It is only
                       used to check that the requested model is supported;
                       the same preprocessing is applied for every model.
    :return: Tensor holding the preprocessed image with an extra leading
             batch dimension of size 1.
    :raises ValueError: If ``model_name`` is not a key of ``models_dict``.
    """
    # Explicit raise rather than `assert`: asserts are stripped under
    # `python -O`, which would silently disable this check.
    if model_name not in models_dict:
        raise ValueError(f"Unsupported model: {model_name!r}")

    preprocess = transforms.Compose([
        transforms.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Open the file in a context manager so the handle is released as soon as
    # the pixel data has been turned into a tensor. Without this, one file
    # handle per image stays open until garbage collection.
    with Image.open(img_path) as img:
        img_t = preprocess(img)

    # Add the batch dimension the models expect: (C, H, W) -> (1, C, H, W).
    return torch.unsqueeze(img_t, 0)
            
def extract_features(img_path, model_name):
    """Run one image through the selected model and return its output vector.

    :param img_path: Path to the image file.
    :param model_name: Key of the model to use in the module-level
                       ``models_dict``.
    :return: The model output for the single image, i.e. the first (and only)
             row of the batch output.
    """
    # Preprocess first so an unsupported model name is rejected before the
    # dictionary lookup below.
    batch = preprocess_image_similarity(img_path, model_name)
    network = models_dict[model_name]

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = network(batch)

    # The batch contains exactly one image; drop the batch dimension.
    return output[0]

def compare_images(feature_vec1, feature_vec2):
    """Return the cosine similarity of two feature vectors.

    :param feature_vec1: Feature vector of the first image.
    :param feature_vec2: Feature vector of the second image.
    :return: One minus the SciPy cosine distance between the two vectors.
    """
    # SciPy exposes the cosine *distance*; similarity is its complement.
    cosine_distance = cosine(feature_vec1, feature_vec2)
    return 1 - cosine_distance

def unzip_file(zip_path, extract_to=None):
    """
    Extract every member of a zip archive into a directory.

    :param zip_path: Location of the .zip archive to unpack.
    :param extract_to: Target directory for the extracted contents.
                       When None, the directory containing the archive is used.
    """
    # Fall back to the archive's own folder when no target was given.
    target_dir = os.path.dirname(zip_path) if extract_to is None else extract_to

    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(target_dir)
        print(f"Files extracted to {target_dir}")
        
def list_images(directory_path):
    """
    Lists all image files in a directory and its subdirectories.

    Files are matched case-insensitively on the extensions .png, .jpg and
    .jpeg.

    :param directory_path: Path to the directory to search for image files.
    :return: List of image file paths relative to ``directory_path``. Files
             directly inside ``directory_path`` are returned as their bare
             file name; files in subdirectories keep their subdirectory
             prefix, so ``os.path.join(directory_path, path)`` locates every
             returned entry.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    image_files = []
    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            if file.lower().endswith(image_extensions):
                # Keep the subdirectory part of the path. Returning only the
                # bare file name would make nested images impossible to
                # locate and would conflate same-named files in different
                # folders.
                full_path = os.path.join(root, file)
                image_files.append(os.path.relpath(full_path, directory_path))
    return image_files

### Similarity

In [None]:
# If images are in zip file, extract them.
zip_file_path = "/root/example-project/Test_Images/results.zip"  # Change this to the path of your zip file
# Where to extract the files. Defaults to the folder that contains the zip
# file; change this if you want to extract somewhere else.
destination_path = os.path.dirname(zip_file_path)

# Pass the destination explicitly so that editing `destination_path` above
# actually takes effect (it used to be defined but silently ignored).
unzip_file(zip_file_path, destination_path)

In [None]:
# Build the registry of supported pre-trained networks, keyed by model name.
models_dict = dict(
    resnet50=models.resnet50(pretrained=True),
    vgg16=models.vgg16(pretrained=True),
)

# Put every network into evaluation (inference) mode.
for model in models_dict.values():
    model.eval()

In [None]:
directory_path = "/root/example-project/Test_Images"

# List all image files directly inside the directory
image_files = [file for file in os.listdir(directory_path) if file.lower().endswith(('.png', '.jpg', '.jpeg'))]
#image_files = list_images(directory_path)

# Number of images
n = len(image_files)

# Square matrix of similarity scores, labelled with the file names on both
# axes. Every cell starts at 0; the diagonal (an image compared with itself)
# is never filled in below and therefore stays 0.
similarity_matrix = pd.DataFrame(np.zeros((n, n)), index=image_files, columns=image_files)

# Run every image through the model exactly once. Extracting the features
# inside the pairwise loop would repeat the (expensive) forward pass for the
# same image on every pair, i.e. roughly n^2 model evaluations instead of n.
image_features = [
    extract_features(os.path.join(directory_path, file), 'resnet50')
    for file in image_files
]

# Compare every ordered pair of distinct images using the cached features.
counter = 0
for i, features1 in enumerate(image_features):
    for j, features2 in enumerate(image_features):
        if i == j:
            continue
        similarity_score = compare_images(features1, features2)
        similarity_matrix.iloc[i, j] = similarity_score
        # Progress indicator: number of pairs compared so far.
        counter += 1
        print(counter)

similarity_matrix.to_csv("/root/example-project/similarity_matrix.csv")