<a href="https://colab.research.google.com/github/Fortune-Adekogbe/VisualPlagiarism/blob/main/code/CustomEmbeddingEvaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%cd /content/drive/MyDrive/Projects/VisualPlagiarism

/content/drive/MyDrive/Projects/VisualPlagiarism


In [None]:
!ls data

 BLIP	       jpeg	   'Plagiarised images'		   'Plagiarised Images UI - Sheet1.csv'
 CLIP	       MetaCLIP    'Plagiarised images 1'	    sigLIP
 CS_Detector   MobileCLIP  'Plagiarised Images UI.gsheet'


In [None]:
!pip install keras==2.15.0 tensorflow==2.15.0

Collecting keras==2.15.0
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31

### Setup

In [None]:
from tqdm.auto import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import h5py
import numpy as np
import os
import json
import time
from datetime import date

In [None]:
!ls data/CS_Detector/CLIP

test_pairs.h5  train_pairs.h5  val_pairs.h5


### Load Dataset

In [None]:
# Directory holding the train/val/test pair files (*.h5) that the data generators read.
split_dir = 'data/CS_Detector/CLIP'

In [None]:
import keras
import h5py
import numpy as np

class HDF5PairDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` serving batches of embedding pairs from an HDF5 file.

    Every top-level key of the file is treated as one sample: a group that
    holds two datasets (``input1_name`` / ``input2_name``) and carries the
    label as the group attribute ``label_name``.

    Args:
        file_path: Path of the HDF5 file to read.
        input1_name: Dataset name of the first embedding inside each group.
        input2_name: Dataset name of the second embedding inside each group.
        label_name: Attribute name of the label on each group.
        batch_size: Samples per batch; the last batch may be smaller.
        emb_size: Length every embedding is reshaped to.
        shuffle: Reshuffle the sample order at construction and after each epoch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, file_path, input1_name='image1', input2_name='image2',
                 label_name='label', batch_size=32, emb_size=768, shuffle=True):
        super().__init__()
        # Defined before anything can fail so close()/__del__ never hit a missing attribute.
        self.hf = None
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        self.file_path = file_path
        self.batch_size = batch_size
        self.input1_name = input1_name
        self.input2_name = input2_name
        self.label_name = label_name
        self.emb_size = emb_size
        self.shuffle = shuffle

        # The file stays open for the lifetime of the generator; see close().
        self.hf = h5py.File(self.file_path, 'r')
        self.items = list(self.hf.keys())
        self.num_samples = len(self.items)
        self.indexes = np.arange(self.num_samples)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last partial batch included)."""
        return int(np.ceil(self.num_samples / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as ``([input1, input2], labels)``.

        ``input1`` / ``input2`` have shape ``(n, emb_size)`` and ``labels`` has
        shape ``(n, 1)``, where ``n`` is the number of samples in the batch.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``.
        """
        if not 0 <= index < len(self):
            # Fail loudly instead of silently handing back an empty batch.
            raise IndexError(f"batch index {index} out of range for {len(self)} batches")

        start_idx = index * self.batch_size
        end_idx = min(start_idx + self.batch_size, self.num_samples)

        batch_input1 = []
        batch_input2 = []
        batch_labels = []
        for sample_pos in self.indexes[start_idx:end_idx]:
            group = self.hf[self.items[sample_pos]]
            batch_input1.append(np.array(group[self.input1_name]))
            batch_input2.append(np.array(group[self.input2_name]))
            batch_labels.append(group.attrs[self.label_name])

        batch_input1 = np.array(batch_input1).reshape(-1, self.emb_size)
        batch_input2 = np.array(batch_input2).reshape(-1, self.emb_size)
        batch_labels = np.array(batch_labels).reshape(-1, 1)

        return [batch_input1, batch_input2], batch_labels

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def close(self):
        """Close the underlying HDF5 file; safe to call more than once."""
        hf = getattr(self, 'hf', None)
        if hf is not None:
            try:
                hf.close()
            finally:
                self.hf = None

    def __del__(self):
        # A finalizer can run on a half-constructed object or during interpreter
        # shutdown; it must never raise.
        try:
            self.close()
        except Exception:
            pass

In [None]:
# Names of the two embedding datasets and of the label attribute inside each HDF5 group.
input1_name = 'image1'
input2_name = 'image2'
label_name = 'label'
batch_size = 32

# Training data is reshuffled every epoch (the generator's default).
train_generator = HDF5PairDataGenerator(
    f"{split_dir}/train_pairs.h5",
    input1_name,
    input2_name,
    label_name,
    batch_size,
)

# Validation and test data are served in a fixed order: shuffling adds nothing
# to evaluation and would make it impossible to line predictions up with labels.
validation_generator = HDF5PairDataGenerator(
    f"{split_dir}/val_pairs.h5",
    input1_name,
    input2_name,
    label_name,
    batch_size,
    shuffle=False,
)

test_generator = HDF5PairDataGenerator(
    f"{split_dir}/test_pairs.h5",
    input1_name,
    input2_name,
    label_name,
    batch_size,
    shuffle=False,
)

### Simple Dense Model

In [None]:
# Baseline pair classifier: concatenate both embeddings, one hidden layer, sigmoid output.
# `shape` has to be a tuple; `(768)` is just the int 768, so the trailing comma matters.
input1 = keras.layers.Input(shape=(768,))
input2 = keras.layers.Input(shape=(768,))
print(input1.shape, input2.shape)
combined = keras.layers.concatenate([input1, input2], axis=-1)
print(combined.shape)
dense = keras.layers.Dense(32, activation='relu')(combined)
print(dense.shape)

output = keras.layers.Dense(1, activation='sigmoid')(dense)
print(output.shape)
model = keras.models.Model(inputs=[input1, input2], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

(None, 768) (None, 768)
(None, 1536)
(None, 32)
(None, 1)


In [None]:
# Write a checkpoint only when the validation loss reaches a new minimum.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="models/dense_768_l_.keras",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=0,
)

# Fit for 10 epochs, validating on the validation generator after each one.
fit_options = dict(
    epochs=10,
    validation_data=validation_generator,
    callbacks=[checkpoint],
)
model.fit(train_generator, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f218c22bd60>

In [None]:
# Persist the model currently in memory; create the target folder first if needed.
os.makedirs(name="models", exist_ok=True)
model.save(filepath="models/dense_768_0.99.keras")

### MLP

In [None]:
import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate

embedding_size = 768

# Input layers. `shape` has to be a tuple; `(embedding_size)` is just an int.
input1 = Input(shape=(embedding_size,))
input2 = Input(shape=(embedding_size,))

# Shared layer: the same Dense instance encodes both inputs, so its weights are tied.
shared_dense1 = Dense(64, activation='relu')
encoded1 = shared_dense1(input1)
encoded2 = shared_dense1(input2)

# Concatenate the two encodings
merged = concatenate([encoded1, encoded2], axis=-1)

# Similarity calculation layers
dense1 = Dense(32, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(dense1)
print(output.shape)

# Model
model = Model(inputs=[input1, input2], outputs=output)

# Pass the configured optimizer object itself: compiling with the string 'adam'
# builds a default Adam and silently drops the learning rate chosen here.
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

(None, 1)


In [None]:
# Write a checkpoint only when the validation loss reaches a new minimum.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="models/dense_768_1shared_l_.keras",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=0,
)

# Fit for 30 epochs, validating on the validation generator after each one.
fit_options = dict(
    epochs=30,
    validation_data=validation_generator,
    callbacks=[checkpoint],
)
model.fit(train_generator, **fit_options)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x78d7681e4880>

In [None]:
!ls models

conv1d_768_2shared_0.9857.keras  dense_768_1shared_0.65.keras	 dense_768_l_.keras
dense_768_0.99.keras		 dense_768_1shared_0.9953.keras


In [None]:
# Reload the best-val-loss checkpoint of the shared-layer MLP trained in this section
# (the path given to its ModelCheckpoint), not the checkpoint of the earlier dense baseline.
model = keras.models.load_model('models/dense_768_1shared_l_.keras')

In [None]:
# Score the current model on the test split; the result unpacks as (loss, accuracy).
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Test Loss: 0.020800188183784485, Test Accuracy: 0.9949575066566467


### CNN

In [None]:
import numpy as np
import random
from keras.models import Model

In [None]:
# Define shared convolutional layers
def build_shared_conv_layers():
    """Build the convolutional encoder that is applied to both inputs of a pair.

    Returns:
        A ``keras.Sequential`` stack: Conv1D(64 filters, kernel size 3, ReLU)
        -> MaxPooling1D(pool size 2) -> Flatten.
    """
    encoder = keras.Sequential()
    encoder.add(keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
    encoder.add(keras.layers.MaxPooling1D(pool_size=2))
    # A second stage, Conv1D(filters=128, kernel_size=3, relu) + MaxPooling1D(pool_size=2),
    # is deliberately left out here (it was disabled in this experiment).
    encoder.add(keras.layers.Flatten())
    return encoder

# Two inputs, each declared as a length-768 sequence with a single channel.
# NOTE(review): the batch generator reshapes to (batch, 768); confirm that Keras
# adding the trailing channel axis on its own is the intended behaviour.
input1 = keras.layers.Input(shape=(768, 1))
input2 = keras.layers.Input(shape=(768, 1))

# A single encoder instance is called on both inputs, so the branches share weights.
shared_conv = build_shared_conv_layers()
conv1, conv2 = shared_conv(input1), shared_conv(input2)

# Join the flattened feature maps of the two branches.
combined = keras.layers.concatenate([conv1, conv2], axis=-1)

# Classifier head: Dense(256) -> Dense(128), both ReLU.
# BatchNormalization / Dropout(0.5) after each hidden layer are disabled in this run.
dense = combined
for units in (256, 128):
    dense = keras.layers.Dense(units, activation='relu')(dense)

output = keras.layers.Dense(1, activation='sigmoid')(dense)

model = keras.models.Model(inputs=[input1, input2], outputs=output)

# Adam with a learning rate that starts at 1e-3 and decays by 0.9 per 10000 steps.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=10000,
    decay_rate=0.9,
)
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the CNN pair classifier for 10 epochs; no checkpoint callback is attached here.
model.fit(
    x=train_generator,
    validation_data=validation_generator,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ab25b4b0850>

In [None]:
# Score the freshly trained CNN on the test split; the result unpacks as (loss, accuracy).
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Test Loss: 0.0763879045844078, Test Accuracy: 0.9857128858566284


In [None]:
# Persist the model currently in memory; create the target folder first if needed.
os.makedirs(name="models", exist_ok=True)
model.save(filepath="models/conv1d_768_2shared_0.9857.keras")

## Evaluate

In [None]:
!ls models

conv1d_768_2shared_0.9857.keras  dense_768_1shared_0.9953_0.0248.keras	dense_768_l_0.0235.keras
dense_768_0.99_l_0.0247.keras	 dense_768_1shared_l_0.0208.keras


In [None]:
import keras
# Model under evaluation: the saved dense checkpoint, reloaded from disk.
model = keras.models.load_model(filepath="models/dense_768_l_0.0235.keras")

In [None]:
# Sanity-check the reloaded model on the test split; the result unpacks as (loss, accuracy).
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Test Loss: 0.023502349853515625, Test Accuracy: 0.9940181970596313


In [None]:
import json

# Pre-computed CLIP embeddings of the evaluation images.
eval_embeddings = "data/CLIP/eval_embeddings.json"

# JSON is UTF-8 by specification; state it rather than rely on the platform default.
with open(eval_embeddings, 'r', encoding='utf-8') as fp:
    embeddings = json.load(fp)

In [None]:
# Tabular view of the loaded embeddings; show the first rows as a quick check.
embedding_df = pd.DataFrame(data=embeddings)
embedding_df.head(n=5)

Unnamed: 0,IMAGE,SPLIT,CLIP EMBEDDING
0,WhatsApp Image 2024-06-10 at 17.38.35_15ecdfe8...,1,"[[0.3953934908, -0.3986973464, -1.1613459587, ..."
1,WhatsApp Image 2024-06-10 at 17.38.35_15ecdfe8...,2,"[[0.4288555384, -0.3180504143, -0.0896697417, ..."
2,WhatsApp Image 2024-06-10 at 17.38.35_15ecdfe8...,3,"[[0.0469199754, 0.0471390374, 0.2625674009, 0...."
3,IMG-20240613-WA0039.jpg,1,"[[0.9292801023, 0.6872065067, 0.8969425559, -0..."
4,IMG-20240613-WA0039.jpg,2,"[[0.5844798684, 0.9285963178, 0.4280830622, 0...."


In [None]:
from collections import defaultdict

# Number of highest-scoring candidates kept for every query row.
TOP_K = 15

# Candidate side of every pair: one 768-d row per DataFrame row. It is identical
# for all queries, so it is built once instead of once per loop iteration.
imgs2 = np.array(list(embedding_df["CLIP EMBEDDING"])).reshape(-1, 768)
image_names = embedding_df["IMAGE"].to_numpy()
num_rows = embedding_df.shape[0]
if imgs2.shape[0] != num_rows:
    # Scores are mapped back to rows by position, so the two must line up.
    raise ValueError(
        f"expected one 768-d embedding per row, got {imgs2.shape[0]} vectors for {num_rows} rows"
    )

# query image name -> list of (candidate image name, predicted score)
res = defaultdict(list)
for query_pos in range(num_rows):
    # Pair the query embedding with every stored embedding (its own row included).
    imgs1 = np.tile(imgs2[query_pos], (num_rows, 1))
    scores = model.predict([imgs1, imgs2], verbose=0).reshape(-1)

    # argsort yields positions, so index positionally rather than by DataFrame label.
    top_indices = np.argsort(-scores)[:TOP_K]
    res[image_names[query_pos]].extend(
        zip(image_names[top_indices], scores[top_indices])
    )

In [None]:
# For every query image: average the scores per candidate image, rank the
# candidates by that mean (highest first) and keep the three best as [image, mean] pairs.
output = {}
for query_image, scored_matches in res.items():
    matches_df = pd.DataFrame(scored_matches, columns=['Image', 'Value'])
    mean_scores = matches_df.groupby('Image')['Value'].mean().reset_index()
    ranked = mean_scores.sort_values(by='Value', ascending=False)
    output[query_image] = ranked.head(3).values.tolist()

In [None]:


# Sheet of image pairs; its 'Image 1' / 'Image 2' columns are what gets scored.
df = pd.read_csv(filepath_or_buffer="data/Plagiarised Images UI - Sheet1.csv")

In [None]:
# A pair counts as a hit when 'Image 2' is among the candidates kept for 'Image 1'.
hits = [
    img2 in {entry[0] for entry in output[img1]}
    for img1, img2 in df[['Image 1', 'Image 2']].values
]

correct = sum(hits)
total = len(hits)

correct, total

(13, 21)

In [None]:
# Hit rate derived from the tally itself rather than typed in by hand,
# so it cannot drift away from the counts computed in the previous cell.
correct / total

0.8571428571428571