<a href="https://colab.research.google.com/github/Fortune-Adekogbe/VisualPlagiarism/blob/main/code/MetaCLIP_CustomEmbeddingEvaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%cd /content/drive/MyDrive/Projects/VisualPlagiarism

/content/drive/MyDrive/Projects/VisualPlagiarism


In [None]:
!ls data

 BLIP	       jpeg	   'Plagiarised images'		   'Plagiarised Images UI - Sheet1.csv'
 CLIP	       MetaCLIP    'Plagiarised images 1'	    sigLIP
 CS_Detector   MobileCLIP  'Plagiarised Images UI.gsheet'


In [None]:
!pip install keras==2.15.0 tensorflow==2.15.0

Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m

### Setup

In [None]:
from tqdm.auto import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import h5py
import numpy as np
import os
import json
import time
from datetime import date

In [None]:
!ls data/CS_Detector/MetaCLIP

test_pairs.h5  train_pairs.h5  val_pairs.h5


### Load Dataset

In [None]:
split_dir = 'data/CS_Detector/MetaCLIP'

In [None]:
import keras
import h5py
import numpy as np

class HDF5PairDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of embedding pairs from an HDF5 file.

    Every top-level key of the file is treated as one sample: a group that
    holds two datasets (the two embeddings) and a label stored in the group's
    attributes.

    Args:
        file_path: Path of the HDF5 file to read.
        input1_name: Dataset name of the first embedding inside each group.
        input2_name: Dataset name of the second embedding inside each group.
        label_name: Attribute name of the label on each group.
        batch_size: Maximum number of pairs per batch.
        emb_size: Length of one embedding; batches are reshaped to
            ``(-1, emb_size)``.
        shuffle: Whether to shuffle the sample order at construction time and
            on every ``on_epoch_end`` call.
    """

    def __init__(self, file_path, input1_name='image1', input2_name='image2',
                 label_name='label', batch_size=32, emb_size=768, shuffle=True):
        # Initialise the Keras base class; newer Keras versions expect
        # subclasses to do this.
        super().__init__()
        # Defined before the file is opened so that close()/__del__ stay safe
        # even when h5py.File raises.
        self.hf = None
        self.file_path = file_path
        self.batch_size = batch_size
        self.input1_name = input1_name
        self.input2_name = input2_name
        self.label_name = label_name
        self.emb_size = emb_size
        self.shuffle = shuffle
        self.hf = h5py.File(self.file_path, 'r')
        self.items = list(self.hf.keys())
        self.num_samples = len(self.items)
        self.indexes = np.arange(self.num_samples)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(self.num_samples / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as ``((input1, input2), labels)``.

        ``input1`` and ``input2`` have shape ``(n, emb_size)`` and ``labels``
        is a float32 array of shape ``(n, 1)``.
        """
        start_idx = index * self.batch_size
        end_idx = min((index + 1) * self.batch_size, self.num_samples)
        batch_indexes = self.indexes[start_idx:end_idx]

        batch_input1 = []
        batch_input2 = []
        batch_labels = []

        for i in batch_indexes:
            group = self.hf[self.items[i]]
            batch_input1.append(np.array(group[self.input1_name]))
            batch_input2.append(np.array(group[self.input2_name]))
            batch_labels.append(group.attrs[self.label_name])

        batch_input1 = np.array(batch_input1).reshape(-1, self.emb_size)
        batch_input2 = np.array(batch_input2).reshape(-1, self.emb_size)
        batch_labels = np.array(batch_labels, dtype=np.float32).reshape(-1, 1)

        return (batch_input1, batch_input2), batch_labels

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def close(self):
        """Close the underlying HDF5 file; safe to call more than once."""
        hf = getattr(self, 'hf', None)
        if hf is not None:
            hf.close()
            self.hf = None

    def __del__(self):
        # The file handle may be missing if __init__ failed part-way through.
        self.close()

In [None]:
input1_name = 'image1'
input2_name = 'image2'
label_name = 'label'
batch_size = 32
emb_size = 1024

# Settings shared by all three splits, passed by keyword so they cannot be
# mixed up positionally.
generator_kwargs = dict(
    input1_name=input1_name,
    input2_name=input2_name,
    label_name=label_name,
    batch_size=batch_size,
    emb_size=emb_size,
)

# Only the training split is shuffled; validation and test keep a fixed order
# so that evaluation is deterministic.
train_generator = HDF5PairDataGenerator(
    f"{split_dir}/train_pairs.h5",
    shuffle=True,
    **generator_kwargs,
)

validation_generator = HDF5PairDataGenerator(
    f"{split_dir}/val_pairs.h5",
    shuffle=False,
    **generator_kwargs,
)

test_generator = HDF5PairDataGenerator(
    f"{split_dir}/test_pairs.h5",
    shuffle=False,
    **generator_kwargs,
)

### Simple Dense Model

In [None]:
# Define a simple model for demonstration: concatenate the two embeddings and
# classify the pair with a small dense head.
# `shape` must be a tuple; `(emb_size)` without the trailing comma is a plain int.
input1 = keras.layers.Input(shape=(emb_size,))
input2 = keras.layers.Input(shape=(emb_size,))
print(input1.shape, input2.shape)
combined = keras.layers.concatenate([input1, input2], axis=-1)
print(combined.shape)
dense = keras.layers.Dense(32, activation='relu')(combined)
print(dense.shape)

# Single sigmoid unit, trained with binary cross-entropy below
output = keras.layers.Dense(1, activation='sigmoid')(dense)
print(output.shape)
model = keras.models.Model(inputs=[input1, input2], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

(None, 1024) (None, 1024)
(None, 2048)
(None, 32)
(None, 1)


In [None]:
# Write the model to disk only when the validation loss improves.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=f"models/dense_{emb_size}_l_.keras",
    save_best_only=True,
    monitor="val_loss",
    mode="min",
    verbose=0,
)

# Fit on the training pairs for 10 epochs, scoring against the validation pairs
model.fit(
    x=train_generator,
    validation_data=validation_generator,
    epochs=10,
    callbacks=[checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c7e39bd5b70>

In [None]:
# Make sure the output directory exists, then save the model in its current state.
os.makedirs("models", exist_ok=True)
model.save(f"models/dense_{emb_size}_0.99.keras")

### Simple Dense Model with Dot Layer Output

In [None]:
import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate


# Input layers: one embedding vector per image of the pair
input1 = Input(shape=(emb_size,))
input2 = Input(shape=(emb_size,))

# Projection layers. Despite the variable names, the two Dense layers are NOT
# shared: each input goes through its own layer.
shared_dense1 = Dense(512, activation='relu')
encoded1 = shared_dense1(input1)

shared_dense2 = Dense(512, activation='relu')
encoded2 = shared_dense2(input2)

# Cosine similarity of the two projections (normalize=True L2-normalises both
# vectors before the dot product). Both projections are ReLU outputs, hence
# non-negative, so the similarity lies in [0, 1] and can be fed to binary
# cross-entropy directly.
cosine_similarity = keras.layers.Dot(axes=-1, normalize=True)([encoded1, encoded2])
print(cosine_similarity.shape)

# Model
model = Model(inputs=[input1, input2], outputs=cosine_similarity)

# Pass the configured optimizer object; the string 'adam' would build a default
# Adam and silently ignore the learning rate chosen here.
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

(None, 1)


In [None]:
# Write the model to disk only when the validation loss improves.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=f"models/dense_siamese_{emb_size}_l_.keras",
    save_best_only=True,
    monitor="val_loss",
    mode="min",
    verbose=0,
)

# Fit on the training pairs for 10 epochs, scoring against the validation pairs
model.fit(
    x=train_generator,
    validation_data=validation_generator,
    epochs=10,
    callbacks=[checkpoint],
)

Epoch 1/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 61ms/step - accuracy: 0.9312 - loss: 0.2009 - val_accuracy: 0.9712 - val_loss: 0.1315
Epoch 2/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 62ms/step - accuracy: 0.9781 - loss: 0.1205 - val_accuracy: 0.9743 - val_loss: 0.1246
Epoch 3/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 60ms/step - accuracy: 0.9822 - loss: 0.1102 - val_accuracy: 0.9760 - val_loss: 0.1200
Epoch 4/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 61ms/step - accuracy: 0.9852 - loss: 0.1025 - val_accuracy: 0.9763 - val_loss: 0.1197
Epoch 5/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 71ms/step - accuracy: 0.9869 - loss: 0.0981 - val_accuracy: 0.9757 - val_loss: 0.1189
Epoch 6/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 62ms/step - accuracy: 0.9880 - loss: 0.0939 - val_accuracy: 0.9755 - val_loss: 0.119

<keras.src.callbacks.history.History at 0x7da9647caf80>

In [None]:
# Continue training the current `model` for up to 30 more epochs, reusing the
# `checkpoint` callback object created in the previous cell.
model.fit(
    train_generator,
    epochs=30,
    validation_data=validation_generator,
    callbacks=[checkpoint]
)

Epoch 1/30
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 71ms/step - accuracy: 0.9918 - loss: 0.0806 - val_accuracy: 0.9712 - val_loss: 0.1286
Epoch 2/30
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 60ms/step - accuracy: 0.9915 - loss: 0.0786 - val_accuracy: 0.9694 - val_loss: 0.1323
Epoch 3/30
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 61ms/step - accuracy: 0.9922 - loss: 0.0779 - val_accuracy: 0.9711 - val_loss: 0.1335
Epoch 4/30
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 61ms/step - accuracy: 0.9921 - loss: 0.0761 - val_accuracy: 0.9680 - val_loss: 0.1358
Epoch 5/30
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 72ms/step - accuracy: 0.9925 - loss: 0.0745 - val_accuracy: 0.9663 - val_loss: 0.1371
Epoch 6/30
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 72ms/step - accuracy: 0.9924 - loss: 0.0731 - val_accuracy: 0.9653 - val_loss: 0.140

KeyboardInterrupt: 

In [None]:
os.makedirs("models", exist_ok=True)
# Use a file name specific to this architecture: the simple dense model is saved
# as models/dense_{emb_size}_0.99.keras, and reusing that name here would
# overwrite it.
model.save(f"models/dense_siamese_{emb_size}_final.keras")

### MLP

In [None]:
import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate


# Input layers: one embedding vector per image of the pair.
# `shape` must be a tuple; `(emb_size)` without the trailing comma is a plain int.
input1 = Input(shape=(emb_size,))
input2 = Input(shape=(emb_size,))

# Shared layer: the same Dense instance (same weights) encodes both inputs
shared_dense1 = Dense(64, activation='relu')
encoded1 = shared_dense1(input1)
encoded2 = shared_dense1(input2)

# Concatenate
merged = concatenate([encoded1, encoded2], axis=-1)

# Similarity calculation layers
dense1 = Dense(32, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(dense1)
print(output.shape)
# Model
model = Model(inputs=[input1, input2], outputs=output)

# Pass the configured optimizer object; the string 'adam' would build a default
# Adam and silently ignore the learning rate chosen here.
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

(None, 1)


In [None]:
# Write the model to disk only when the validation loss improves.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=f"models/dense_{emb_size}_1shared_l_.keras",
    save_best_only=True,
    monitor="val_loss",
    mode="min",
    verbose=0,
)

# Stop early, watching the validation loss with a patience of 7 epochs.
early_stopping = keras.callbacks.EarlyStopping(
    patience=7,
    monitor="val_loss",
)

# Training
model.fit(
    x=train_generator,
    validation_data=validation_generator,
    epochs=30,
    callbacks=[checkpoint, early_stopping],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


<keras.src.callbacks.History at 0x7c7e2602c8b0>

In [None]:
!ls models

conv1d_768_2shared_0.9857.keras  dense_768_0.99_l_0.0247.keras		dense_768_l_0.0235.keras
dense_1024_1shared_l_.keras	 dense_768_1shared_0.9953_0.0248.keras	dense_768_l_.keras
dense_1024_l_0.0249.keras	 dense_768_1shared_l_0.0208.keras


## Evaluate

In [None]:
!ls models

conv1d_768_2shared_0.9857.keras  dense_768_0.99_l_0.0247.keras		dense_768_l_0.0235.keras
dense_1024_l_0.0249.keras	 dense_768_1shared_0.9953_0.0248.keras	dense_768_l_.keras
dense_1024_l_.keras		 dense_768_1shared_l_0.0208.keras	dense_siamese_1024_l_.keras


In [None]:
import keras
emb_size = 1024

In [None]:
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
model = keras.models.load_model("models/dense_1024_l_0.0249.keras")

In [None]:
# Score the loaded model on the test pairs.
# Requires `test_generator` from the "Load Dataset" section to exist in this kernel.
loss, accuracy = model.evaluate(test_generator)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

NameError: name 'test_generator' is not defined

In [None]:
import json

eval_embeddings = "data/MetaCLIP/eval_embeddings.json"

# Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(eval_embeddings, 'r', encoding='utf-8') as fp:
    embeddings = json.load(fp)

In [None]:
# Tabulate the loaded embeddings; later cells read the "IMAGE" and
# "MetaCLIP EMBEDDING" columns of this frame.
embedding_df = pd.DataFrame(embeddings)
embedding_df.head()

Unnamed: 0,IMAGE,SPLIT,MetaCLIP EMBEDDING
0,WhatsApp Image 2024-06-10 at 17.38.35_15ecdfe8...,1,"[[0.0644937456, -0.7185157537, -0.8260132074, ..."
1,WhatsApp Image 2024-06-10 at 17.38.35_15ecdfe8...,2,"[[0.0458599627, -0.123551935, -0.4900194407, 1..."
2,WhatsApp Image 2024-06-10 at 17.38.35_15ecdfe8...,3,"[[-0.0905783027, -0.3711664081, -0.407592386, ..."
3,IMG-20240613-WA0039.jpg,1,"[[0.0387269855, -0.0533350632, -0.4823995829, ..."
4,IMG-20240613-WA0039.jpg,2,"[[-0.2583229244, -0.239710927, -0.5733796954, ..."


In [None]:
from collections import defaultdict

# Number of highest-scoring rows kept for every query row.
TOP_K = 15

n_rows = embedding_df.shape[0]

# The candidate side of every comparison is the full set of embeddings, so it
# is built once instead of on every iteration.
imgs2 = np.array(list(embedding_df["MetaCLIP EMBEDDING"])).reshape(-1, emb_size)

# Maps a query image name to a list of (candidate image name, score) tuples.
# An image that appears in several rows accumulates the results of all of them.
res = defaultdict(list)
for row_pos in range(n_rows):
    row = embedding_df.iloc[row_pos, :]
    embedding = row["MetaCLIP EMBEDDING"]

    # Pair the query embedding with every candidate row.
    imgs1 = np.array([embedding] * n_rows).reshape(-1, emb_size)
    scores = model.predict([imgs1, imgs2], verbose=0).reshape((-1,))

    # Negate so that argsort returns the highest scores first.
    top_indices = np.argsort(-scores)[:TOP_K]

    # argsort yields row positions, so select positionally (.iloc) rather than
    # by index label (.loc); the two only agree for a default RangeIndex.
    similar = embedding_df["IMAGE"].iloc[top_indices]
    values = scores[top_indices]
    res[row["IMAGE"]].extend(list(zip(similar, values)))

In [None]:
# For every query image: average the scores of each candidate image, rank the
# candidates by that mean (highest first) and keep the first ten
# [image, mean score] pairs.
output = {}
for query_image, matches in res.items():
    match_df = pd.DataFrame(matches, columns=['Image', 'Value'])
    mean_scores = match_df.groupby('Image')['Value'].mean().reset_index()
    ranked = mean_scores.sort_values(by='Value', ascending=False)
    output[query_image] = ranked.values.tolist()[:10]

In [None]:
# Image pairs used to score the retrieval results; the next cell reads the
# 'Image 1' and 'Image 2' columns.
df = pd.read_csv("data/Plagiarised Images UI - Sheet1.csv")

In [None]:
# A pair counts as a hit when 'Image 2' is among the candidates retrieved for
# 'Image 1'.
pairs = df[['Image 1', 'Image 2']].values
hits = [
    img2 in {entry[0] for entry in output[img1]}
    for img1, img2 in pairs
]

correct = sum(hits)
total = len(hits)

correct, total

(17, 21)

In [None]:
# Hit rate computed from the counters above rather than from hard-coded numbers,
# so it stays correct when the results change.
correct / total

0.8095238095238095