In [1]:
import torch 
import os
import requests 
import fitz
from tqdm import tqdm 
import numpy as np 

In [2]:
os.environ['CUDA_VISIBLE_DEVICES']='1'
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
device

'cuda'

In [4]:
pdf_path = 'visual para.pdf'

In [5]:
def text_formatter(text: str) -> str : 
    clean_txt = text.replace("\n"," ").strip()
    return clean_txt


#     return pages_and_texts
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    print(len(doc))
    n = len(doc)
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        if page_number <= n :
            text = page.get_text()  # get plain text encoded as UTF-8
            text = text_formatter(text)
            pages_and_texts.append({"page_number": page_number,  
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts


In [6]:
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:15]

14


14it [00:00, 195.30it/s]


[{'page_number': 0,
  'page_char_count': 3047,
  'page_word_count': 458,
  'page_sentence_count_raw': 24,
  'page_token_count': 761.75,
  'text': 'Image Watermarking Techniques are Brittle: Investigating Visual Paraphrasing for De-Watermarking AI-Generated Images Anonymous submission Forward Diffusion Process Text Caption Generator Black Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd White Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd E UNet . . . . D UNet Denoising Dewatermarked Image  Figure 1: Block diagram of the visual paraphrasing technique illustrating the dewatermarking process. The diagram includes a forward diffusion process for encoding and decoding images to generate visually paraphrased outputs. It features a White Box scenario, where access to prompts is available, allowing direct manipulation of the image using descriptive prompts. In contrast, the Black Box scenario does not have access to 

In [7]:
import random 
random.sample(pages_and_texts, k = 3)

[{'page_number': 4,
  'page_char_count': 2580,
  'page_word_count': 415,
  'page_sentence_count_raw': 22,
  'page_token_count': 645.0,
  'text': 'Tree Ring Stable Signature Figure 5: This figure shows the variation of CMMD (Jayasumana et al. 2024) and detectability of visual paraphrases with respect to strength and guidance scale. The images were watermarked using Tree Ring Watermarking (Wen et al. 2023) and Stable Signature (Fernandez et al. 2023). VS: We need justification of why we only benchmark these two and how they are representative of the broader class on watermarking methods. In lit review we mention SythiID and Zodiac but dont benchmark them? original image s=0.2 s=0.3 s=0.4 s=0.5 s=0.6 s=0.7 Prompt: Potrait of a Labrador in the style of Van Gogh Figure 6: Varying strength for content injection: The intensity of noise injected into the content is varied which impacts both the preservation of layout semantics and the fusion of prompt semantics. and consistency of digital imag

In [8]:
import pandas as pd 

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,3047,458,24,761.75,Image Watermarking Techniques are Brittle: Inv...
1,1,5037,714,33,1259.25,than ever. The findings of the latest (seventh...
2,2,5331,794,40,1332.75,demonstrating the vulnerability of existing wa...
3,3,4985,806,41,1246.25,Generated Image Watermarked Image Difference W...
4,4,2580,415,22,645.0,Tree Ring Stable Signature Figure 5: This figu...


In [9]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,14.0,14.0,14.0,14.0,14.0
mean,6.5,2713.14,414.79,26.21,678.29
std,4.18,2012.9,282.15,25.73,503.23
min,0.0,168.0,46.0,1.0,42.0
25%,3.25,730.75,162.5,5.5,182.69
50%,6.5,2640.0,410.5,23.0,660.0
75%,9.75,4741.5,680.5,38.25,1185.38
max,13.0,5741.0,806.0,96.0,1435.25


In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)
# In case you want to reduce the maximum length:
model.max_seq_length = 8192

queries = [
    "how much protein should a female eat",
    "summit define",
]
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
]

query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

scores = (query_embeddings @ document_embeddings.T) * 100
print(scores.tolist())

  from tqdm.autonotebook import tqdm, trange


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[[78.49691772460938, 17.042869567871094], [14.924493789672852, 75.37962341308594]]


In [None]:
# import torch
# import torch.nn.functional as F

# from torch import Tensor
# from transformers import AutoTokenizer, AutoModel


# def last_token_pool(last_hidden_states: Tensor,
#                  attention_mask: Tensor) -> Tensor:
#     left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
#     if left_padding:
#         return last_hidden_states[:, -1]
#     else:
#         sequence_lengths = attention_mask.sum(dim=1) - 1
#         batch_size = last_hidden_states.shape[0]
#         return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


# def get_detailed_instruct(task_description: str, query: str) -> str:
#     return f'Instruct: {task_description}\nQuery: {query}'


# # Each query must come with a one-sentence instruction that describes the task
# task = 'Given a web search query, retrieve relevant passages that answer the query'
# queries = [
#     get_detailed_instruct(task, 'how much protein should a female eat'),
#     get_detailed_instruct(task, 'summit define')
# ]
# # No need to add instruction for retrieval documents
# documents = [
#     "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
#     "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
# ]
# input_texts = queries + documents

# tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-Qwen2-1.5B-instruct', trust_remote_code=True)
# model = AutoModel.from_pretrained('Alibaba-NLP/gte-Qwen2-1.5B-instruct', trust_remote_code=True)

# max_length = 8192

# # Tokenize the input texts
# batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
# outputs = model(**batch_dict)
# embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# # normalize embeddings
# embeddings = F.normalize(embeddings, p=2, dim=1)
# scores = (embeddings[:2] @ embeddings[2:].T) * 100
# print(scores.tolist())


In [11]:
from spacy.lang.en import English 

nlp = English()
nlp.add_pipe('sentencizer')

doc = nlp("Hi, I saw you standing there. What were you doing?")

# assert(len(list(doc.sents))) == 3

s = list(doc.sents)

In [12]:
type(s)

list

In [13]:
type(s[0])

spacy.tokens.span.Span

In [14]:
import nltk

txt = "Hi, I saw you standing there. What were you doing?"
l = nltk.tokenize.sent_tokenize(txt, language='english')
l

['Hi, I saw you standing there.', 'What were you doing?']

In [18]:
type(l)

list

In [19]:
type(l[0])

str

In [20]:
pages_and_texts[6]

{'page_number': 6,
 'page_char_count': 2700,
 'page_word_count': 406,
 'page_sentence_count_raw': 16,
 'page_token_count': 675.0,
 'text': 'Watermarking Method Watermark Detection Rate (η) Pre Attack Post Attack Brightness Rotation JPEG Compression Gaussian Noise Visual Paraphrase COCO (Lin et al. 2015) DctDwdSVD - - - - - - HiDDen - - - - - - Stable Signature - - - - - - Tree Ring - - - - - - ZoDiac - - - - - - Gaussian Shading - - - - - - Table 1: Watermark Detection Rates (η) for Different Techniques and Attacks delicate equilibrium can be achieved by controlling the hy- perparameters of the image-to-image diffusion model. (a) Watermarked (b) Visual Paraphrase (c) Watermarked (d) Visual Paraphrase Figure 9: Comparison of watermarked images (a, c) with their visual paraphrased counterparts (b, d). Each colored box (red, yellow, and blue) represents a one-to-one comparison of the same region in the original and paraphrased images, highlighting how visual paraphrasing alters specific e

In [22]:
for item in tqdm(pages_and_texts) : 
    text = item['text']
    item["sentences"] = nltk.tokenize.sent_tokenize(text, language='english') 

    item['page_sentence_count_nltk'] = len(item['sentences'])

100%|██████████| 14/14 [00:00<00:00, 846.13it/s]


In [23]:
pages_and_texts[9]

{'page_number': 9,
 'page_char_count': 1715,
 'page_word_count': 250,
 'page_sentence_count_raw': 49,
 'page_token_count': 428.75,
 'text': 'Lopes, R. G.; Salimans, T.; Ho, J.; Fleet, D. J.; and Norouzi, M. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. arXiv:2205.11487. Song, J.; Meng, C.; and Ermon, S. 2022. Denoising Diffusion Implicit Models. arXiv:2010.02502. T.J. Thomson, P. D., Daniel Angus. 2020. 3.2 billion images and 720,000 hours of video are shared online daily. Can you sort real from fake? Wen, B.; and Aydore, S. 2019. ROMark: A Ro- bust Watermarking System Using Adversarial Training. arXiv:1910.01221. Wen, Y.; Kirchenbauer, J.; Geiping, J.; and Goldstein, T. 2023. Tree-Ring Watermarks: Fingerprints for Diffusion Images that are Invisible and Robust. arXiv:2305.20030. White-House. 2023. Blueprint for an AI Bill of Rights: Mak- ing Automated Systems Work For the American People. Yu, N.; Skripniuk, V.; Abdelnabi, S.; and Fritz, M. 2021.

In [24]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_nltk
0,0,3047,458,24,761.75,Image Watermarking Techniques are Brittle: Inv...,[Image Watermarking Techniques are Brittle: In...,23
1,1,5037,714,33,1259.25,than ever. The findings of the latest (seventh...,"[than ever., The findings of the latest (seven...",33
2,2,5331,794,40,1332.75,demonstrating the vulnerability of existing wa...,[demonstrating the vulnerability of existing w...,40
3,3,4985,806,41,1246.25,Generated Image Watermarked Image Difference W...,[Generated Image Watermarked Image Difference ...,42
4,4,2580,415,22,645.0,Tree Ring Stable Signature Figure 5: This figu...,[Tree Ring Stable Signature Figure 5: This fig...,23


In [25]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_nltk
count,14.0,14.0,14.0,14.0,14.0,14.0
mean,6.5,2713.14,414.79,26.21,678.29,23.79
std,4.18,2012.9,282.15,25.73,503.23,20.51
min,0.0,168.0,46.0,1.0,42.0,1.0
25%,3.25,730.75,162.5,5.5,182.69,5.5
50%,6.5,2640.0,410.5,23.0,660.0,23.0
75%,9.75,4741.5,680.5,38.25,1185.38,36.0
max,13.0,5741.0,806.0,96.0,1435.25,73.0


In [26]:
"""We will chunk sentences into groups of 5 """

'We will chunk sentences into groups of 5 '

In [27]:
chunk_size = 10

def chunking(input_list , chunk_size) :
    l = [input_list[i : i+ chunk_size] for i in range(0,len(input_list), chunk_size)]
    return l 

test = list(range(21))

chunking(test,chunk_size)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20]]

In [28]:
for item in tqdm(pages_and_texts) : 
    item["chunks"] = chunking(item['sentences'], chunk_size)
    item['num_chunks'] = len(item["chunks"])

100%|██████████| 14/14 [00:00<00:00, 84006.09it/s]


In [29]:
pages_and_texts[6]['chunks']

[['Watermarking Method Watermark Detection Rate (η) Pre Attack Post Attack Brightness Rotation JPEG Compression Gaussian Noise Visual Paraphrase COCO (Lin et al.',
  '2015) DctDwdSVD - - - - - - HiDDen - - - - - - Stable Signature - - - - - - Tree Ring - - - - - - ZoDiac - - - - - - Gaussian Shading - - - - - - Table 1: Watermark Detection Rates (η) for Different Techniques and Attacks delicate equilibrium can be achieved by controlling the hy- perparameters of the image-to-image diffusion model.',
  '(a) Watermarked (b) Visual Paraphrase (c) Watermarked (d) Visual Paraphrase Figure 9: Comparison of watermarked images (a, c) with their visual paraphrased counterparts (b, d).',
  'Each colored box (red, yellow, and blue) represents a one-to-one comparison of the same region in the original and paraphrased images, highlighting how visual paraphrasing alters specific elements within these regions, resulting in information loss.',
  'One crucial hyperparameter is the strength parameter, wh

In [30]:
pages_and_texts[6]['num_chunks']

2

In [31]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_nltk,chunks,num_chunks
0,0,3047,458,24,761.75,Image Watermarking Techniques are Brittle: Inv...,[Image Watermarking Techniques are Brittle: In...,23,[[Image Watermarking Techniques are Brittle: I...,3
1,1,5037,714,33,1259.25,than ever. The findings of the latest (seventh...,"[than ever., The findings of the latest (seven...",33,"[[than ever., The findings of the latest (seve...",4
2,2,5331,794,40,1332.75,demonstrating the vulnerability of existing wa...,[demonstrating the vulnerability of existing w...,40,[[demonstrating the vulnerability of existing ...,4
3,3,4985,806,41,1246.25,Generated Image Watermarked Image Difference W...,[Generated Image Watermarked Image Difference ...,42,[[Generated Image Watermarked Image Difference...,5
4,4,2580,415,22,645.0,Tree Ring Stable Signature Figure 5: This figu...,[Tree Ring Stable Signature Figure 5: This fig...,23,[[Tree Ring Stable Signature Figure 5: This fi...,3


In [32]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_nltk,num_chunks
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,6.5,2713.14,414.79,26.21,678.29,23.79,2.93
std,4.18,2012.9,282.15,25.73,503.23,20.51,2.02
min,0.0,168.0,46.0,1.0,42.0,1.0,1.0
25%,3.25,730.75,162.5,5.5,182.69,5.5,1.0
50%,6.5,2640.0,410.5,23.0,660.0,23.0,3.0
75%,9.75,4741.5,680.5,38.25,1185.38,36.0,4.0
max,13.0,5741.0,806.0,96.0,1435.25,73.0,8.0


In [33]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for chunk in item["chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["chunks"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 14/14 [00:00<00:00, 6733.97it/s]


41

In [34]:
pages_and_texts[6]['chunks']

[['Watermarking Method Watermark Detection Rate (η) Pre Attack Post Attack Brightness Rotation JPEG Compression Gaussian Noise Visual Paraphrase COCO (Lin et al.',
  '2015) DctDwdSVD - - - - - - HiDDen - - - - - - Stable Signature - - - - - - Tree Ring - - - - - - ZoDiac - - - - - - Gaussian Shading - - - - - - Table 1: Watermark Detection Rates (η) for Different Techniques and Attacks delicate equilibrium can be achieved by controlling the hy- perparameters of the image-to-image diffusion model.',
  '(a) Watermarked (b) Visual Paraphrase (c) Watermarked (d) Visual Paraphrase Figure 9: Comparison of watermarked images (a, c) with their visual paraphrased counterparts (b, d).',
  'Each colored box (red, yellow, and blue) represents a one-to-one comparison of the same region in the original and paraphrased images, highlighting how visual paraphrasing alters specific elements within these regions, resulting in information loss.',
  'One crucial hyperparameter is the strength parameter, wh

In [35]:
pages_and_chunks[6]

{'page_number': 1,
 'chunks': 'Visual paraphrasing is not yet a widely recognized sub-discipline. However, this paper demonstrates how visual paraphrasing can be accomplished using state-of-the-art text-to-image generation systems. This paper presents a critical assessment, empirically',
 'chunk_char_count': 255,
 'chunk_word_count': 31,
 'chunk_token_count': 63.75}

In [36]:
pages_and_texts[6]['num_chunks']

2

In [37]:
chunk_dict

{'page_number': 13,
 'chunks': 'Watermarked Brightness Rotation JPEG Compression Gaussian Noise Visual Paraphrase (Ours) η = 1 η = 0.989 η = 0.841 η = 0.624 η = 0.671 η = 0.263 η = 1 η = 0.991 η = 0.813 η = 0.611 η = 0.633 η = 0.334 η = 1 η = 0.984 η = 0.837 η = 0.656 η = 0.603 η = 0.297 η = 1 η = 0.994 η = 0.784 η = 0.609 η = 0.579 η = 0.273 η = 1 η = 0.997 η = 0.759 η = 0.702 η = 0.682 η = 0.311 Table 4: Placeholder, these are stable signature examples.need to put tree ring, ZoDiac, Gaussian Shading, dwtdctsvd, HiDDen examples. The figure shows watermarked images, images under various attacks, and our visual paraphrase method. The attacks include Brightness adjustment, Rotation, JPEG Compression, and Gaussian Noise, along with our Visual Paraphrase method.η comparisons, representing watermark detection score (bit accuracy), are also provided.',
 'chunk_char_count': 824,
 'chunk_word_count': 159,
 'chunk_token_count': 206.0}

In [38]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,41.0,41.0,41.0,41.0
mean,5.34,924.1,139.95,231.02
std,3.57,527.92,76.22,131.98
min,0.0,168.0,31.0,42.0
25%,2.0,481.0,65.0,120.25
50%,5.0,908.0,155.0,227.0
75%,8.0,1309.0,194.0,327.25
max,13.0,2223.0,295.0,555.75


In [39]:
random.sample(pages_and_chunks, k = 1 )

[{'page_number': 3,
  'chunks': '\x121 2 \x13n + ⌊n−nτ⌋ X i=1  n i !\x121 2 \x13n (2) Visual Paraphrasing In the realm of AI-generated image detection, visual para- phrasing is a crucial method for confirming the authenticity',
  'chunk_char_count': 180,
  'chunk_word_count': 34,
  'chunk_token_count': 45.0}]

Filtering very short chunks, they may not contain much info 

In [40]:
"""Removing very short chunks """

'Removing very short chunks '

In [42]:
min_token_len = 20

In [43]:
df[df["chunk_token_count"] <= min_token_len]["chunks"]

Series([], Name: chunks, dtype: object)

In [44]:
for row in df[df["chunk_token_count"] <= min_token_len].sample(2).iterrows(): 
    print(f'CHunk token count : {row[1]["chunk_token_count"]} | text : {row[1]["chunks"]}')


ValueError: a must be greater than 0 unless no samples are taken

In [45]:
pages_and_chunks_over_threshold = df[df["chunk_token_count"] > min_token_len].to_dict(orient="records")
pages_and_chunks_over_threshold[:2]

[{'page_number': 0,
  'chunks': 'Image Watermarking Techniques are Brittle: Investigating Visual Paraphrasing for De-Watermarking AI-Generated Images Anonymous submission Forward Diffusion Process Text Caption Generator Black Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd White Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd E UNet .... D UNet Denoising Dewatermarked Image Figure 1: Block diagram of the visual paraphrasing technique illustrating the dewatermarking process. The diagram includes a forward diffusion process for encoding and decoding images to generate visually paraphrased outputs. It features a White Box scenario, where access to prompts is available, allowing direct manipulation of the image using descriptive prompts. In contrast, the Black Box scenario does not have access to prompts, relying on a caption generator (Kosmos 2)(Peng et al.2023) to interpret and paraphrase the image context indi

In [46]:
random.sample(pages_and_chunks_over_threshold, k =2)

[{'page_number': 8,
  'chunks': 'High-Resolution Image Synthesis With Latent Diffusion Models. In Proceedings of the IEEE/CVF Confer- ence on Computer Vision and Pattern Recognition (CVPR), 10684–10695. Saharia, C.; Chan, W.; Saxena, S.; Li, L.; Whang, J.; Denton, E.; Ghasemipour, S. K. S.; Ayan, B. K.; Mahdavi, S. S.;',
  'chunk_char_count': 287,
  'chunk_word_count': 44,
  'chunk_token_count': 71.75},
 {'page_number': 7,
  'chunks': 'Watermarked Brightness Rotation JPEG Compression Gaussian Noise Visual Paraphrase (Ours) η = 1 η = 0.989 η = 0.841 η = 0.624 η = 0.671 η = 0.263 η = 1 η = 0.991 η = 0.813 η = 0.611 η = 0.633 η = 0.334 η = 1 η = 0.984 η = 0.837 η = 0.656 η = 0.603 η = 0.297 η = 1 η = 0.994 η = 0.784 η = 0.609 η = 0.579 η = 0.273 η = 1 η = 0.997 η = 0.759 η = 0.702 η = 0.682 η = 0.311 Table 2: The figure shows watermarked images, images under various attacks, and our visual paraphrase method. The attacks include Brightness adjustment, Rotation, JPEG Compression, and Gaussi

Embedding chunks 

In [47]:
""" 
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)
# In case you want to reduce the maximum length:
model.max_seq_length = 8192

queries = [
    "how much protein should a female eat",
    "summit define",
]
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
]

query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

scores = (query_embeddings @ document_embeddings.T) * 100
print(scores.tolist())


"""

' \nfrom sentence_transformers import SentenceTransformer\n\nmodel = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)\n# In case you want to reduce the maximum length:\nmodel.max_seq_length = 8192\n\nqueries = [\n    "how much protein should a female eat",\n    "summit define",\n]\ndocuments = [\n    "As a general guideline, the CDC\'s average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you\'ll need to increase that if you\'re expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",\n    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",\n]\n\nquery_embeddings = model.encode(queries, prompt_name="query")\ndocument_embeddings = model.encode(documents)\n\nscor

In [48]:
"""Testing """

test_sentences = ["Testing a local rag system.", " I hope this works.", "I am wasting too much time on this."]

test_embeddings = model.encode(test_sentences)
embeddings_dict = dict(zip(test_sentences, test_sentences))

In [49]:
# See the embeddings
for test_sentences, test_sentences in embeddings_dict.items():
    print("Sentence:", test_sentences)
    print("Embedding:", test_embeddings)
    print("Embedding size :", test_embeddings.shape)
    print("")

Sentence: Testing a local rag system.
Embedding: [[ 1.8763652e-02  2.7032044e-02  2.6048908e-02 ...  1.8244106e-02
  -4.5662776e-02 -2.0056074e-02]
 [-5.5436410e-02  5.1500294e-02  2.7566036e-02 ... -2.5363853e-02
  -1.6618188e-02 -3.0015926e-03]
 [-4.8850998e-02  1.0981756e-02  3.5240576e-02 ... -1.3871044e-02
  -9.4725488e-05 -1.8300353e-02]]
Embedding size : (3, 1536)

Sentence:  I hope this works.
Embedding: [[ 1.8763652e-02  2.7032044e-02  2.6048908e-02 ...  1.8244106e-02
  -4.5662776e-02 -2.0056074e-02]
 [-5.5436410e-02  5.1500294e-02  2.7566036e-02 ... -2.5363853e-02
  -1.6618188e-02 -3.0015926e-03]
 [-4.8850998e-02  1.0981756e-02  3.5240576e-02 ... -1.3871044e-02
  -9.4725488e-05 -1.8300353e-02]]
Embedding size : (3, 1536)

Sentence: I am wasting too much time on this.
Embedding: [[ 1.8763652e-02  2.7032044e-02  2.6048908e-02 ...  1.8244106e-02
  -4.5662776e-02 -2.0056074e-02]
 [-5.5436410e-02  5.1500294e-02  2.7566036e-02 ... -2.5363853e-02
  -1.6618188e-02 -3.0015926e-03]
 [-

In [50]:
test_embeddings[0].shape

(1536,)

In [51]:
text_chunks = [item["chunks"] for item in pages_and_chunks_over_threshold]
text_chunks[10]

'Watermarking is executed by subtly altering the latent repre- sentation in a manner imperceptible to human observation but discernible by a pretrained watermark extractor network. The essence of the Stable Signature technique revolves around refining the LDM decoder to yield images that manifest a predetermined signature when scrutinized by the watermark extractor network. This involves the minimization of a loss function that amalgamates the reconstruction loss and the watermark loss, wherein the former assesses the variance be- tween the generated image and the target image, and the latter quantifies the discrepancy between the signature of the gener- ated image and the desired watermark signature. The balance between these two aspects is regulated by a hyperparameter denoted as λ. In essence, the Stable Signature approach operates as fol- lows: First, a watermark extractor network is trained to recog- nize a particular watermark within images. Subsequently, the LDM decoder is metic

In [52]:
len(text_chunks)

41

In [53]:
# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_threshold):
    item["embedding"] = model.encode(item["chunks"], batch_size=32, convert_to_numpy=True)

100%|██████████| 41/41 [00:02<00:00, 16.99it/s]


In [None]:
# text_chunk_embeddings = model.encode(text_chunks,
#                                                batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
#                                                convert_to_tensor=True) # optional to return embeddings as tensor instead of array

# text_chunk_embeddings

In [54]:
pages_and_chunks_over_threshold[0]

{'page_number': 0,
 'chunks': 'Image Watermarking Techniques are Brittle: Investigating Visual Paraphrasing for De-Watermarking AI-Generated Images Anonymous submission Forward Diffusion Process Text Caption Generator Black Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd White Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd E UNet .... D UNet Denoising Dewatermarked Image Figure 1: Block diagram of the visual paraphrasing technique illustrating the dewatermarking process. The diagram includes a forward diffusion process for encoding and decoding images to generate visually paraphrased outputs. It features a White Box scenario, where access to prompts is available, allowing direct manipulation of the image using descriptive prompts. In contrast, the Black Box scenario does not have access to prompts, relying on a caption generator (Kosmos 2)(Peng et al.2023) to interpret and paraphrase the image context indire

In [55]:
# text_chunk_embeddings.shape

In [57]:
text_chunk_embeddings_df = pd.DataFrame(pages_and_chunks_over_threshold)
embedding_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunk_embeddings_df.to_csv(embedding_df_save_path, index=False, escapechar='\\')


In [58]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embedding_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,chunks,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Image Watermarking Techniques are Brittle: Inv...,1081,158,270.25,[ 0.00183125 0.04405575 0.01164224 ... 0.00...
1,0,"2022a), Midjourney (Holz 2022), Imagen (Sahari...",1474,218,368.5,[ 0.02201897 0.04330228 0.01203674 ... 0.00...
2,0,The central concern noted in the letter (Marcu...,484,76,121.0,[-0.01184466 0.04079079 -0.00218739 ... -0.00...
3,1,than ever. The findings of the latest (seventh...,1696,245,424.0,[ 0.03676558 0.06140035 0.01088286 ... 0.00...
4,1,The methodology aims to en- hance the durabili...,1744,247,436.0,[ 0.02832914 0.03568636 0.00855604 ... 0.01...


In [59]:
# Assuming pages_and_chunks_over_threshold is a list of dictionaries and 'embedding' is one of the keys
text_chunk_embeddings_df = pd.DataFrame(pages_and_chunks_over_threshold)

# Save the entire DataFrame including embeddings using pickle
text_chunk_embeddings_df.to_pickle("text_chunks_and_embeddings.pkl")

In [60]:
# Load the entire DataFrame including embeddings using pickle
text_chunks_and_embedding_df = pd.read_pickle("text_chunks_and_embeddings.pkl")

# Convert embeddings to torch tensor and send to device
embeddings_tensor = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
print(embeddings_tensor.shape)

torch.Size([41, 1536])


RAG search and answer 

In [61]:
import pandas as pd
import numpy as np
import torch

# Load DataFrame from pickle file
text_chunks_and_embedding_df = pd.read_pickle("text_chunks_and_embeddings.pkl")

# Example: Convert back to torch tensor assuming 'embedding' is a key containing numpy arrays
embeddings_tensor = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32)

# Prepare a similar DataFrame for the loaded data
loaded_df = pd.DataFrame(text_chunks_and_embedding_df)

# Ensure 'embedding' column remains as numpy arrays
loaded_df["embedding"] = loaded_df["embedding"].apply(lambda x: np.array(x))

# Now you have a DataFrame 'loaded_df' which should be structurally similar to 'text_chunk_embeddings_df'


In [62]:
embeddings_tensor[0].shape

torch.Size([1536])

In [63]:
embeddings_tensor.shape

torch.Size([41, 1536])

In [64]:
loaded_df.head()

Unnamed: 0,page_number,chunks,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Image Watermarking Techniques are Brittle: Inv...,1081,158,270.25,"[0.0018312462, 0.044055752, 0.011642245, 0.028..."
1,0,"2022a), Midjourney (Holz 2022), Imagen (Sahari...",1474,218,368.5,"[0.022018967, 0.043302275, 0.012036744, 0.0303..."
2,0,The central concern noted in the letter (Marcu...,484,76,121.0,"[-0.01184466, 0.040790785, -0.0021873878, 0.02..."
3,1,than ever. The findings of the latest (seventh...,1696,245,424.0,"[0.03676558, 0.06140035, 0.010882864, 0.021724..."
4,1,The methodology aims to en- hance the durabili...,1744,247,436.0,"[0.028329136, 0.035686363, 0.008556045, 0.0272..."


In [65]:
# # import random

# # import torch
# import numpy as np 
# import pandas as pd

# # device = "cuda" if torch.cuda.is_available() else "cpu"

# # Import texts and embedding df
# text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# # Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
# # text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# # Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
# # text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep="  "))

# # embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].to_list(), axis=0))


# embeddings_from_df = text_chunk_embeddings_df["embedding"].tolist()
# # embeddings_from_df

# embeddings = torch.tensor(embeddings_from_df, dtype=torch.float32).to(device)

# # # Convert texts and embedding df to list of dicts
# # pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")



In [66]:
# import numpy as np
# import pandas as pd
# import ast
# import torch

# # device = "cuda" if torch.cuda.is_available() else "cpu"

# # Import texts and embedding df
# text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# def safe_literal_eval(val):
#     try:
#         return np.array(ast.literal_eval(val))
#     except (SyntaxError, ValueError):
#         # Handle cases where the string might not be properly formatted
#         return np.array([])

# # Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
# text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(safe_literal_eval)

# # Check for and remove any empty arrays resulting from parsing errors
# text_chunks_and_embedding_df = text_chunks_and_embedding_df[text_chunks_and_embedding_df["embedding"].apply(len) > 0]

# # Stack the numpy arrays into a single numpy array
# embeddings = np.stack(text_chunks_and_embedding_df["embedding"].to_list(), axis=0)

# # Convert the numpy array to a torch tensor
# embeddings = torch.tensor(embeddings, dtype=torch.float32).to(device)

# # # Convert texts and embedding df to list of dicts
# # pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")


In [67]:
# embeddings 

In [68]:
# embeddings = (embeddings_from_df).to(device)

# Convert texts and embedding df to list of dicts
# pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")



In [69]:
loaded_df

Unnamed: 0,page_number,chunks,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Image Watermarking Techniques are Brittle: Inv...,1081,158,270.25,"[0.0018312462, 0.044055752, 0.011642245, 0.028..."
1,0,"2022a), Midjourney (Holz 2022), Imagen (Sahari...",1474,218,368.5,"[0.022018967, 0.043302275, 0.012036744, 0.0303..."
2,0,The central concern noted in the letter (Marcu...,484,76,121.0,"[-0.01184466, 0.040790785, -0.0021873878, 0.02..."
3,1,than ever. The findings of the latest (seventh...,1696,245,424.0,"[0.03676558, 0.06140035, 0.010882864, 0.021724..."
4,1,The methodology aims to en- hance the durabili...,1744,247,436.0,"[0.028329136, 0.035686363, 0.008556045, 0.0272..."
5,1,This exemplifies the growing challenge of rely...,1335,187,333.75,"[0.028052146, 0.053355083, 0.021795772, 0.0127..."
6,1,Visual paraphrasing is not yet a widely recogn...,255,31,63.75,"[0.008295866, 0.027954832, 0.029583707, 0.0046..."
7,2,demonstrating the vulnerability of existing wa...,1523,210,380.75,"[0.029363248, 0.04905099, 0.016219674, 0.01408..."
8,2,The watermarked image is obtained via inverse ...,1070,169,267.5,"[0.020887714, 0.061208963, 0.016250888, 0.0189..."
9,2,By comparing the L1 distance between the inver...,1223,185,305.75,"[0.008077232, 0.04057452, 0.006709632, 0.02762..."


In [70]:
embeddings_tensor

tensor([[ 0.0018,  0.0441,  0.0116,  ...,  0.0083,  0.0151, -0.0108],
        [ 0.0220,  0.0433,  0.0120,  ...,  0.0093,  0.0231, -0.0173],
        [-0.0118,  0.0408, -0.0022,  ..., -0.0061,  0.0167,  0.0045],
        ...,
        [ 0.0301,  0.0104,  0.0320,  ...,  0.0012,  0.0062, -0.0291],
        [-0.0005,  0.0096,  0.0018,  ..., -0.0031,  0.0428, -0.0017],
        [ 0.0151,  0.0315,  0.0103,  ...,  0.0121,  0.0380, -0.0005]])

In [71]:
embeddings_tensor.shape

torch.Size([41, 1536])

In [72]:
loaded_df.head()

Unnamed: 0,page_number,chunks,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Image Watermarking Techniques are Brittle: Inv...,1081,158,270.25,"[0.0018312462, 0.044055752, 0.011642245, 0.028..."
1,0,"2022a), Midjourney (Holz 2022), Imagen (Sahari...",1474,218,368.5,"[0.022018967, 0.043302275, 0.012036744, 0.0303..."
2,0,The central concern noted in the letter (Marcu...,484,76,121.0,"[-0.01184466, 0.040790785, -0.0021873878, 0.02..."
3,1,than ever. The findings of the latest (seventh...,1696,245,424.0,"[0.03676558, 0.06140035, 0.010882864, 0.021724..."
4,1,The methodology aims to en- hance the durabili...,1744,247,436.0,"[0.028329136, 0.035686363, 0.008556045, 0.0272..."


In [73]:
pages_and_chunks

[{'page_number': 0,
  'chunks': 'Image Watermarking Techniques are Brittle: Investigating Visual Paraphrasing for De-Watermarking AI-Generated Images Anonymous submission Forward Diffusion Process Text Caption Generator Black Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd White Box An image of Pope Francis smiling and wearing a white jacket, surrounded by a crowd E UNet .... D UNet Denoising Dewatermarked Image Figure 1: Block diagram of the visual paraphrasing technique illustrating the dewatermarking process. The diagram includes a forward diffusion process for encoding and decoding images to generate visually paraphrased outputs. It features a White Box scenario, where access to prompts is available, allowing direct manipulation of the image using descriptive prompts. In contrast, the Black Box scenario does not have access to prompts, relying on a caption generator (Kosmos 2)(Peng et al.2023) to interpret and paraphrase the image context indi

In [74]:
# text_chunks_and_embedding_df["embedding"]

Query embedding and stored embedding matching

In [75]:
from sentence_transformers import util 

In [76]:
embeddings_tensor = embeddings_tensor.to(device)

In [77]:
query = "ZoDiac Watermarking"
print(f"query : {query}")

query_embeddings = model.encode(query, convert_to_tensor=True ).to(device)

dot_scores = util.dot_score(a= query_embeddings, b=embeddings_tensor)[0]

query : ZoDiac Watermarking


In [80]:
top_k_dot_results = torch.topk(dot_scores,k=5)

In [81]:
top_k_dot_results

torch.return_types.topk(
values=tensor([0.6789, 0.6488, 0.6200, 0.5694, 0.5684], device='cuda:0'),
indices=tensor([12, 11, 37, 40, 34], device='cuda:0'))

In [82]:
pages_and_chunks[11]

{'page_number': 3,
 'chunks': 'Generated Image Watermarked Image Difference Watermarked Image Visual Paraphrased Difference Prompt: A portrait of a Victorian family, painted in the style of John Singer Sargent. Figure 4: The figure demonstrates the effects of watermark embedding and visual paraphrasing on images. The first row presents the individual pixel-wise differences between the original generated image and the watermarked image, high- lighting the specific pixels modified by the watermarking process. The second row illustrates the individual pixel-wise differences between the watermarked image and its visually paraphrased version, indicating the pixels initially impacted by the watermark embedding that were subsequently altered through visual paraphrasing. ZoDiac Watermarking ZoDiac(Zhang et al.2024) is a zero-shot watermarking technique that leverages pre-trained diffusion models to embed watermarks into images while maintaining visual similarity between the watermarked and ori

In [83]:
query_embeddings.dtype

torch.float32

In [84]:
# query_embeddings

In [85]:
# embeddings[0].dtype

In [86]:
import textwrap

def print_wrapped(text,wrap_length=80) : 
    wrapped_text = textwrap.fill(text,wrap_length)
    print(wrapped_text)

In [87]:
query = "Tree-Ring Watermarking"
print(f"query : {query}")

for value, index in zip(top_k_dot_results[0], top_k_dot_results[1]): 
    print(f"Score: {value:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[index]["chunks"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[index]['page_number']}")
    print("\n")

query : Tree-Ring Watermarking
Score: 0.6789
Text:
Watermark Encoding: The latent vector ZT is trans- formed into its Fourier
space, where a concentric ring-like watermark is embedded. This watermark is
similar to the one used in tree-ring watermarking. To ensure that the final
watermarked image ˆx0 closely resembles the origi- nal image, ZoDiac iteratively
refines the latent vector ZT using a custom reconstruction loss. III. Adaptive
Image Enhancement: Once the watermarked image ˆx0 is generated, its visual
quality is enhanced by adaptively mixing it with the original image x0 to meet a
desired image quality threshold. Unlike tree-ring watermarking, ZoDiac can be
used to water- mark existing images. SynthID SynthID (Deepmind 2023) is a
toolkit from Google DeepMind that watermarks AI-generated content. It utilizes a
data-driven watermarking approach, embedding an imperceptible mark during AI-
generated content (AIGC) creation. This mark, robust to post-processing edits,
persists across

Functinons for semantic search 

In [88]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    dot_scores = util.dot_score(query_embedding, embeddings)[0]


    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["chunks"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [90]:
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings_tensor)
scores, indices

(tensor([0.7149, 0.6988, 0.6573, 0.6527, 0.6410], device='cuda:0'),
 tensor([ 8, 34, 12, 37, 39], device='cuda:0'))

In [91]:
# Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=embeddings_tensor)

Query: Tree-Ring Watermarking

Results:
Score: 0.7149
The watermarked image is obtained via inverse transform. Learning-based
Watermarking Methods Here encoders and decoders (Huynh-The et al.2019) are neu-
ral networks and learn via back-propagation. A watermarking method has three key
components: watermark (w), encoder (E), and decoder (D). An encoder takes an
image X and watermark w as inputs and produces an watermarked image (Xw). So, Xw
= E(X, w) and a decoder takes Xw as an input and produces ˆw = D(Xw).ˆwi = [ ˆwi
≥ τ], where [·] represents the indicator function and τ is a threshold value we
decide based on the problem requirements. Tree Ring Watermark The proposed tree-
ring watermark- ing (Wen et al.2023) technique involves embedding the wa-
termark into the frequency domain of the initial noise vector using Fast Fourier
Transform (FFT), followed by a diffu- sion process applied to the watermarked
latent image. To ascertain whether an image has been watermarked, we utilize the

LLMs

GPT-2

In [None]:
# # https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/generation/logits_process.py#L411
# from transformers.generation.logits_process import LogitsWarper

In [None]:
# class TopPLogitsWarper(LogitsWarper):
#     """
#     [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off. Often
#     used together with [`TemperatureLogitsWarper`] and [`TopKLogitsWarper`].

#     Args:
#         top_p (`float`):
#             If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
#             higher are kept for generation.
#         filter_value (`float`, *optional*, defaults to -inf):
#             All filtered values will be set to this float value.
#         min_tokens_to_keep (`int`, *optional*, defaults to 1):
#             Minimum number of tokens that cannot be filtered.
#     """

#     def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
#         top_p = float(top_p)
#         if top_p < 0 or top_p > 1.0:
#             raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
#         if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
#             raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")

#         self.top_p = top_p
#         self.filter_value = filter_value
#         self.min_tokens_to_keep = min_tokens_to_keep

#     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
#         s_sorted_vals, s_sorted_indices = torch.sort(scores, descending=True, dim = -1)
#         softmax_outputs_cumsum = s_sorted_vals.softmax(dim = -1, ).cumsum(dim = -1)
#         indices_to_remove = softmax_outputs_cumsum <= self.top_p
#         indices_to_remove = indices_to_remove.scatter(1, s_sorted_indices, indices_to_remove)
#         indices_to_remove = ~indices_to_remove
#         scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
#         #print(scores[0], scores_processed[0], '11')
#         return scores_processed

In [None]:
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, LogitsProcessorList
# from transformers import MaxLengthCriteria, StoppingCriteriaList  # Correct import path
# import torch

# # Initialize the tokenizer and model
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")

# # Setup the prompt and other beam search settings
# x = 'The capital of India?'
# input_ids = tokenizer(x, return_tensors='pt').input_ids.to(model.device)
# print(f'input_ids = {input_ids}')
# y = 'Delhi'
# output_ids = tokenizer(y, return_tensors='pt').input_ids.to(model.device)
# print(f'output_ids = {output_ids}')

# # Number of beams
# num_beams = 10

# logits_top_p = TopPLogitsWarper(top_p=0.9)

# # Logits processor and stopping criteria
# logits_processor = LogitsProcessorList([logits_top_p])


# #Processing logits
# Temp_scale = 2
# with torch.no_grad():
#     logits = model(input_ids).logits[0]
#     processed_logits = logits_top_p(input_ids = input_ids, scores = logits)
#     probabilities = (processed_logits / Temp_scale).softmax(dim = -1)
#     probabilities_final = probabilities[-1, :]
#     print(probabilities.shape, torch.argmax(probabilities_final), probabilities_final[13856])
#     #fx_y = probabilities_final[]


# # Generate text using beam search
# output_sequences = model.generate(
#     input_ids,
#     max_length=12,
#     num_beams=num_beams,
#     num_return_sequences=2,
#     logits_processor=logits_processor,
# )

# # Decode and print the output beams
# for index, output_sequence in enumerate(output_sequences):
#     output_text = tokenizer.decode(output_sequence, skip_special_tokens=True)
#     print(f'beam {index}: {output_text}')


Gemma-2-9b-it

In [None]:
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

In [None]:
from transformers.utils import is_flash_attn_2_available 

if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

In [None]:
# from transformers import pipeline 

# model_id  = "google/gemma-2-9b"


In [None]:
# from transformers import pipeline
# import torch

# pipe = pipeline(
#     "text-generation",
#     model="google/gemma-2-9b-it",
#     model_kwargs={"torch_dtype": torch.bfloat16},
#     device="cuda",
# )

# messages = [
#     {"role": "user", "content": "Who are you? Please, answer in pirate-speak."},
# ]
# outputs = pipe(
#     messages,
#     max_new_tokens=1024,
#     do_sample=False,
# )
# assistant_response = outputs[0]["generated_text"][-1]["content"]
# print(assistant_response)

In [92]:
# pip install accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
llm = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# llm.to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [93]:
input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = llm.generate(**input_ids, max_new_tokens=512)
print(tokenizer.decode(outputs[0]))

<bos>Write me a poem about Machine Learning.

In silicon valleys, where data flows,
A new intelligence, silently grows.
Machine Learning, a name whispered low,
Algorithms dance, where patterns they know.

From pixels to prose, from sound to design,
Machines learn to mimic, to reason, to shine.
With every iteration, their knowledge expands,
Unveiling insights, hidden in sands.

Neural networks, a web intricate and vast,
Connecting nodes, memories amassed.
They sift through mountains, of information's might,
Finding connections, hidden from sight.

Supervised, unsupervised, reinforcement's sway,
Machines evolve, learning day by day.
Predicting the future, understanding the past,
Machine Learning's power, forever to last.

But with great knowledge, comes ethical strife,
Bias and fairness, a constant fight.
Transparency's call, a crucial plea,
To guide this evolution, responsibly.

So let us tread carefully, with wisdom and grace,
As Machine Learning shapes our human race.
For in its poten

In [94]:
# input = "Write a poem about time in 50 words"
# input_ids = tokenizer(input, return_tensors="pt").to("cuda")

# outputs = model.generate(**input_ids)
# print(tokenizer.decode(outputs[0]))

In [None]:
# tokenizer.decode(outputs[0])

In [None]:
# input_text = "Write a poem about time"
# dialogue_template = [{"role": "user", "content": input_text}]

# # Assuming input_data is a tensor, directly move it to the GPU
# input_data = tokenizer.apply_chat_template(conversation=dialogue_template, return_tensors="pt").to("cuda")

# # Generate outputs directly using input_data
# outputs = model.generate(input_ids=input_data, max_new_tokens=256)

# # Decode and print the output
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [95]:
input_text = "Write a poem about time"
dialogue_template = [{"role": "user", "content": input_text}]

# Assuming input_data is a tensor, directly move it to the GPU
prompt = tokenizer.apply_chat_template(dialogue_template, tokenize=False, add_generation_prompt=True)


In [98]:
prompt

'<bos><start_of_turn>user\nWrite a poem about time<end_of_turn>\n<start_of_turn>model\n'

In [96]:
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate outputs directly using input_data
outputs = llm.generate(**input_ids, max_new_tokens=256)

# Decode and print the output
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


user
Write a poem about time
model
Time, a river ever flowing,
A current swift, a silent knowing.
It carries us along its stream,
A ceaseless journey, a waking dream.

From dawn's first light to twilight's hush,
It whispers secrets, soft as brush.
Each second ticks, a fleeting grain,
A moment lost, a whispered pain.

The past, a shadow, fades away,
A memory's echo, come what may.
The future beckons, veiled and bright,
A canvas blank, a hopeful light.

But present moment, hold it dear,
For time's swift passage, ever near.
Embrace the joy, the laughter's sound,
Before it's lost, on shifting ground.

For time, a thief, steals moments fast,
A treasure fleeting, meant to last.
So cherish now, each precious breath,
Before time's river claims its death. 





In [97]:
pages_and_chunks[5]

{'page_number': 1,
 'chunks': 'This exemplifies the growing challenge of relying on visible markers for image authenticity in the context of rapidly advancing generative AI capabilities. Similarly, metadata consists of additional tags that can be easily stripped from files using a simple wrapper. Refer to a detailed example in the Appendix for further clarification. This paper exclusively critiques current techniques and empirically illustrates the deficiencies of state-of-the-art (SOTA) methods for AI-generated image detection. Rather than proposing a superior alternative method, this paper serves as a call to action for the scientific community to prioritize the development of more robust AI-generated im- age detection techniques. In this paper, our primary focus is on critiquing water- marking techniques. Although watermarking is primarily a technique originating from the computer vision community, there have been recent attempts to apply watermarking to AI- generated text. These en

In [None]:
# def prompt_formatter(query, context_items ):
#     """
#     Augments query with text-based context from context_items.
#     """
#     # Join context items into one dotted paragraph
#     context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

#     # Create a base prompt with examples to help the model
#     # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
#     # We could also write this in a txt file and import it in if we wanted.
#     base_prompt = """Based on the following context items, please answer the query.
# Give yourself room to think by extracting relevant passages from the context before answering the query.
# Don't return the thinking, only return the answer.
# Make sure your answers are as explanatory as possible.
# Use the following examples as reference for the ideal answer style.
# \nExample 1:
# Query: What are the fat-soluble vitamins?
# Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
# \nExample 2:
# Query: What are the causes of type 2 diabetes?
# Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
# \nExample 3:
# Query: What is the importance of hydration for physical performance?
# Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
# \nNow use the following context items to answer the user query:
# {context}
# \nRelevant passages: <extract relevant passages from the context here>
# User query: {query}
# Answer:"""

#     # Update base prompt with context items and query   
#     base_prompt = base_prompt.format(context=context, query=query)

#     # Create prompt template for instruction-tuned model
#     dialogue_template = [
#         {"role": "user",
#         "content": base_prompt}
#     ]

#     # Apply the chat template
#     prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
#                                           tokenize=False,
#                                           add_generation_prompt=True)
#     return prompt


In [99]:
def prompt_formatter(query, context_items, use_dialogue_template=True):
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    # context = "- " + "\n- ".join([item["chunks"] for item in context_items])
    context = " ".join([item["chunks"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """
        Based on the following context items, please answer the query.
        Context item 1 : 
        {context}
        User query: {query}
        Answer:
        """

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    if(use_dialogue_template == True) :
        # Apply the chat template
        prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                            tokenize=False,
                                            add_generation_prompt=True)
    else : 
        prompt = tokenizer.apply_chat_template(conversation=base_prompt,
                                            tokenize=False,
                                            add_generation_prompt=True) 
    return prompt


In [101]:
query = "Explain the black-box visual paraphrase"
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings_tensor)
    
# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)


Query: Explain the black-box visual paraphrase
<bos><start_of_turn>user
Based on the following context items, please answer the query.
        Context item : 
        A higher number of steps generally allows for finer reconstruction, leading to higher quality images, but at the cost of increased computa- tional complexity and time. There are two distinct approaches to visual paraphrasing: (i) White Box and (ii) Black Box. Each method offers unique Figure 7: This figure illustrates the image-to-image diffusion process (Gilboa, Sochen, and Zeevi 2002). The top row demonstrates the forward diffusion process, where the original image progressively becomes more noisy. The bottom row shows the denoising process, where noise is incrementally removed from the noisy image, guided by text conditioning to generate the final, Visual Paraphrased image.advantages and is suited to different use cases. The following sections will explore the specifics of each approach, detailing their methodologies a

In [102]:
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt 

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")


Query: Explain the black-box visual paraphrase
RAG answer:
<bos>In the context of de-watermarking AI-generated images,  "Black Box" visual paraphrase refers to a method where the original text prompt used to generate the watermarked image is **not available**. 

This poses a challenge because the prompt provides crucial context for understanding the image's content. Without it, directly removing the watermark becomes difficult.

To overcome this, the Black Box approach relies on a **caption generator**.  This tool analyzes the watermarked image itself and attempts to generate a textual description of its content. This generated caption then acts as a proxy for the original prompt, guiding a text-to-image generation system (like Stable Diffusion) to create a visually similar image *without* the watermark.

Essentially, the Black Box method tries to "reverse engineer" the original prompt's information from the image itself, allowing for de-watermarking even when the original prompt is mi

In [103]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """
    
    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings_tensor)
    
    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU 
        
    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)
    
    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    
    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text
    
    return output_text, context_items


In [104]:
# query = random.choice(query_list)
query = "What is the aim of this paper?"
print(f"Query: {query}")

# Answer query with context and return context 
answer, context_items = ask(query=query, 
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
context_items


Query: What is the aim of this paper?
Answer:

The aim of this paper is to demonstrate the vulnerability of existing
watermarking techniques to visual paraphrase attacks.   The paper does not
propose solutions but serves as a call to action for the scientific community to
prioritize the development of more robust watermarking techniques.
<end_of_turn>
Context items:


[{'page_number': 1,
  'chunks': 'Visual paraphrasing is not yet a widely recognized sub-discipline. However, this paper demonstrates how visual paraphrasing can be accomplished using state-of-the-art text-to-image generation systems. This paper presents a critical assessment, empirically',
  'chunk_char_count': 255,
  'chunk_word_count': 31,
  'chunk_token_count': 63.75,
  'score': tensor(0.4268)},
 {'page_number': 4,
  'chunks': 'Tree Ring Stable Signature Figure 5: This figure shows the variation of CMMD (Jayasumana et al.2024) and detectability of visual paraphrases with respect to strength and guidance scale. The images were watermarked using Tree Ring Watermarking (Wen et al.2023) and Stable Signature (Fernandez et al.2023). VS: We need justification of why we only benchmark these two and how they are representative of the broader class on watermarking methods. In lit review we mention SythiID and Zodiac but dont benchmark them?original image s=0.2 s=0.3 s=0.4 s=0.5 s=0.6 s=0.7 Pr