Imports

In [33]:
import sys
sys.path.append("../src")
import os
from functions import *
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from torch import save
import shutil


In [None]:
# Start from a clean slate: remove the whole output tree if it exists so that
# files left over from an earlier run cannot mix with this run's results.
processed_dir = '../data/processed_files/'
if os.path.exists(processed_dir):
    shutil.rmtree(processed_dir)

# Recreate the empty output directories used by the rest of the notebook
for subdir in ('extracted_images/', 'JSON/', 'embeddings/'):
    os.makedirs(os.path.join(processed_dir, subdir), exist_ok=True)

# Collect every PDF in the raw directory.
# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the order of the parsed chunks (and of the embeddings computed from them)
# reproducible between runs and machines.
pdf_dir = '../data/raw_files/pdf'
filename_list = [
    os.path.join(pdf_dir, f)
    for f in sorted(os.listdir(pdf_dir))
    if f.endswith('.pdf')
]

# Extract the text and image content from all PDFs. The original notebook
# notes that the extracted images are saved into
# processed_files/extracted_images/ (done inside parse_pdf_images).
text_content_list = []
image_content_list = []
for filename in filename_list:
    text_content_list.extend(parse_pdf_content(filename))
    image_content_list.extend(parse_pdf_images(filename))

In [None]:
# Report how many text chunks and images were extracted, one count per line
print(len(text_content_list), len(image_content_list), sep="\n")

14
1


In [36]:
# Load the processor first; its tokenizer is used below to truncate the text
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

text_list = []
for content in text_content_list:
    # Prefix the chunk text with its section header
    full_text = f"{content['section']}: {content['text']}"

    # Tokenize and truncate to CLIP's max length (77 tokens)
    tokens = processor.tokenizer(full_text, truncation=True, max_length=77, return_tensors="pt")
    # Decode back to text so the stored string matches what was kept after truncation
    truncated_text = processor.tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)

    text_list.append(truncated_text)

image_list = []
for content in image_content_list:
    # Image.open() is lazy and keeps the underlying file open. Open each file
    # in a context manager and keep a fully loaded copy, so handles are
    # released immediately instead of accumulating with the number of images.
    with Image.open(content['image_path']) as img:
        image_list.append(img.copy())


In [37]:
# Sanity check: number of prepared text strings and loaded images, one per line
print(len(text_list), len(image_list), sep="\n")

14
1


In [38]:
# Load the CLIP model weights for the same checkpoint name as the processor
model = CLIPModel.from_pretrained(
    "openai/clip-vit-base-patch16",
)

In [39]:
# Run the processor over all texts and images at once, returning PyTorch
# tensors; text is padded and truncated to at most 77 tokens.
inputs = processor(
    text=text_list,
    images=image_list,
    padding=True,
    truncation=True,
    max_length=77,
    return_tensors="pt",
)


In [40]:
# compute embeddings with CLIP
# This is inference only, so run the forward pass with autograd disabled.
# Without this, PyTorch builds a gradient graph that is never used (extra
# memory) and the resulting embeddings stay attached to it, so the tensors
# saved later would require grad when loaded back.
from torch import no_grad

with no_grad():
    outputs = model(**inputs)

In [41]:
# Extract the text and image embedding tensors from the model output
text_embeddings, image_embeddings = outputs.text_embeds, outputs.image_embeds

In [42]:
# Show the shape of each embedding tensor, one per line
print(text_embeddings.shape, image_embeddings.shape, sep="\n")

torch.Size([14, 512])
torch.Size([1, 512])


In [43]:
# Write the parsed text and image content lists to JSON files
save_to_json(
    text_content_list,
    output_file='../data/processed_files/JSON/text_content.json',
)
save_to_json(
    image_content_list,
    output_file='../data/processed_files/JSON/image_content.json',
)

In [44]:
# Write the text and image embedding tensors to disk with torch.save
save(
    text_embeddings,
    '../data/processed_files/embeddings/text_embeddings.pt',
)
save(
    image_embeddings,
    '../data/processed_files/embeddings/image_embeddings.pt',
)