Imports

In [33]:
import sys
sys.path.append("../src")
import os
from functions import *
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from torch import save
import shutil


In [None]:
# Start from a clean slate: drop any previous 'processed_files' tree so stale
# images, JSON, or embeddings from an earlier run cannot leak into this one.
processed_root = '../data/processed_files/'
if os.path.exists(processed_root):
    shutil.rmtree(processed_root)

# Create the subdirectories for extracted images, JSON data, and embeddings
for subdir in ('extracted_images/', 'JSON/', 'embeddings/'):
    os.makedirs(processed_root + subdir, exist_ok=True)

# List all PDF files found in the raw data directory.
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the extracted chunks (and therefore the embedding rows) stable across runs.
pdf_dir = '../data/raw_files/pdf/'
filename_list = [pdf_dir + f for f in sorted(os.listdir(pdf_dir)) if f.endswith('.pdf')]

# Extract text and images from all PDF files.
# NOTE(review): parse_pdf_images is expected to also write the extracted
# images to disk (its entries are later opened via 'image_path') - confirm
# against ../src/functions.
text_content_list = []
image_content_list = []
for filename in filename_list:
    text_content_list.extend(parse_pdf_content(filename))
    image_content_list.extend(parse_pdf_images(filename))

In [None]:
# Sanity check: number of extracted text chunks, then number of extracted images
for extracted in (text_content_list, image_content_list):
    print(len(extracted))


14
1


In [36]:
# Initialize the CLIP processor (tokenizer + image pre-processing)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

text_list = []
for content in text_content_list:
    # Combine the section header and text content into a single string
    full_text = content['section'] + ": " + content['text']

    # Tokenize and truncate to the 77-token limit used for this CLIP model
    tokens = processor.tokenizer(full_text, truncation=True, max_length=77, return_tensors="pt")
    # Decode back to text so the stored string matches what the model will
    # actually see after truncation
    truncated_text = processor.tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)

    text_list.append(truncated_text)

image_list = []
for content in image_content_list:
    # Image.open() is lazy and keeps the file handle open. Copy the pixel data
    # into memory inside a context manager so every handle is closed promptly
    # instead of leaking one open file per extracted image.
    with Image.open(content['image_path']) as img:
        image_list.append(img.copy())


In [37]:
# Sanity check: prepared text count, then prepared image count (one per line)
print(len(text_list), len(image_list), sep="\n")

14
1


In [38]:
# Load the pre-trained CLIP weights used to embed both texts and images
clip_checkpoint = "openai/clip-vit-base-patch16"
model = CLIPModel.from_pretrained(clip_checkpoint)

In [39]:
# Turn the prepared texts and images into padded/truncated PyTorch tensors
inputs = processor(
    text=text_list,
    images=image_list,
    padding=True,
    truncation=True,
    max_length=77,
    return_tensors="pt",
)


In [40]:
# Pass the inputs through the CLIP model to generate text and image embeddings.
# This is inference only: disabling autograd avoids building a computation
# graph (less memory) and yields embeddings that do not require grad, so the
# tensors saved later are plain data.
import torch

with torch.no_grad():
    outputs = model(**inputs)

In [41]:
# Pull the text and image embeddings out of the model output
text_embeddings, image_embeddings = outputs.text_embeds, outputs.image_embeds

In [42]:
# Sanity check: shape of the text embeddings, then of the image embeddings
for embeddings in (text_embeddings, image_embeddings):
    print(embeddings.shape)

torch.Size([14, 512])
torch.Size([1, 512])


In [43]:
# Persist the extracted text and image records as JSON, one file per list
json_targets = (
    (text_content_list, '../data/processed_files/JSON/text_content.json'),
    (image_content_list, '../data/processed_files/JSON/image_content.json'),
)
for records, json_path in json_targets:
    save_to_json(records, output_file=json_path)

In [44]:
# Write the text and image embeddings to .pt files for later use
embedding_targets = (
    (text_embeddings, '../data/processed_files/embeddings/text_embeddings.pt'),
    (image_embeddings, '../data/processed_files/embeddings/image_embeddings.pt'),
)
for tensor, tensor_path in embedding_targets:
    save(tensor, tensor_path)