In [None]:
!python --version

In [None]:
!pip install PyMuPDF
!pip install pymongo
!pip install PyDrive2
!pip install OpenAI
!pip install bert-score
!pip install rouge-score

In [None]:
!pip install langchain langchain_community langchain-openai

Transforming the PDF into one image per page:

In [None]:
import fitz  # PyMuPDF
from PIL import Image
from google.colab import drive

In [None]:
# Mount Google Drive under /content/drive/ so later cells can read the PDF
# and write images/results via /content/drive/MyDrive/... paths.
drive.mount('/content/drive/')

In [None]:
def pdf_to_images(pdf_path, output_folder, dpi=200):
    """Render every page of a PDF to a PNG file.

    Pages are written as ``page_<n>.png`` (1-based page number) inside
    ``output_folder``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        output_folder: Directory that receives the PNG files. It is created
            (including missing parents) if it does not exist yet.
        dpi: Rendering resolution passed to ``Page.get_pixmap``.

    Returns:
        None. The images are written to disk as a side effect.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    target_dir = Path(output_folder)
    target_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)

            # alpha=False gives 3 bytes per pixel, matching the "RGB" mode below.
            pix = page.get_pixmap(dpi=dpi, alpha=False)

            # Wrap the raw pixmap samples in a PIL image so it can be saved as PNG.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            img.save(target_dir / f"page_{page_num + 1}.png")

In [None]:
# Render every page of 01.pdf into the output-images folder on Drive.
pdf_to_images("/content/drive/MyDrive/bachelor-thesis/pdfs/01.pdf", "/content/drive/MyDrive/bachelor-thesis/output-images")

Embedding the page images and storing them in the MongoDB vector store:

In [None]:
!gcloud init

In [None]:
!gcloud auth application-default login

In [None]:
import requests
import base64

def get_image_embedding(image_url):
  google_token = !gcloud auth print-access-token
  google_token = google_token[0]

  b64img = base64.b64encode(requests.get(image_url).content).decode("ascii")
  response = requests.post(
      headers={
          "Authorization": "Bearer " + google_token,
          "Content-Type": "application/json"
          },
      url="https://europe-west3-aiplatform.googleapis.com/v1/projects/bachelor-thesis-428711/locations/europe-west3/publishers/google/models/multimodalembedding@001:predict",
      json={
        "instances": [
          {
            "image": {
              "bytesBase64Encoded": b64img
            }
          }
        ],
        "parameters": {
          "dimension": 1408
        }
      }
  )

  return response.json()['predictions'][0]['imageEmbedding']

In [None]:
def get_text_embedding(text):
  text = text[:1000]

  google_token = !gcloud auth print-access-token
  google_token = google_token[0]

  response = requests.post(
      headers={
          "Authorization": "Bearer " + google_token,
          "Content-Type": "application/json"
          },
      url="https://europe-west3-aiplatform.googleapis.com/v1/projects/bachelor-thesis-428711/locations/europe-west3/publishers/google/models/multimodalembedding@001:predict",
      json={
        "instances": [
          {
            "text": text
          }
        ],
        "parameters": {
          "dimension": 1408
        }
      }
  )

  return response.json()['predictions'][0]['textEmbedding']

In [None]:
import pymongo
import requests
from google.colab import userdata

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Connect to MongoDB; the connection string is read from Colab's user secrets
# rather than being hard-coded in the notebook.
mongo_client = pymongo.MongoClient(userdata.get('CLUSTER_BACHELOR_CLUSTER_CONNECTION_STRING'))

# Database and collection that hold the page-image embeddings.
db = mongo_client["approach-1-new"]
collection = db["embeddings"]

In [None]:
# Authenticate the Colab user, then hand the resulting application-default
# credentials to PyDrive2 so `gdrive` can query and modify Drive files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

In [None]:
# Look up the Drive folder titled 'output-images'; the first match is used.
output_images_folder = gdrive.ListFile({
    'q': "mimeType = 'application/vnd.google-apps.folder' and title = 'output-images'"
}).GetList()[0]

# List every PNG file whose parent is that folder.
output_images_files = gdrive.ListFile({
    'q': f"'{output_images_folder['id']}' in parents and fileExtension = 'png'"
}).GetList()

In [None]:
# One document per page image: {"embedding": [...], "url": "..."}.
data_list = []

for file in output_images_files:
  # SECURITY NOTE: this makes the file readable by anyone who has the link.
  # It has to happen before the download below, because get_image_embedding
  # fetches the URL without sending any credentials.
  file.InsertPermission({
      'type': 'anyone',
      'value': 'anyone',
      'role': 'reader'
  })
  # Direct-download URL for the file; the same URL is stored next to the embedding.
  image_url = "https://drive.usercontent.google.com/download?id=" + file['id'] + "&authuser=0"
  embedding = get_image_embedding(image_url)
  data_list.append({"embedding": embedding, "url": image_url})

In [None]:
# Store all embedding/URL documents in the collection in a single batch.
collection.insert_many(data_list)

Retrieve images for a text query (similarity search):

In [None]:
# Example question used to try out the text embedding and the retrieval below.
query = "What are the ESG targets?"

embedded_query = get_text_embedding(query)
# Display the vector length; the embedding request asks for 1408 dimensions.
len(embedded_query)

In [None]:
def get_similar_images(query, limit=5, num_candidates=150):
  """Return the URLs of the stored page images most similar to ``query``.

  The query text is embedded with ``get_text_embedding`` and matched against
  the ``embedding`` field of ``db.embeddings`` through the Atlas vector search
  index ``approach_1_index``.

  Args:
    query: Natural-language question or search text.
    limit: Maximum number of image URLs to return.
    num_candidates: Number of nearest-neighbour candidates considered by the
      search; raised to ``limit`` if smaller.

  Returns:
    A list of image URL strings in the order produced by the search.
  """
  embedded_query = get_text_embedding(query)

  similarity_search_pipeline = [
    {
      '$vectorSearch': {
        'index': 'approach_1_index',
        'path': 'embedding',
        'queryVector': embedded_query,
        # Atlas rejects a numCandidates value that is smaller than limit.
        'numCandidates': max(num_candidates, limit),
        'limit': limit
      }
    },
    {
      # Only the URL is consumed below, so the 1408-float embedding is not
      # projected; this avoids shipping it back for every hit.
      "$project": {
        "_id": 0,
        "url": 1,
        "score": { "$meta": "vectorSearchScore" }
      }
    }
  ]

  result = db.embeddings.aggregate(similarity_search_pipeline)

  return [image['url'] for image in result]

In [None]:
# Show the URLs of the page images retrieved for the example query.
get_similar_images(query)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import MessagesPlaceholder
from langchain_core.messages.system import SystemMessage
from langchain_core.messages.human import HumanMessage

# Read the OpenAI API key from Colab's user secrets instead of hard-coding it.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
# Chat model that get_answer sends the retrieved page images and the question to.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o")

def get_answer(query, max_retries=3):
  """Answer ``query`` with the LLM, using the most similar page images as context.

  The images returned by ``get_similar_images`` are passed to the model by URL,
  followed by a text part that contains the question.

  Args:
    query: The question to answer.
    max_retries: How many times the LLM call is attempted before giving up
      (at least one attempt is always made).

  Returns:
    The text content of the model's reply.

  Raises:
    RuntimeError: If every attempt to call the LLM failed; the last underlying
      exception is chained as the cause.
  """
  similar_images = get_similar_images(query)

  image_prompt_text = [
    {"type": "image_url", "image_url": {"url": url}} for url in similar_images
  ]
  image_prompt_text.append({"type": "text", "text": "Based on the information in the images, answer the question: {question}"})

  prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a powerful visual assistant who can answer questions based on information in the given images."),
    ("human", image_prompt_text)
  ])

  llm_chain = prompt | llm

  # Retry only the LLM call: the retrieval above already succeeded, so there is
  # no reason to repeat the embedding request and the vector search. The loop is
  # bounded and catches Exception (not a bare except), so a persistent failure
  # or a keyboard interrupt ends the call instead of recursing without limit.
  attempts = max(1, max_retries)
  last_error = None
  for attempt in range(1, attempts + 1):
    try:
      return llm_chain.invoke(query).content
    except Exception as exc:
      last_error = exc
      print(f"Error while sending query to LLM (attempt {attempt}/{attempts}): {exc!r}")

  raise RuntimeError(f"LLM call failed after {attempts} attempts") from last_error

In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score

def create_scores(max_questions=50, max_retries=5):
  """Answer the benchmark questions and record BLEU, ROUGE and BERTScore results.

  The question and reference columns are read from ``bleu.csv``. For each
  question the generated answer and its score are written to the
  ``approach-1-text`` / ``approach-1-score`` columns of the five result CSVs.
  All five files are rewritten after every question, so progress survives an
  interrupted run.

  Scores recorded:
    * BLEU: ``sentence_bleu`` on whitespace-split tokens.
    * ROUGE-1 / ROUGE-2 / ROUGE-Lsum: the ``precision`` component.
    * BERTScore: the first of the three tensors returned by ``score``
      (NOTE(review): precision in bert-score's (P, R, F1) ordering - confirm
      this is the intended metric).

  Args:
    max_questions: Number of leading questions to process; ``None`` processes all.
    max_retries: Attempts per question before the run is aborted
      (at least one attempt is always made).

  Raises:
    RuntimeError: If ``get_answer`` keeps failing for one question.
  """
  results_dir = "/content/drive/MyDrive/bachelor-thesis/results"
  csv_paths = {
    "bleu": f"{results_dir}/bleu.csv",
    "rouge1": f"{results_dir}/rouge-1.csv",
    "rouge2": f"{results_dir}/rouge-2.csv",
    "rougeLsum": f"{results_dir}/rouge-l.csv",
    "bertscore": f"{results_dir}/bertscore.csv",
  }
  frames = {metric: pd.read_csv(path) for metric, path in csv_paths.items()}

  # A still-empty answer column is read as float64; make it object so that
  # string answers can be stored without a dtype conflict.
  for frame in frames.values():
    frame["approach-1-text"] = frame["approach-1-text"].astype(object)

  df_bleu = frames["bleu"]

  # The scorer is stateless across questions, so build it once.
  _rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)

  attempts = max(1, max_retries)

  for i, question in enumerate(df_bleu["question"]):
    if max_questions is not None and i >= max_questions:
      break

    # Bounded retry; Exception (not a bare except) keeps Ctrl-C working.
    for attempt in range(1, attempts + 1):
      print("Question " + str(i))
      try:
        answer = get_answer(question)
        break
      except Exception as exc:
        if attempt == attempts:
          raise RuntimeError(
              f"Giving up on question {i} after {attempts} attempts"
          ) from exc
        print(f"Error in getting answer ({exc!r}). Trying again")

    reference_answer = df_bleu.at[i, "reference-text"]

    bleu_score = sentence_bleu([reference_answer.split()], answer.split())
    rouge_scores = _rouge_scorer.score(reference_answer, answer)
    bertscore_tensor, _, _ = score(cands=[answer], refs=[reference_answer], lang="en")

    metric_scores = {
      "bleu": bleu_score,
      "rouge1": rouge_scores['rouge1'].precision,
      "rouge2": rouge_scores['rouge2'].precision,
      "rougeLsum": rouge_scores['rougeLsum'].precision,
      "bertscore": float(bertscore_tensor.numpy()[0]),
    }

    for metric, frame in frames.items():
      # .loc writes into the frame itself; chained `frame[col][i] = ...` may
      # write to a temporary copy and is a no-op under pandas Copy-on-Write.
      frame.loc[i, "approach-1-text"] = answer
      frame.loc[i, "approach-1-score"] = metric_scores[metric]
      # index=False stops each run from adding another "Unnamed: 0" column.
      frame.to_csv(csv_paths[metric], index=False)



In [None]:
# Run the evaluation; the result CSVs on Drive are updated after every question.
create_scores()