In [None]:
!pip install PyMuPDF pymongo PyDrive2

!pip install langchain langchain_community langchain-openai langsmith pytesseract openai
!sudo apt install tesseract-ocr tesseract-ocr-deu
!pip install bert-score rouge-score

Transforming PDF into individual images:

In [None]:
import fitz  # PyMuPDF
from PIL import Image
from google.colab import drive
from google.colab import userdata

import io
import os
import numpy as np

from typing import List, Union, Tuple

# Read the API keys from the Colab `userdata` store instead of hard-coding them.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
# NOTE(review): the key is looked up under the name 'LANGCHAIN_API' (no "_KEY" suffix) - confirm it matches the stored entry.
LANGCHAIN_API_KEY = userdata.get('LANGCHAIN_API')

# LangSmith tracing configuration, passed via environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Bachelor Thesis"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY

from langsmith import Client

# Created after the environment variables above are set; presumably the client reads them - TODO confirm.
langsmith_client = Client()

In [None]:
from langchain_openai import ChatOpenAI
from langchain.agents import tool, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import MessagesPlaceholder
from langchain_core.messages.system import SystemMessage
from langchain_core.messages.human import HumanMessage
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser

from typing import Optional

import json

# Shared chat model; the tool functions and the agent defined in later cells read this global.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o")

In [None]:
# Mount Google Drive; the paths used by later cells all live under /content/drive/MyDrive.
drive.mount('/content/drive/')

In [None]:
def pdf_to_images(pdf_path, output_folder, dpi=200):
    """Render every page of a PDF to its own PNG file.

    Pages are written to ``output_folder`` as ``page_1.png``, ``page_2.png``, ...
    (1-based page numbers).

    Args:
        pdf_path: Path of the PDF file to render.
        output_folder: Directory that receives the PNG files; created if missing.
        dpi: Rendering resolution passed to PyMuPDF (defaults to the previous
            hard-coded value of 200).
    """
    # Create the target directory up front so saving the first page cannot fail on a missing folder.
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even when rendering a page raises.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)

            # Wrap the raw pixmap samples in a PIL image so it can be saved as PNG.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            img.save(os.path.join(output_folder, f"page_{page_num + 1}.png"))

In [None]:
# Render the source PDF into one PNG per page inside the Drive folder used by the following cells.
pdf_to_images("/content/drive/MyDrive/bachelor-thesis/pdfs/01.pdf", "/content/drive/MyDrive/bachelor-thesis/output-images")

Create a description for each image, embed it, and store it in the MongoDB vector store:

In [None]:
def get_image_paths(directory: str, number: Optional[int] = None) -> List[str]:
    """List the PNG files in ``directory`` in natural (page-number) order.

    Args:
        directory: Folder to scan (not recursive).
        number: Optional zero-based position. When it addresses an existing
            PNG, a one-element list with just that path is returned.

    Returns:
        Paths of all ``.png`` files, ordered so that ``page_2.png`` comes before
        ``page_10.png``; or a single-element list when ``number`` is in range.
        An out-of-range or negative ``number`` yields the full list, as before.
    """
    import re  # local import: `re` is not provided by the notebook's import cells

    def natural_key(filename):
        # re.split with a capturing group alternates text / digit runs, so odd
        # positions are always digit runs and can be compared numerically.
        parts = re.split(r"(\d+)", filename)
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    # os.listdir returns entries in arbitrary order; sort so results are reproducible.
    png_names = sorted(
        (name for name in os.listdir(directory) if name.endswith('.png')),
        key=natural_key,
    )
    image_paths = [os.path.join(directory, name) for name in png_names]

    if number is not None and 0 <= number < len(image_paths):
        return [image_paths[number]]
    return image_paths
# Folder that pdf_to_images wrote the page PNGs into.
direc = '/content/drive/MyDrive/bachelor-thesis/output-images/'
image_paths = get_image_paths(direc)
# Bare last expression so the notebook displays the collected paths.
image_paths

Tools for the agent:

In [None]:
# Upper bound on LLM calls per tool invocation. The tools used to retry by
# calling themselves from a bare `except:`, which had no upper bound and also
# swallowed KeyboardInterrupt.
_MAX_TOOL_LLM_ATTEMPTS = 3

# NOTE: the docstrings and parameter names of the @tool functions below are
# intentionally left verbatim; LangChain's `tool` decorator derives the tool
# description and argument schema from them, so editing them changes what the
# agent sees.


def _diagram_title(diagram_info):
  """Return the value used to point the model at one diagram.

  `diagram_info` may be a dict; when it has a "topic" key only that topic is
  used. Any other value is passed through unchanged.
  """
  if isinstance(diagram_info, dict) and "topic" in diagram_info:
    return diagram_info["topic"]
  return diagram_info


def _ask_llm_about_image(tool_name, system_text, human_text, variables):
  """Build the image prompt, fill it with `variables` and send it to `llm`.

  Args:
    tool_name: Name printed in the retry message.
    system_text: System message template.
    human_text: Text part of the human message (template), or None when the
      human message consists of the image only.
    variables: Template variables; must contain "imgurl".

  Returns:
    Whatever `llm.invoke` returns for the filled prompt.

  Raises:
    The exception of the last attempt once `_MAX_TOOL_LLM_ATTEMPTS` calls failed.
  """
  human_content = []
  if human_text is not None:
    human_content.append({"type": "text", "text": human_text})
  human_content.append({"type": "image_url", "image_url": {"url": "{imgurl}"}})

  prompt = ChatPromptTemplate.from_messages([
    ("system", system_text),
    ("human", human_content),
  ])
  prompt_value = prompt.invoke(variables)

  for attempt in range(1, _MAX_TOOL_LLM_ATTEMPTS + 1):
    try:
      return llm.invoke(prompt_value)
    except Exception:
      if attempt == _MAX_TOOL_LLM_ATTEMPTS:
        raise
      print("Error in " + tool_name + ". Trying again")


@tool
def get_diagrams_from_slide(imgurl):
  """Returns the types of diagrams and charts and their topic in a image for a given URL. Must use this, if the image has charts or diagrams in it."""
  return _ask_llm_about_image(
    "get_diagrams_from_slide",
    "You will be given an image. Your task is to identify the types of diagrams or charts used in the image. Only return the titles of the chart or diagram and their type. Examples for such charts can be pie charts, bar charts, line graphs, histograms, ...",
    None,
    {"imgurl": imgurl},
  )

@tool
def describe_bar_chart(imgurl, diagram_info, bars_count, ocr_text):
  """Returns a detailed description of a given bar chart from an image. Use this tool when the amount of bars is known. Always use this tool if the image contains a bar chart."""
  return _ask_llm_about_image(
    "describe_bar_chart",
    "You will be given an image. Your task is to describe the bar chart given in the image. If asked about a specific part of the image, only return the information in the specific part. Use the information about the amount of bars to make sure you describe every bar in detail.",
    """Focus on the following chart: {diagram_info}
                  There is the following amount of bars in the chart: {bars_count}

                  The following text is in the image: {ocr_text}""",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info), "bars_count": bars_count, "ocr_text": ocr_text},
  )

@tool
def describe_pie_chart(imgurl, diagram_info, ocr_text, color_meaning: Optional[str] = None):
  """Returns a detailed description of a given pie chart from an image. Always use this tool when the image contains a pie chart."""
  # The prompt always needs a value for {color_meaning}; a missing one is sent as the text "None".
  if color_meaning is None:
    color_meaning = "None"
  return _ask_llm_about_image(
    "describe_pie_chart",
    "You will be given an image. Your task is to describe the pie chart given in the image. If asked about a specific part of the image, only return the information in the specific part.",
    """Focus on the following chart: {diagram_info}

                  The following information on the meaning of the colors of the slices is known: {color_meaning}

                  The following text is in the image: {ocr_text}""",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info), "color_meaning": color_meaning, "ocr_text": ocr_text},
  )

@tool
def describe_timeline_chart(imgurl, diagram_info, ocr_text):
  """Returns a detailed description of a given timeline chart from an image. Always use this tool when the image contains a timeline chart."""
  return _ask_llm_about_image(
    "describe_timeline_chart",
    "You will be given an image. Your task is to describe the timeline chart given in the image. If asked about a specific part of the image, only return the information in the specific part. Describe with as detail as possible, describing each point in time.",
    """Focus on the following chart: {diagram_info}

                  The following text is in the image: {ocr_text}""",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info), "ocr_text": ocr_text},
  )

@tool
def describe_table(imgurl, diagram_info, ocr_text):
  """Returns a detailed description of a given table from an image. Always use this tool when the image contains a table."""
  return _ask_llm_about_image(
    "describe_table",
    "You will be given an image. Your task is to describe the table given in the image. If asked about a specific part of the image, only return the information in the specific part. Describe with as detail as possible, describing each entry.",
    """Focus on the following table: {diagram_info}

                  The following text is in the image: {ocr_text}""",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info), "ocr_text": ocr_text},
  )

@tool
def describe_architecture_flowchart_diagram(imgurl, diagram_info, ocr_text):
  """Returns a detailed description of a software architecture diagram. This can be in the form of a flowchart diagram that shows a technical architecture of a software system. Use this tool only for software architecture diagrams."""
  return _ask_llm_about_image(
    "describe_architecture_flowchart_diagram",
    "You will be given an image. Your task is to describe the architecture given in the image. If asked about a specific part of the image, only return the information in the specific part. Describe with as detail as possible, describing each component and the relationships between the components. Also explain what the components and relationships might mean.",
    """Focus on the following diagram: {diagram_info}

                  The following text is in the image: {ocr_text}""",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info), "ocr_text": ocr_text},
  )

@tool
def describe_generic(imgurl, diagram_info, ocr_text):
  """Returns a detailed description of a generic type of visual information. Use this tool when the other tools for describing visual information in the image do not entirely fit."""
  return _ask_llm_about_image(
    "describe_generic",
    "You will be given an image. Your task is to describe the visual context of the image in as much detail as possible. Describe every component of the image and every relationship between the components in detail. If asked about a specific part of the image, only return the information in the specific part.",
    """Focus on the following content: {diagram_info}

                  The following text is in the image: {ocr_text}""",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info), "ocr_text": ocr_text},
  )

@tool
def get_bars_count(imgurl, diagram_info):
  """Returns the amount of bars for any specified bar chart in an image. Always use this tool first when processing bar charts."""
  return _ask_llm_about_image(
    "get_bars_count",
    "You will be given an image with a bar chart in it. Your task is to identify the amount bars in the chart. If asked about a specific bar chart of the image, only return the amount of bars in the specified bar chart.",
    "Focus on the following chart: {diagram_info}",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info)},
  )

@tool
def get_color_meaning(imgurl, diagram_info):
  """Returns the meaning of the colors used in a diagram. Always use this tool before describing the diagram."""
  return _ask_llm_about_image(
    "get_color_meaning",
    "You will be given an image with a diagram in it. Your task is to identify the meaning of the colors in the diagram. If asked about a specific diagram of the image, only return the information for the colors in the specified diagram.",
    "Focus on the following diagram: {diagram_info}",
    {"imgurl": imgurl, "diagram_info": _diagram_title(diagram_info)},
  )

@tool
def get_word_length(word):
  """Returns the length of a word"""
  # Plain len(); the docstring above is kept verbatim because it is part of the tool definition.
  character_count = len(word)
  return character_count

@tool
def get_factor(initial_value, result_value):
  """Returns the factor that the initial value has to be multiplied with, in order to equal the result value"""
  # No factor exists for an initial value of 0. Report that to the model as
  # text instead of raising ZeroDivisionError, which would abort the whole agent run.
  if initial_value == 0:
    return "The factor is undefined because the initial value is 0."
  return result_value/initial_value

@tool
def get_difference(val1, val2):
  """Returns the difference between two numbers"""
  # The result is the absolute gap, so argument order does not matter.
  gap = val1 - val2
  return abs(gap)

In [None]:
import pytesseract
import requests
import cv2
import numpy as np
from PIL import Image
import io
from matplotlib import pyplot as plt

def recognize_text(imgurl):
  """Download an image and return the text Tesseract finds in it.

  The image is cleaned up before OCR: a morphological closing estimates the
  local background, the image is divided by it, and the result is binarised
  with an inverted Otsu threshold.

  Args:
    imgurl: HTTP(S) URL of the image.

  Returns:
    The recognised text (English + German language data).

  Raises:
    requests.HTTPError: if the download returns an error status.
    requests.Timeout: if the server does not answer within the timeout.
  """
  # Fail with a clear HTTP error instead of handing an error page to PIL, and never wait forever.
  response = requests.get(imgurl, timeout=60)
  response.raise_for_status()

  # Normalise to 3-channel RGB so palette, grayscale and RGBA images all take the same path.
  img = Image.open(io.BytesIO(response.content)).convert("RGB")
  img_array = np.asarray(img)

  kernelSize = 5
  maxKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernelSize, kernelSize))
  localMax = cv2.morphologyEx(img_array, cv2.MORPH_CLOSE, maxKernel, None, None, 1, cv2.BORDER_REFLECT101)

  # Gain division: pixel / local maximum, defined as 0 where the local maximum is 0.
  # Dividing only where localMax != 0 avoids the divide-by-zero warnings of a full division.
  gainDivision = np.divide(
    img_array,
    localMax,
    out=np.zeros(img_array.shape, dtype=np.float64),
    where=localMax != 0,
  )
  gainDivision = np.clip(255 * gainDivision, 0, 255).astype("uint8")

  # PIL delivers channels in RGB order, so the RGB (not BGR) conversion is the correct one here.
  grayscaleImage = cv2.cvtColor(gainDivision, cv2.COLOR_RGB2GRAY)
  _, binaryImage = cv2.threshold(grayscaleImage, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

  return pytesseract.image_to_string(binaryImage, lang="eng+deu")

In [None]:
def generate_description(image_url, max_attempts=3):
  """Describe one slide image with the tool-using agent.

  OCR text is extracted from the image first and handed to the agent together
  with the image URL.

  Args:
    image_url: URL of the slide image.
    max_attempts: How many times the agent run is tried before the last error
      is re-raised (values below 1 are treated as 1).

  Returns:
    The agent's final answer (the "output" entry of the executor result).

  Raises:
    The exception of the last agent run when every attempt failed.
  """
  tools = [get_diagrams_from_slide, get_word_length, get_factor, get_difference, get_bars_count, get_color_meaning, describe_bar_chart, describe_pie_chart, describe_timeline_chart, describe_table, describe_architecture_flowchart_diagram, describe_generic]
  llm_with_tools = llm.bind_tools(tools)

  prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a powerful visual assistant who can extract visual information from presentation slides with the help of some specified tools"),
    ("human", [
              {
                  "type": "text",
                  "text": """Based on the following data, answer the question: {question}

                  Data: {data},
                  Text in the image: {ocr_text}"""
              }
              ]),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
  ])

  agent = (
    {
        "question": lambda x: x["question"],
        "data" : lambda x: x["data"],
        "ocr_text": lambda x: x["ocr_text"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
            x["intermediate_steps"]
        )
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
  )

  text = recognize_text(image_url)

  agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
  agent_input = {"question": "Describe the given image in detail", "data": image_url, "ocr_text": text}

  # Bounded retry. The previous version retried by calling itself: that had no
  # upper bound, repeated the OCR download on every retry, and indexed the
  # already-unwrapped string result with ["output"], which raised TypeError.
  attempts = max(1, max_attempts)
  for attempt in range(1, attempts + 1):
    try:
      return agent_executor.invoke(agent_input)["output"]
    except Exception:
      if attempt == attempts:
        raise
      print("Error running agent")

In [None]:
from openai import OpenAI

# Raw OpenAI client, used by get_text_embedding below for embedding requests.
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

def get_text_embedding(text):
  """Embed `text` with the "text-embedding-3-large" model.

  Returns the `embedding` attribute of the first item in the response data.
  """
  embedding_response = client.embeddings.create(model="text-embedding-3-large", input=text)
  first_item = embedding_response.data[0]
  return first_item.embedding

In [None]:
import pymongo
import requests
from google.colab import userdata

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user and hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which earlier referred to the `google.colab.drive` module
# used for `drive.mount(...)`. Re-running the mount cell after this cell will fail; consider a
# distinct name (the following cell uses `drive.ListFile`, so it would need the same rename).
drive = GoogleDrive(gauth)

In [None]:
# Look up the Drive folder that holds the rendered slide images. Trashed
# folders are excluded so a deleted copy cannot be picked up by accident.
matching_folders = drive.ListFile({'q': "mimeType = 'application/vnd.google-apps.folder' and title = 'output-images' and trashed = false" }).GetList()
if not matching_folders:
  # A bare [0] on an empty result would only produce an unexplained IndexError.
  raise FileNotFoundError("No Google Drive folder titled 'output-images' was found")
output_images_folder = matching_folders[0]

# All PNG files directly inside that folder (again ignoring trashed items).
output_images_files = drive.ListFile({'q': "'"+ output_images_folder['id'] + "' in parents and fileExtension = 'png' and trashed = false"}).GetList()

In [None]:
# Connect to MongoDB with the connection string kept in the Colab `userdata` store.
mongo_client = pymongo.MongoClient(userdata.get('CLUSTER_BACHELOR_CLUSTER_CONNECTION_STRING'))

# Database and collection that receive the description embeddings.
db = mongo_client["approach-3"]
collection = db["embeddings"]

In [None]:
# Upper bound on description attempts per image. The previous `while True`
# with a bare `except:` retried forever and also swallowed KeyboardInterrupt,
# so the cell could not be stopped.
MAX_DESCRIPTION_ATTEMPTS = 5

data_list = []

for file in output_images_files:
  # NOTE(review): this grants read access to anyone, so the model can fetch the
  # image by URL. Confirm that this is acceptable for the slide content.
  file.InsertPermission({
      'type': 'anyone',
      'value': 'anyone',
      'role': 'reader'
  })
  image_url = "https://drive.usercontent.google.com/download?id=" + file['id'] + "&authuser=0"

  for attempt in range(1, MAX_DESCRIPTION_ATTEMPTS + 1):
    print("Trying: " + image_url)
    try:
      text = generate_description(image_url)
      break
    except Exception as error:
      if attempt == MAX_DESCRIPTION_ATTEMPTS:
        raise RuntimeError(
          f"Giving up on {image_url} after {MAX_DESCRIPTION_ATTEMPTS} failed attempts"
        ) from error
      print("Error in generate_description. Trying again")

  embedding = get_text_embedding(text)
  data_list.append({"embedding": embedding, "text": text})

In [None]:
# Store all generated descriptions and their embeddings in one batch.
# NOTE(review): presumably re-running this cell inserts the same documents again, and an
# empty data_list is rejected by insert_many - verify before re-executing.
collection.insert_many(data_list)

In [None]:
def get_similar_descriptions(query, limit=5, num_candidates=150):
  """Return the stored descriptions most similar to `query`.

  The query is embedded with get_text_embedding and matched against the
  "embedding" field through the "approach_3_index" vector search index.

  Args:
    query: Text to search for.
    limit: Maximum number of descriptions to return (default 5, as before).
    num_candidates: Value passed as `numCandidates` to `$vectorSearch`
      (default 150, as before); raised to `limit` when smaller.

  Returns:
    List with the "text" field of each hit, in the order the aggregation yields them.
  """
  embedded_query = get_text_embedding(query)

  similarity_search_pipeline = [
    {
      "$vectorSearch": {
        "index": "approach_3_index",
        "path": "embedding",
        "queryVector": embedded_query,
        "numCandidates": max(num_candidates, limit),
        "limit": limit
      }
    },
    {
      # Only the text is used below, so the stored embedding vector is no
      # longer projected (it was transferred for every hit and then ignored).
      "$project": {
        "_id": 0,
        "text": 1,
        "score": { "$meta": "vectorSearchScore" }
      }
    }
  ]

  return [chunk['text'] for chunk in db.embeddings.aggregate(similarity_search_pipeline)]

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import MessagesPlaceholder
from langchain_core.messages.system import SystemMessage
from langchain_core.messages.human import HumanMessage

# NOTE(review): these two assignments repeat the earlier definitions of OPENAI_API_KEY and llm
# with the same arguments; they only matter when this cell is run without the setup cells.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o")

def get_answer(query):
  """Answer `query` from the most similar stored slide descriptions.

  The retrieved descriptions are pasted into the human message as "Data" and
  the chat model's reply text is returned.
  """
  retrieved = get_similar_descriptions(query)

  # Every description is followed by an empty line.
  context = "".join(f"{description}\n\n" for description in retrieved)

  # Double the braces so the prompt template treats them as literal characters
  # rather than as template variables.
  escaped_context = context.replace("{", "{{").replace("}", "}}")

  human_text = f"""Based on the following data, answer the question: {{question}}

      Data: {escaped_context}
      """

  prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a powerful assistant who can answer questions based on information in the given text."),
    ("human", [{"type": "text", "text": human_text}])
  ])

  response = (prompt | llm).invoke(query)
  return response.content

In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score

def create_scores(max_questions=50, results_dir="/content/drive/MyDrive/bachelor-thesis/results", max_attempts=5):
  """Answer the evaluation questions and record BLEU, ROUGE and BERTScore results.

  Reads bleu.csv, rouge-1.csv, rouge-2.csv, rouge-l.csv and bertscore.csv from
  `results_dir`. For each question (taken from bleu.csv) the generated answer
  goes into the "approach-3-text" column and the metric value into the
  "approach-3-score" column of every file. All files are rewritten after each
  question, so an interrupted run keeps its progress.

  Args:
    max_questions: Number of questions to process (default 50, as before).
    results_dir: Folder containing the five CSV files.
    max_attempts: How often get_answer is tried per question before the error
      is re-raised (values below 1 are treated as 1).

  Raises:
    The exception of the last get_answer attempt when all attempts failed.
  """
  text_column = "approach-3-text"
  score_column = "approach-3-score"
  attempts = max(1, max_attempts)

  metric_names = ("bleu", "rouge-1", "rouge-2", "rouge-l", "bertscore")
  csv_paths = {metric: os.path.join(results_dir, metric + ".csv") for metric in metric_names}
  frames = {metric: pd.read_csv(path) for metric, path in csv_paths.items()}

  for frame in frames.values():
    # An empty text column is read back as float NaN; make it object so that
    # string answers can be stored without dtype conflicts.
    if text_column in frame.columns:
      frame[text_column] = frame[text_column].astype(object)
    else:
      frame[text_column] = None
    if score_column not in frame.columns:
      frame[score_column] = float("nan")

  df_bleu = frames["bleu"]
  # The scorer does not depend on the question, so build it once.
  _rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)

  for i, question in enumerate(df_bleu["question"]):
    if i >= max_questions:
      break

    # Bounded retry instead of the former endless `while True` loop.
    for attempt in range(1, attempts + 1):
      print("Question " + str(i))
      try:
        answer = get_answer(question)
        break
      except Exception as error:
        if attempt == attempts:
          raise
        print("Error in getting answer. Trying again. Error:")
        print(error)

    reference_answer = df_bleu.at[i, "reference-text"]

    bleu_score = sentence_bleu([reference_answer.split()], answer.split())
    rouge_scores = _rouge_scorer.score(reference_answer, answer)
    # bert_score.score returns three tensors; the first one is kept, as before.
    bertscore_tensor, _, _ = score(cands=[answer], refs=[reference_answer], lang="en")

    # NOTE(review): the rouge-l file receives the 'rougeLsum' value, as in the original code - confirm this is intended.
    metric_scores = {
      "bleu": bleu_score,
      "rouge-1": rouge_scores['rouge1'].precision,
      "rouge-2": rouge_scores['rouge2'].precision,
      "rouge-l": rouge_scores['rougeLsum'].precision,
      "bertscore": float(bertscore_tensor.numpy()[0]),
    }

    for metric, frame in frames.items():
      # .at writes into the frame itself; the former df[col][i] = ... form is
      # chained indexing, which pandas may apply to a temporary copy.
      frame.at[i, text_column] = answer
      frame.at[i, score_column] = metric_scores[metric]
      # index=False: otherwise each run adds another unnamed index column to the file.
      frame.to_csv(csv_paths[metric], index=False)

In [None]:
# Run the evaluation; this calls the LLM for every question and rewrites the result CSVs.
create_scores()