In [1]:
import os
from constants import (EMBEDDING_MODEL,
                       LLAMAPARSE_API_KEY,
                       LLM,
                       LANGCHAIN_API_KEY,
                       LANGCHAIN_PROJECT,
                       LANGCHAIN_URL,
                       GEMINIPRO_API_KEY,
                       QDRANT_API_KEY,
                       QDRANT_CLUSTER,
                       UNSTRUCTURED_API_KEY,
                       UNSTRUCTURED_URL)

from pprint import pprint

In [2]:
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = LANGCHAIN_URL
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT

### Check Access to Gemini 1.5 Pro is available

In [3]:
import pathlib
import textwrap

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [4]:
import google.generativeai as genai


In [5]:
genai.configure(api_key=GEMINIPRO_API_KEY)

In [6]:
for m in genai.list_models():
    if 'generateContent' or 'embedContent' in m.supported_generation_methods:
        pprint(m.name)

'models/chat-bison-001'
'models/text-bison-001'
'models/embedding-gecko-001'
'models/gemini-1.0-pro'
'models/gemini-1.0-pro-001'
'models/gemini-1.0-pro-latest'
'models/gemini-1.0-pro-vision-latest'
'models/gemini-1.5-pro-latest'
'models/gemini-pro'
'models/gemini-pro-vision'
'models/embedding-001'
'models/text-embedding-004'
'models/aqa'


# Data Preparation - ETL(Extract, Transform, Load)

In [40]:
import fitz
from typing import List

In [41]:
def partition_pdf(page_numbers: List[int], new_pdf_name: str, pdf_path: str = "data/raw/Master_Thesis_Aidana_Rakhatbekova.pdf") -> None:
    """
    Partitions a PDF by selecting specific pages and saves the result to a new PDF file.

    Parameters:
        page_numbers (List[int]): A list of page numbers to select from the original PDF.
        new_pdf_name (str): The file path where the new PDF will be saved.
        pdf_path (str, optional): The file path of the original PDF. Defaults to "data/raw/Master_Thesis_Aidana_Rakhatbekova.pdf".

    Returns:
        None
    """
    doc = fitz.open(pdf_path)
    doc.select(page_numbers)
    doc.save(new_pdf_name)

In [42]:
partition_pdf([1, 2, 3, 4, 5], "data/processed/Introduction.pdf")

In [44]:
partition_pdf([5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], "data/processed/State_of_the_Art.pdf")

In [45]:
partition_pdf([18, 19, 20, 21], "data/processed/Theoretical_Framework.pdf")

In [46]:
partition_pdf([21, 22, 23, 24, 25, 26], "data/processed/Methodology.pdf")

In [47]:
partition_pdf([27, 28, 29, 30, 
               31, 32, 33, 34, 35, 
               36, 37, 38, 39, 40,
               41, 42, 43, 44, 45, 
               46, 47, 48, 49, 50, 
               51, 52, 53, 54, 55, 
               56, 57, 58, 59], "data/processed/Analysis_and_Presentation_of_Findings.pdf")

In [49]:
partition_pdf([59, 60, 61, 62, 63], "data/processed/Conclusion.pdf")

In [50]:
partition_pdf([64, 65, 66, 67, 68, 69, 70], "data/processed/References.pdf")

In [51]:
partition_pdf([71, 72], "data/processed/Appendix_A.pdf")

In [52]:
partition_pdf([75], "data/processed/Appendix_B.pdf")

## LLamaParse to Parse PDF to Markdown

In [7]:
from llama_parse import LlamaParse

In [8]:
import nest_asyncio

nest_asyncio.apply()

In [9]:
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown",  # "markdown" and "text" are available
    verbose=True,
    language="en"
)

In [11]:
documents = parser.load_data("data/Master_Thesis_Aidana_Rakhatbekova.pdf")

Started parsing the file under job_id f0861fcb-4fb3-489a-88a3-0f366434b490


In [12]:
len(documents)

1

In [13]:
type(documents)

list

In [64]:
json_objs = parser.get_json_result("data/Master_Thesis_Aidana_Rakhatbekova.pdf")
json_list = json_objs[0]["pages"]

Started parsing the file under job_id 85510ff3-5c67-43e5-8f33-909d7e8373d0


In [65]:
type(json_list)

list

In [66]:
len(json_list)

77

In [69]:
json_list[0]

{'page': 1,
 'text': '                  Table of content\n1. Introduction................................................................................................................................1\n         1.1 Purpose of the Research and Research Question.................................................................3\n         1.2 Overview of the Thesis........................................................................................................ 5\n2. State-of-the-art........................................................................................................................... 5\n         2.1 Settler Colonialism...............................................................................................................7\n                  2.1.1 Russian context........................................................................................................... 7\n         2.2 Federation..............................................................

In [30]:
documents = parser.load_data("data/Master_Thesis_Aidana_Rakhatbekova.pdf")

Started parsing the file under job_id f61f1646-6c9c-4324-a105-8533c112a9d8
.......

In [31]:
print("Total Number of Documents ->", len(documents))

Total Number of Documents -> 1


In [32]:
type(documents)

list

In [33]:
documents

[Document(id_='dd1d12d5-77f3-4f24-9b01-4c4eae4987ce', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='{\n"text": "Table of content\\n1. Introduction................................................................................................................................1\\n1.1 Purpose of the Research and Research Question.................................................................3\\n1.2 Overview of the Thesis........................................................................................................ 5\\n2. State-of-the-art........................................................................................................................... 5\\n2.1 Settler Colonialism...............................................................................................................7\\n2.1.1 Russian context............................................................................................

In [17]:
from llama_index.core.node_parser import MarkdownNodeParser

markdown_parser = MarkdownNodeParser()
nodes = markdown_parser.get_nodes_from_documents(documents, include_metadata=True, include_prev_next_rel=True)

In [18]:
len(nodes)

78

In [19]:
type(nodes)

list

In [20]:
nodes[0].relationships

{<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ac2c4170-ddc8-4b99-ac91-c35e4170b216', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='912cd7d81950622e25d901979de27ab4f6851138b9bd4a2323c30ad59e73774d'),
 <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='60c7c6c5-17b4-4453-98b3-63f97e227eaa', node_type=<ObjectType.TEXT: '1'>, metadata={'Header_2': 'Introduction'}, hash='87cad9e2fe7237cce8dc1e05429b481aef24c51440c05c35bde90206bafbe29f')}

In [21]:
markdown_parser.class_name()

'MarkdownNodeParser'

In [22]:
markdown_parser.get_nodes_from_node(nodes[1])

[TextNode(id_='e29a0b05-8a38-4993-862f-b6a729804f9d', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='60c7c6c5-17b4-4453-98b3-63f97e227eaa', node_type=<ObjectType.TEXT: '1'>, metadata={'Header_2': 'Introduction'}, hash='87cad9e2fe7237cce8dc1e05429b481aef24c51440c05c35bde90206bafbe29f')}, text='Introduction\n\nWith the escalation of the conflict between Russia and Ukraine and the ongoing war that began in February 2022 to undermine the independence of sovereign Ukraine, the Kremlin’s agenda and foreign policy are attracting increasing attention and deeper analysis from the academic world and major media (Mankoff, 2022: 1). Growing concerns about the Kremlin’s “imperialist” attitude and policies, pressures and Russia’s latent threat to some countries of “near abroad” put the topic of Russia’s still existing colonialist approach and mentality at the forefront of many politi

In [24]:
nodes[77].metadata

{'Header_1': 'Meduza (2023). Основательницу фонда «Свободная Бурятия» Александру Гармажапову заочно приговорили к 7 годам колонии по делу о “фейках” о российской армии (The founder of the “Free Buryatia” Foundation, Alexandra Garmazhapova, was sentenced in absentia to 7 years in prison in the case of “fakes” about the Russian army). 10 November. Available at: https://meduza.io/news/2023/11/10/osnovatelnitsu-fonda-svobodnaya-buryatiya-aleksand',
 'Header_2': 'Translation of the Interview questions into Russian',
 'Header_3': 'Интервью вопросы:',
 'Header_4': 'Заключительная часть'}

In [10]:
to_markdown(nodes[1].text)

NameError: name 'nodes' is not defined

## Embeddings

In [41]:
from llama_index.embeddings.gemini import GeminiEmbedding

In [42]:
model_name = "models/embedding-001"

embed_model = GeminiEmbedding(
    model_name=model_name, api_key=GEMINIPRO_API_KEY, title="this is a document"
)

embeddings = embed_model.get_text_embedding("Google Gemini Embeddings.")

In [43]:
print(f"Dimension of embeddings: {len(embeddings)}")

Dimension of embeddings: 768


In [44]:
embeddings[:5]

[0.028174246, -0.0290093, -0.013280814, 0.008629, 0.025442218]

In [11]:
chapters = ["INTRODUCTION",
            "STATE OF THE ART",
            "THEORETICAL FRAMEWORK",
            "METHODOLOGY",
            "ANALYSIS AND PRESENTATION OF FINDINGS",
            "CONCLUSION",
            "APPENDIX A",
            "APPENDIX B"]

In [12]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.title import chunk_by_title

from unstructured.staging.base import dict_to_elements

s = UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY,
                       server_url=UNSTRUCTURED_URL)

In [13]:
filename = "data/Master_Thesis_Aidana_Rakhatbekova.pdf"

with open(filename, "rb") as f:
    files=shared.Files(
        content=f.read(),
        file_name=filename,
    )

req = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    hi_res_model_name="yolox",
    pdf_infer_table_structure=True,
    skip_infer_table_types=[],
)

try:
    resp = s.general.partition(req)
    pdf_elements = dict_to_elements(resp.elements)
except SDKError as e:
    print(e)

KeyboardInterrupt: 

In [None]:
pdf_elements[0].to_dict()