In [4]:
import os
from typing import List

import fitz

from rich import print
from rich.pretty import pprint

# DATA PREPARATION

## SPLIT THE MAIN PDF BY SECTION

In [None]:
def partition_pdf(page_numbers: List[int], new_pdf_name: str, pdf_path: str = "data/raw/Master_Thesis_Aidana_Rakhatbekova.pdf") -> None:
    """
    Copies a selection of pages from a PDF into a new PDF file.

    Parameters:
        page_numbers (List[int]): Page numbers to keep. They are passed unchanged to
            PyMuPDF's ``Document.select`` (which PyMuPDF documents as 0-based), so the
            first page of the PDF is 0, not 1.
        new_pdf_name (str): The file path where the new PDF will be saved. Missing parent
            folders are created.
        pdf_path (str, optional): The file path of the original PDF. Defaults to "data/raw/Master_Thesis_Aidana_Rakhatbekova.pdf".

    Returns:
        None
    """
    # The context manager closes the source document even if select/save raises,
    # so repeated calls from a notebook do not accumulate open file handles.
    with fitz.open(pdf_path) as doc:
        doc.select(page_numbers)

        # Saving fails when the destination folder is missing; create it first.
        output_dir = os.path.dirname(new_pdf_name)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        doc.save(new_pdf_name)

In [None]:
# Introduction: page numbers 1 through 5.
partition_pdf(page_numbers=list(range(1, 6)), new_pdf_name="data/processed/Introduction.pdf")

In [None]:
# State of the Art: page numbers 5 through 17.
# NOTE(review): page number 5 is also part of Introduction.pdf — confirm the overlap is intended.
partition_pdf(page_numbers=list(range(5, 18)), new_pdf_name="data/processed/State_of_the_Art.pdf")

In [None]:
# Theoretical Framework: page numbers 18 through 21.
partition_pdf(page_numbers=list(range(18, 22)), new_pdf_name="data/processed/Theoretical_Framework.pdf")

In [None]:
# Methodology: page numbers 21 through 26.
# NOTE(review): page number 21 is also part of Theoretical_Framework.pdf — confirm the overlap is intended.
partition_pdf(page_numbers=list(range(21, 27)), new_pdf_name="data/processed/Methodology.pdf")

In [None]:
# Analysis and Presentation of Findings: page numbers 27 through 59.
partition_pdf(
    page_numbers=list(range(27, 60)),
    new_pdf_name="data/processed/Analysis_and_Presentation_of_Findings.pdf",
)

In [None]:
# Conclusion: page numbers 59 through 63.
# NOTE(review): page number 59 is also part of Analysis_and_Presentation_of_Findings.pdf — confirm the overlap is intended.
partition_pdf(page_numbers=list(range(59, 64)), new_pdf_name="data/processed/Conclusion.pdf")

In [None]:
# References: page numbers 64 through 70.
partition_pdf(page_numbers=list(range(64, 71)), new_pdf_name="data/processed/References.pdf")

In [None]:
# Appendix A: page numbers 71 and 72.
partition_pdf(page_numbers=list(range(71, 73)), new_pdf_name="data/processed/Appendix_A.pdf")

In [None]:
# Appendix B: the single page number 75.
# NOTE(review): page numbers 73 and 74 are not included in any partition call in this notebook — confirm that is intended.
partition_pdf(page_numbers=[75], new_pdf_name="data/processed/Appendix_B.pdf")

## 

## LlamaParse to Parse PDF to Markdown

In [None]:
## LlamaParse to Parse PDF to Markdown

In [None]:
from llama_parse import LlamaParse

In [None]:
import nest_asyncio

# Patch asyncio before the parser is used.
# NOTE(review): presumably needed so LlamaParse's async calls can run inside the
# notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [None]:
# Configure the LlamaParse client to return Markdown for English-language input.
# NOTE(review): LLAMAPARSE_API_KEY is not defined or imported in any cell shown in
# this notebook — confirm where it comes from so that Restart & Run All works.
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown",  # "markdown" and "text" are available
    verbose=True,
    language="en"
)

In [None]:
# Parse the full thesis PDF with the configured parser; the result is kept in `documents`.
# NOTE(review): this path is under "data/", while partition_pdf defaults to
# "data/raw/Master_Thesis_Aidana_Rakhatbekova.pdf" — confirm which location is correct.
documents = parser.load_data("data/Master_Thesis_Aidana_Rakhatbekova.pdf")

In [None]:
# Show how many items the parser returned.
len(documents)

In [None]:
from llama_index.core.node_parser import MarkdownNodeParser

# Split the parsed Markdown documents into nodes.
# `include_metadata` and `include_prev_next_rel` are options of the parser itself,
# so they are set on the constructor. When passed to get_nodes_from_documents()
# they are absorbed by its **kwargs and have no effect.
markdown_parser = MarkdownNodeParser(include_metadata=True, include_prev_next_rel=True)
nodes = markdown_parser.get_nodes_from_documents(documents)

In [None]:
from llama_index.embeddings.gemini import GeminiEmbedding

In [None]:
# Upper-case chapter titles, in document order.
# NOTE(review): there is no "REFERENCES" entry even though a References.pdf is
# produced earlier in this notebook — confirm that is intended.
chapters = [
    "INTRODUCTION",
    "STATE OF THE ART",
    "THEORETICAL FRAMEWORK",
    "METHODOLOGY",
    "ANALYSIS AND PRESENTATION OF FINDINGS",
    "CONCLUSION",
    "APPENDIX A",
    "APPENDIX B",
]

In [None]:
# Export the LangChain tracing settings as environment variables in one step.
# NOTE(review): the LANGCHAIN_* names on the right-hand side are not defined or
# imported in any cell shown in this notebook — confirm where they come from.
os.environ.update(
    {
        "LANGCHAIN_API_KEY": LANGCHAIN_API_KEY,
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": LANGCHAIN_URL,
        "LANGCHAIN_PROJECT": LANGCHAIN_PROJECT,
    }
)

In [3]:
from constants import (AZURE_OPENAI_ENDPOINT,
                       AZURE_OPENAI_API_KEY,
                       AZURE_OPENAI_API_VERSION,
                       DB_CONNECTION,
                       EMBEDDING_MODEL,
                       LLM)

In [6]:
from langchain_openai import AzureChatOpenAI

In [7]:
# Azure-hosted chat model; endpoint, key, API version and model name are the
# values imported from the `constants` module.
llm = AzureChatOpenAI(
    model_name=LLM,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    temperature=0.2,
)

In [8]:
from langchain_core.messages import HumanMessage

In [9]:
# A single human message asking for an English-to-German translation.
message = HumanMessage(content="Translate this sentence from English to German. I love programming.")

In [6]:
# Send the one-message prompt to the chat model; the reply is displayed as the cell output.
llm.invoke([message])

AIMessage(content='Ich liebe Programmierung.', response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 19, 'total_tokens': 24}, 'model_name': 'gpt-4-32k:swedencentral', 'system_fingerprint': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}, id='run-32465900-612f-4c0c-9ab7-3a91ea00b114-0', usage_metadata={'input_tokens': 19, 'output_tokens': 5, 'total_tokens': 24})

In [10]:
from openai import AzureOpenAI

# Azure OpenAI SDK client, configured from the values imported from `constants`.
client = AzureOpenAI(
    api_version=AZURE_OPENAI_API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

In [11]:
# Request an embedding for a short sample string from the configured embedding model.
embeddings = client.embeddings.create(model=EMBEDDING_MODEL, input="Hey I am Ashish")

In [13]:
# print(embeddings)

In [35]:
from langchain_community.vectorstores import AzureCosmosDBVectorSearch

In [None]:
# Build a Cosmos DB vector store from the connection string imported from `constants`.
# NOTE(review): `embedding_model` is not defined in any cell shown in this notebook —
# this cell raises NameError on a fresh kernel unless it is created elsewhere.
# NOTE(review): LangChain describes `namespace` as "<database>.<collection>"; "test"
# contains no dot — confirm the intended database and collection names.
vector_store = AzureCosmosDBVectorSearch.from_connection_string(
    connection_string = DB_CONNECTION,
    namespace = "test",
    embedding = embedding_model,
    index_name = "test_index",
    embedding_key = "content_vector"

)

In [1]:
import pymongo

In [4]:
# Open a MongoDB client on the same connection string.
# NOTE(review): this rebinds `client`, which earlier in the notebook holds the
# AzureOpenAI client — re-running the embeddings cell after this one would use the
# wrong object. Consider a distinct name (e.g. `mongo_client`).
client = pymongo.MongoClient(DB_CONNECTION)

  client = pymongo.MongoClient(DB_CONNECTION)


In [5]:
# Handle to the "thesis_vectordb" database on the MongoDB client.
db = client["thesis_vectordb"]