In [1]:
import os
from constants import (EMBEDDING_MODEL,
                       LLAMAPARSE_API_KEY,
                       LLM,
                       LANGCHAIN_API_KEY,
                       LANGCHAIN_PROJECT,
                       LANGCHAIN_URL,
                       GEMINIPRO_API_KEY,
                       QDRANT_API_KEY,
                       QDRANT_CLUSTER)

from pprint import pprint

In [2]:
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = LANGCHAIN_URL
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT

### Check Access to Gemini 1.5 Pro is available

In [2]:
import pathlib
import textwrap

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [3]:
import google.generativeai as genai


In [4]:
genai.configure(api_key=GEMINIPRO_API_KEY)

In [10]:
for m in genai.list_models():
    if 'generateContent' or 'embedContent' in m.supported_generation_methods:
        pprint(m.name)

'models/chat-bison-001'
'models/text-bison-001'
'models/embedding-gecko-001'
'models/gemini-1.0-pro'
'models/gemini-1.0-pro-001'
'models/gemini-1.0-pro-latest'
'models/gemini-1.0-pro-vision-latest'
'models/gemini-1.5-pro-latest'
'models/gemini-pro'
'models/gemini-pro-vision'
'models/embedding-001'
'models/text-embedding-004'
'models/aqa'


# Data Preparation - ETL(Extract, Transform, Load)

## LLamaParse to Parse PDF to Markdown

In [16]:
from llama_parse import LlamaParse

In [17]:
import nest_asyncio

nest_asyncio.apply()

In [18]:
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown"  # "markdown" and "text" are available
)

In [19]:
documents = parser.load_data("data/Master_Thesis_Aidana.pdf")

Started parsing the file under job_id c279b3bd-ff48-4bbc-a60d-21d4b85d0e22


In [20]:
print("Total Number of Documents ->", len(documents))

Total Number of Documents -> 1


In [21]:
type(documents)

list

In [22]:
from llama_index.core.node_parser import MarkdownNodeParser

markdown_parser = MarkdownNodeParser()
nodes = markdown_parser.get_nodes_from_documents(documents, include_metadata=True, include_prev_next_rel=True)

In [23]:
len(nodes)

45

In [26]:
markdown_parser.class_name()

'MarkdownNodeParser'

In [29]:
markdown_parser.get_nodes_from_node(nodes[1])

[TextNode(id_='01a78bfd-6517-4be0-b892-e978c2f620a0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d9883ffd-3f4c-417f-ad80-d000e5fb10b7', node_type=<ObjectType.TEXT: '1'>, metadata={'Header_2': 'Introduction (Kassymbekova & Marat, 2022: 5)'}, hash='a5461bdde39ffe23e8d0436e5cf797ceb1d26f50be74d8f326a58b51ac77fba2')}, text='Introduction (Kassymbekova & Marat, 2022: 5)\n\nWith the escalation of the conflict between Russia and Ukraine and the ongoing war that began in February 2022 to undermine the independence of sovereign Ukraine, the Kremlin’s agenda and foreign policy are attracting increasing attention and deeper analysis from the academic world and major media (Mankoff, 2022: 1). Growing concerns about the Kremlin’s “imperialist” attitude and policies, pressures and Russia’s latent threat to some countries of “near abroad” put the topic of Russia’s still existing co

In [24]:
to_markdown(nodes[1].text)

> Introduction (Kassymbekova & Marat, 2022: 5)
> 
> With the escalation of the conflict between Russia and Ukraine and the ongoing war that began in February 2022 to undermine the independence of sovereign Ukraine, the Kremlin’s agenda and foreign policy are attracting increasing attention and deeper analysis from the academic world and major media (Mankoff, 2022: 1). Growing concerns about the Kremlin’s “imperialist” attitude and policies, pressures and Russia’s latent threat to some countries of “near abroad” put the topic of Russia’s still existing colonialist approach and mentality at the forefront of many political debates (Expert Institute for Social Research, 2023: 10, 14). In this context, Ukrainian and Western politicians and the media talk about the future of Russia and the need for decolonization, which has caught the attention of Russian academics and politicians (ibid: 7).
> 
> To begin with, it is crucial to note that Russia, unlike any other colonizing country, was simultaneously a colony. Historically, it represented an imperial power that colonized and enslaved many other countries and peoples, with some of its parts also being colonies. While Soviet academics accused the West of imperialism and colonial history, some experts argue that both of Russia’s predecessors were, indeed, colonial powers of their own. Tsarist Russia went hand in hand with extreme aggressiveness, colonization, and imperialization from the beginning (Horvath, 1972: 45).
> 
> The Russian Empire featured unequal attitudes and approaches between the Russified, though in some cases coercive, “metropolis” or ruling elite and the non-Russian peoples living on the periphery, which resulted in their inability to be fully incorporated into a unified nation. When it comes to the Soviet Union, the situation is somewhat contradictory. The ideas underlying the creation of “the voluntary union” run counter to the imperialist features. Lenin preached equality of nations with non-exploitive relations, wanting to integrate other countries into his model of the state. Nevertheless, imperialist relations persisted in the USSR, where the power structure and resources were centralized in the metropolis, and other units remained exploited. Given that decision-making power was entirely concentrated in Moscow, the
> ---
> Relationship between the capital and all other republics and regions could be characterized as a subordination of the periphery to the metropolis, as in the tsarist empire, despite its original intentions (Suny, 2001: 50-55).
> 
> Contemporary Russia, however, is legally a federation of semi-presidential republics. It comprises eighty-three regions, comprising forty-six oblasts, twenty-one republics, nine krais, four autonomous districts, one autonomous oblast, and two cities of federal significance. Eight federal districts unite the regions. Each district has a federal representative whom the president appoints. They are the main link between the regions and the federal government (StatData.ru, 2023). According to the latest open census of 2010, Russia has a population of over 140 million and is primarily made up of ethnic Russians, 70 percent of whom identify as Orthodox Christians. It is important to note that the country is ethnically, religiously, and regionally highly diverse. It is estimated that there are about 25 million Muslims and, in addition, 170 different ethnic groups in Russia (Rosstat, 2010; Heinemann-Grüder, 2013: 5). Indeed, these numbers show the multicultural environment of the country. However, according to many historians and scholars, including Heinemann-Grüder, it developed as a result of territorial expansion, exploitation, and colonization that began during the Tsarist Empire (2013: 5). It is crucial, for one, to determine to what extent the violence against indigenous peoples has stopped since the end of the Tsarist Empire and whether present-day Russia is repeating the actions of its predecessors.
> 
> Gosart, in her study on structural violence against indigenous communities in contemporary Russia, reveals that structural violence is embedded in the Russian state in its attitude toward indigenous communities. The needs of these communities are separated from the interests and functions of the state, following the path of Soviet-era laws and viewpoints. Therefore, indigenous peoples born on Russian territories are subjected to violence and neglect through state administration and law enforcement (Gosart, 2018: 193).
> 
> Furthermore, indigenous populations are severely declining, and some communities are at risk of extinction. Despite this concern, the “partial mobilization” into the Russian army due to the war in Ukraine, which took place in September 2022, strongly affected primarily indigenous peoples. The first and second waves mobilized young people from Russia’s poorest regions, inhabited mainly by indigenous peoples, even those who are not eligible for military service.
> 
> 1This paper does not consider the annexed and occupied territories of Crimea, Donetsk, Luhansk, Kherson, and Zaporizhia as Russian territory. The author of the thesis deducted these territories from the list.
> ---
> According to Article 18 of Federal Law No. 31-FZ of 1997, these include: students, the elderly, the disabled, and fathers of three or more children. This move by the Russian state and officials directly threatens physical survival and contributes to decreasing the indigenous population or their extinction. Many activists have called this act a new genocide (ADC Memorial & ICIPR, 2023: 23). The report cites numerous supporting facts and evidence to corroborate this statement. As a case in point, the mortality statistics for soldiers from Russia, presented by the Free Buryatia Foundation, show a disparity in the deaths of mobilized indigenous men compared to those of Moscow. Indigenous activists, thus, concluded that, say, Buryats have a 300 times higher risk of being killed in the war than residents of Moscow despite the population difference (ibid: 24).
> 
> The obvious implication is that the political system in contemporary Russia creates conditions for the continuation and intensification of structural violence against indigenous peoples and their further marginalization. The state can be held responsible for the harsh living conditions, discrimination, and population decrease that indigenous communities face (Gosart, 2018: 257).
> 
> Thus, when discussing Russia’s future, the question of decolonization arises as a way of addressing the ongoing injustice and exploitation, as well as achieving peace in the region and ceasing conflicts.

## Embeddings

In [11]:
from llama_index.embeddings.gemini import GeminiEmbedding

In [12]:
model_name = "models/embedding-001"

embed_model = GeminiEmbedding(
    model_name=model_name, api_key=GEMINIPRO_API_KEY, title="this is a document"
)

embeddings = embed_model.get_text_embedding("Google Gemini Embeddings.")

In [13]:
print(f"Dimension of embeddings: {len(embeddings)}")

Dimension of embeddings: 768


In [14]:
embeddings[:5]

[0.028174246, -0.0290093, -0.013280814, 0.008629, 0.025442218]