In [5]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [6]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

#### embedding -- converting text into vectors

In [9]:
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(model = "text-embedding-3-large")

In [2]:
text = "This is a introduction to Open AI embedding"
query_result = embedding.embed_query(text)

In [3]:
query_result

[0.005008628126233816,
 0.013089119456708431,
 -0.010641560889780521,
 -0.042481109499931335,
 0.02417762577533722,
 0.0012707797577604651,
 0.00798826478421688,
 0.03887717053294182,
 -0.004792249761521816,
 -0.005462667904794216,
 -0.011251676827669144,
 0.0034780167043209076,
 0.009513555094599724,
 -7.094373722793534e-05,
 -0.02387966215610504,
 0.024560721591114998,
 0.014529277570545673,
 0.03263412043452263,
 -0.014337729662656784,
 -0.027625491842627525,
 0.022829694673419,
 -0.005430743098258972,
 -0.03357057645916939,
 -0.024645855650305748,
 0.028703836724162102,
 -0.02893085591495037,
 0.01335161179304123,
 0.04055144265294075,
 -0.03618130832910538,
 0.040267664939165115,
 0.012776967138051987,
 -0.03612455353140831,
 0.020573684945702553,
 -0.025482991710305214,
 -0.016004906967282295,
 0.005810292437672615,
 0.040125779807567596,
 0.055449627339839935,
 -0.051420021802186966,
 0.0212689321488142,
 0.03501782938838005,
 0.009478083811700344,
 0.012911760248243809,
 0.0109

In [16]:
len(query_result)

3072

#### to specify dimensions

In [10]:
embedding_1024 = OpenAIEmbeddings(model = "text-embedding-3-large", dimensions=1024)

In [11]:
query_result_1024 = embedding_1024.embed_query(text)

In [12]:
len(query_result_1024)

1024

#### converting docs into embeddings

In [13]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("TDP.txt")
docs = loader.load()

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 500)
final_doc = text_splitter.split_documents(docs)

In [15]:
len(final_doc)

73

In [18]:
doc_embedding = embedding_1024.embed_documents([i.page_content for i in final_doc])

In [19]:
len(doc_embedding)

73

#### vector embedding and vector store

In [27]:
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(final_doc, embedding_1024)

In [31]:
query = "The TDP uses yellow as the background colour for its flag, with a hut, wheel and plough symbol in the foreground"
retrieved_results = db.similarity_search(query)

In [32]:
retrieved_results

[Document(metadata={'source': 'TDP.txt'}, page_content="The TDP uses yellow as the background colour for its flag, with a hut, wheel and plough symbol in the foreground. The party's electoral symbol is bicycle."),
 Document(metadata={'source': 'TDP.txt'}, page_content='The Telugu Desam Party follows a pro-Telugu ideology. It was founded as an alternative to the Congress hegemony, by emphasizing Telugu regional pride and serving as the party for farmers, backward castes and middle-class people. Since the 1990s, it has followed an economically liberal policy that has been seen as pro-business and pro-development.[19]'),
 Document(metadata={'source': 'TDP.txt'}, page_content='TDP party was founded thinking that we should stand by the people of the state who are struggling with many such problems and put a political party to stand by the people. The party was formed on 29 March 1982 and contested the 1983 elections. The people of Andhra Pradesh, who were fed up with the Congress government