In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

loader=TextLoader("What_is_DataIngestion.txt")
documents=loader.load()
text_splitter=CharacterTextSplitter(chunk_size=1000,chunk_overlap=30)
docs=text_splitter.split_documents(documents)

In [3]:
docs

[Document(metadata={'source': 'What_is_DataIngestion.txt'}, page_content='Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.\nSources: Data can come from a wide variety of sources, including databases, APIs, log files, IoT devices, applications (SaaS), and file storage systems.\nFormats: The process handles data in various formats, including structured (like databases), semi-structured (like JSON or XML), and unstructured data (like images, audio, or text files).\nDestination: The data is typically moved into a single repository for consolidation, allowing organizations

In [None]:
embeddings=OllamaEmbeddings(model="nomic-embed-text")
db=FAISS.from_documents(docs,embeddings)
db

  embeddings=OllamaEmbeddings(model="nomic-embed-text")


<langchain_community.vectorstores.faiss.FAISS at 0x296f81dda50>

In [6]:
## querying the vector store
query="The method chosen depends on business needs"
docs=db.similarity_search(query)
docs[0].page_content

'Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.\nSources: Data can come from a wide variety of sources, including databases, APIs, log files, IoT devices, applications (SaaS), and file storage systems.\nFormats: The process handles data in various formats, including structured (like databases), semi-structured (like JSON or XML), and unstructured data (like images, audio, or text files).\nDestination: The data is typically moved into a single repository for consolidation, allowing organizations to create a unified "single source of truth".\nTransformation: Unlike th

In [10]:
retriever=db.as_retriever()
docs=retriever.invoke(query)
docs[0].page_content

'Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.\nSources: Data can come from a wide variety of sources, including databases, APIs, log files, IoT devices, applications (SaaS), and file storage systems.\nFormats: The process handles data in various formats, including structured (like databases), semi-structured (like JSON or XML), and unstructured data (like images, audio, or text files).\nDestination: The data is typically moved into a single repository for consolidation, allowing organizations to create a unified "single source of truth".\nTransformation: Unlike th

In [11]:
docs_and_scores=db.similarity_search_with_score(query)
docs_and_scores[0]

(Document(id='486ce116-6ee6-4669-8d7c-d08297cbe8bb', metadata={'source': 'What_is_DataIngestion.txt'}, page_content='Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.\nSources: Data can come from a wide variety of sources, including databases, APIs, log files, IoT devices, applications (SaaS), and file storage systems.\nFormats: The process handles data in various formats, including structured (like databases), semi-structured (like JSON or XML), and unstructured data (like images, audio, or text files).\nDestination: The data is typically moved into a single repositor

In [12]:
embedding_vector=embeddings.embed_query(query)
embedding_vector

[-0.354330837726593,
 1.0491324663162231,
 -3.1301980018615723,
 0.8620954751968384,
 0.44422829151153564,
 0.3157077729701996,
 0.8483039140701294,
 -0.8800861239433289,
 -0.03295496106147766,
 0.21802985668182373,
 0.15647000074386597,
 -0.2636837363243103,
 2.1121342182159424,
 0.8778420090675354,
 -0.14179521799087524,
 -0.7251920104026794,
 0.5065534114837646,
 -0.959740161895752,
 -0.5729237198829651,
 0.5232858061790466,
 -1.0980387926101685,
 -0.39854639768600464,
 -0.5913237929344177,
 -0.7556244730949402,
 -0.09717559814453125,
 -0.41141533851623535,
 0.39855441451072693,
 0.5490773916244507,
 -0.47680193185806274,
 -0.07130981236696243,
 -0.8282347917556763,
 0.5495675802230835,
 1.0335781574249268,
 -1.1027220487594604,
 -2.242661476135254,
 -1.6500064134597778,
 0.5256590843200684,
 0.7628863453865051,
 0.1106613427400589,
 -0.7909427285194397,
 -0.5791946649551392,
 -0.5491098761558533,
 0.9729088544845581,
 -0.9991717338562012,
 1.9032334089279175,
 -0.37852743268013,
 1

In [13]:
docs_store=db.similarity_search_by_vector(embedding_vector)
docs_store[0].page_content

'Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.\nSources: Data can come from a wide variety of sources, including databases, APIs, log files, IoT devices, applications (SaaS), and file storage systems.\nFormats: The process handles data in various formats, including structured (like databases), semi-structured (like JSON or XML), and unstructured data (like images, audio, or text files).\nDestination: The data is typically moved into a single repository for consolidation, allowing organizations to create a unified "single source of truth".\nTransformation: Unlike th

In [14]:
### Saving and Loading the Vector Store
db.save_local("faiss_index")


In [None]:
new_db=FAISS.load_local("faiss_index",embeddings,allow_dangerous_deserialization=True)
docs=new_db.similarity_search("What is Data Ingestion?")
docs[0].page_content

'Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.\nSources: Data can come from a wide variety of sources, including databases, APIs, log files, IoT devices, applications (SaaS), and file storage systems.\nFormats: The process handles data in various formats, including structured (like databases), semi-structured (like JSON or XML), and unstructured data (like images, audio, or text files).\nDestination: The data is typically moved into a single repository for consolidation, allowing organizations to create a unified "single source of truth".\nTransformation: Unlike th