### FAISS VectorDB

In [6]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

# Load speech1.txt from the working directory as LangChain documents.
loader = TextLoader("speech1.txt")
documents = loader.load()

# Split the loaded documents into chunks (target size 1000, overlap 30).
# NOTE: chunk_size is not a hard cap here -- the recorded run warns about a
# chunk of size 1217, so individual chunks can exceed 1000.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=30,
)
docs = text_splitter.split_documents(documents)


Created a chunk of size 1217, which is longer than the specified 1000


In [7]:
# Inspect the chunks returned by text_splitter.split_documents(documents).
docs

[Document(metadata={'source': 'speech1.txt'}, page_content="Speech is the use of the human voice as a medium for language. Spoken language combines vowel and consonant sounds to form units of meaning like words, which belong to a language's lexicon. There are many different intentional speech acts, such as informing, declaring, asking, persuading, directing; acts may vary in various aspects like enunciation, intonation, loudness, and tempo to convey meaning. Individuals may also unintentionally communicate aspects of their social position through speech, such as sex, age, place of origin, physiological and mental condition, education, and experiences.\nWhile normally used to facilitate communication with others, people may also use speech without the intent to communicate. Speech may nevertheless express emotions or desires; people talk to themselves sometimes in acts that are a development of what some psychologists (e.g., Lev Vygotsky) have maintained is the use of silent speech in a

In [9]:
# Build a FAISS vector store from the chunks in `docs`, using Ollama
# embeddings with the "gemma:2b" model.
embeddings = OllamaEmbeddings(model="gemma:2b")
db = FAISS.from_documents(docs, embeddings)
db

<langchain_community.vectorstores.faiss.FAISS at 0x2bb56411750>

In [12]:
### Querying
# Look up the stored chunks most similar to the question and show the top one.
# NOTE: this rebinds `docs` (until now the split chunks that were indexed) to
# the search hits, so re-running the index-building cell after this one would
# index the hits instead of the original chunks.
query = "What do researchers study in speech?"
docs = db.similarity_search(query)
top_hit = docs[0]
top_hit.page_content


'Determining the timeline of human speech evolution is made additionally challenging by the lack of data in the fossil record. The human vocal tract does not fossilize, and indirect evidence of vocal tract changes in hominid fossils has proven inconclusive.[10]\n\nProduction\nMain articles: Speech production and Linguistics\nSpeech production is an unconscious multi-step process by which thoughts are generated into spoken utterances. Production involves the unconscious mind selecting appropriate words and the appropriate form of those words from the lexicon and morphology, and the organization of those words through the syntax. Then, the phonetic properties of the words are retrieved and the sentence is articulated through the articulations associated with those phonetic properties.[11]'

### Retriever

In [16]:
# Expose the vector store through the retriever interface and run the same
# query through it.
retriever = db.as_retriever()
docs = retriever.invoke(query)  # rebinds `docs` to the retriever's results
first_result = docs[0]
first_result.page_content

'Determining the timeline of human speech evolution is made additionally challenging by the lack of data in the fossil record. The human vocal tract does not fossilize, and indirect evidence of vocal tract changes in hominid fossils has proven inconclusive.[10]\n\nProduction\nMain articles: Speech production and Linguistics\nSpeech production is an unconscious multi-step process by which thoughts are generated into spoken utterances. Production involves the unconscious mind selecting appropriate words and the appropriate form of those words from the lexicon and morphology, and the organization of those words through the syntax. Then, the phonetic properties of the words are retrieved and the sentence is articulated through the articulations associated with those phonetic properties.[11]'

In [19]:
# Same query, but each hit comes back as a (document, score) pair.
# NOTE(review): for the default FAISS setup the score is presumably an L2
# distance (lower = closer) -- confirm against the vector store docs.
docs_and_scores = db.similarity_search_with_score(query)
best_doc, best_score = docs_and_scores[0]
(best_doc, best_score)

(Document(id='1585dd6d-924f-4de0-b9cd-3806dc581a65', metadata={'source': 'speech1.txt'}, page_content='Determining the timeline of human speech evolution is made additionally challenging by the lack of data in the fossil record. The human vocal tract does not fossilize, and indirect evidence of vocal tract changes in hominid fossils has proven inconclusive.[10]\n\nProduction\nMain articles: Speech production and Linguistics\nSpeech production is an unconscious multi-step process by which thoughts are generated into spoken utterances. Production involves the unconscious mind selecting appropriate words and the appropriate form of those words from the lexicon and morphology, and the organization of those words through the syntax. Then, the phonetic properties of the words are retrieved and the sentence is articulated through the articulations associated with those phonetic properties.[11]'),
 np.float32(2207.3184))

In [20]:
# Embed the query text on its own; the result is a plain list of floats that
# can be passed to the vector-based search.
embedding_vector = embeddings.embed_query(
    query
)
embedding_vector

[-0.42496001720428467,
 -1.128978967666626,
 -0.9913154244422913,
 1.1732068061828613,
 0.3855898678302765,
 2.634631395339966,
 0.4700370132923126,
 -1.2127798795700073,
 0.051621582359075546,
 0.012172882445156574,
 -0.619571328163147,
 -0.49495306611061096,
 1.9965109825134277,
 0.8628216981887817,
 -0.4442700147628784,
 -1.0475341081619263,
 3.568234920501709,
 0.041424740105867386,
 -0.873879611492157,
 0.8680794835090637,
 1.0577807426452637,
 -0.49058112502098083,
 0.0921226441860199,
 -0.4534042477607727,
 -1.4834386110305786,
 -1.0995372533798218,
 -0.44460397958755493,
 -0.2217162400484085,
 0.6087970733642578,
 -0.8639507293701172,
 -0.20267276465892792,
 0.772862434387207,
 0.02497190050780773,
 1.0806001424789429,
 -0.6061378121376038,
 -0.4915497601032257,
 0.03508390486240387,
 0.8466776609420776,
 0.2933919429779053,
 -1.7249910831451416,
 -1.0768465995788574,
 -0.1238497719168663,
 1.1107350587844849,
 -0.6420193910598755,
 0.8928545117378235,
 -1.2146022319793701,
 1.

In [21]:
# Search with the precomputed query embedding instead of the raw query text.
vector_hits = db.similarity_search_by_vector(embedding_vector)
vector_hits

[Document(id='1585dd6d-924f-4de0-b9cd-3806dc581a65', metadata={'source': 'speech1.txt'}, page_content='Determining the timeline of human speech evolution is made additionally challenging by the lack of data in the fossil record. The human vocal tract does not fossilize, and indirect evidence of vocal tract changes in hominid fossils has proven inconclusive.[10]\n\nProduction\nMain articles: Speech production and Linguistics\nSpeech production is an unconscious multi-step process by which thoughts are generated into spoken utterances. Production involves the unconscious mind selecting appropriate words and the appropriate form of those words from the lexicon and morphology, and the organization of those words through the syntax. Then, the phonetic properties of the words are retrieved and the sentence is articulated through the articulations associated with those phonetic properties.[11]'),
 Document(id='bd3f4ff6-766b-4e51-ab98-85dfa10604d9', metadata={'source': 'speech1.txt'}, page_con

In [23]:
### Saving and Loading
# Persist the vector store to disk under "faiss_index" (path is relative to
# the current working directory).
db.save_local(
    "faiss_index"
)