In [17]:
import tiktoken

def get_num_of_token(string: str, enc: "string"):
    """this returns the number of token in text string"""
    encoding = tiktoken.get_encoding(enc)
    token = len(encoding.encode(string))
    return token


get_num_of_token("Hello world", "cl100k_base")


2

In [18]:
from langchain_ollama import OllamaEmbeddings

questions= "what is javascript"
document = "JavaScript is a high-level, interpreted programming language primarily used for creating interactive web applications. It is dynamically typed, prototype-based, and supports functional, object-oriented, and imperative programming styles."
embeddings = OllamaEmbeddings(model="mxbai-embed-large")



query_result = embeddings.embed_query(questions)
document_result = embeddings.embed_query(document)
print(len(query_result))

1024


In [19]:
import numpy
def cosine_similarity(vec1, vec2):
    by_product = numpy.dot(vec1, vec2)
    norm_vec1 = numpy.linalg.norm(vec1)
    norm_vec2 = numpy.linalg.norm(vec2)

    cosine_sim = by_product / ( norm_vec1 * norm_vec2)
    return cosine_sim

cosine_similarity(query_result, document_result)

np.float64(0.8309821924823665)

In [20]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader(web_path="https://blog.jetbrains.com/webstorm/2024/10/javascript-best-practices-2024/")
data = loader.load()
print(data)

[Document(metadata={'source': 'https://blog.jetbrains.com/webstorm/2024/10/javascript-best-practices-2024/', 'title': 'JavaScript Best Practices | The WebStorm Blog', 'description': 'Learn about the latest JavaScript best practices. This blog post covers some of the recommended best practices when coding with JavaScript.', 'language': 'en_US'}, page_content='\n\n\nJavaScript Best Practices | The WebStorm Blog\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n\nTopics\n\n\nSearch\n\nBurger menu icon\n\n\nIDEs\n\nCLion\nDataGrip\nDataSpell\nFleet\nGoLand\nIntelliJ IDEA\nPhpStorm\nPyCharm\nRustRover\nRider\nRubyMine\nWebStorm\n\n\nPlugins & Services\n\nBig Data Tools\nCode With Me\nQuality Assurance\nJetBrains Platform\nScala\nToolbox App\nWriterside\nJetBrains AI\nGrazie\nJunie\n\n\nTeam Tools\n\nDatalore\nSpace\nTeamCit

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

textSplitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=1000)
splitted_docs = textSplitter.split_documents(data)

In [22]:
from pymongo import MongoClient
from os import path
import certifi
import dotenv
from langchain_mongodb import MongoDBAtlasVectorSearch
env_path = path.expanduser("~"+"/langchain-python/.env")
dotenv.load_dotenv(env_path,encoding="utf-8")
MONGODB_URI = dotenv.get_key(env_path,  "MONGODB_URI")

client = MongoClient(MONGODB_URI, tlsCAFile=certifi.where())
DB_NAME="langchain-test-db"
COLL_NAME="langchain-test-vectorstore"
ATLAS_VECTOR_SEARCH_INDEX_NAME="langchain-test-index"

collection = client[DB_NAME][COLL_NAME]


atlas_vector_search = MongoDBAtlasVectorSearch(collection=collection,embedding=embeddings, index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,relevance_score_fn="cosine")
vector_store = atlas_vector_search.from_documents(documents=splitted_docs, embedding=embeddings,collection=collection)
print(vector_store)
retriever = vector_store.as_retriever(search_kwargs={"k": 1})
print(splitted_docs)
docs = retriever.invoke("when was javascript invented ?")
docs


[Document(metadata={'source': 'https://blog.jetbrains.com/webstorm/2024/10/javascript-best-practices-2024/', 'title': 'JavaScript Best Practices | The WebStorm Blog', 'description': 'Learn about the latest JavaScript best practices. This blog post covers some of the recommended best practices when coding with JavaScript.', 'language': 'en_US'}, page_content="JavaScript Best Practices | The WebStorm Blog\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n\nTopics\n\n\nSearch\n\nBurger menu icon\n\n\nIDEs\n\nCLion\nDataGrip\nDataSpell\nFleet\nGoLand\nIntelliJ IDEA\nPhpStorm\nPyCharm\nRustRover\nRider\nRubyMine\nWebStorm\n\n\nPlugins & Services\n\nBig Data Tools\nCode With Me\nQuality Assurance\nJetBrains Platform\nScala\nToolbox App\nWriterside\nJetBrains AI\nGrazie\nJunie\n\n\nTeam Tools\n\nDatalore\nSpace\nTeamCity\nUps

[]

In [23]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model



llm = init_chat_model(model="llama3-8b-8192", model_provider="groq")
template_string = """answer the following question based on this context:  {context}
question: {question}
"""

prompt = ChatPromptTemplate.from_template(template=template_string)



In [24]:
chain = prompt | llm


chain.invoke({"context": docs, "question": "who is  Maurice Zeldman?"  })

AIMessage(content="Based on the provided context [], I couldn't find any information about Maurice Zeldman. Since the context is empty, I'm assuming it's a new question. After conducting a search, I couldn't find any notable individuals with the name Maurice Zeldman. If you could provide more context or details, I'd be happy to help you find the answer.", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 74, 'prompt_tokens': 31, 'total_tokens': 105, 'completion_time': 0.061666667, 'prompt_time': 0.006154928, 'queue_time': 0.172711921, 'total_time': 0.067821595}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_dadc9d6142', 'finish_reason': 'stop', 'logprobs': None}, id='run-9e03703f-2fd1-4772-a86d-67c9ea793501-0', usage_metadata={'input_tokens': 31, 'output_tokens': 74, 'total_tokens': 105})