In [56]:
!pip install langchain_community tiktoken langchain-groq rank_bm25 langchainhub chromadb langchain langgraph tavily-python langchain-huggingface -q

In [57]:

import os
import time
import warnings
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import Runnable
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from rank_bm25 import BM25Okapi

from typing import cast

In [58]:
# Suppress every Python warning raised from this point on (keeps cell output clean,
# but also hides deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

In [59]:
# API credentials: read them from Colab's secret store when running on Colab,
# otherwise fall back to environment variables so the notebook also runs locally.
try:
  from google.colab import userdata
except ImportError:
  groq_key = os.getenv("GROQ_API_KEY")
  tavily_key = os.getenv("TAVILY_API_KEY")
else:
  groq_key = userdata.get('GROQ_API_KEY')
  # No trailing comma here: `userdata.get(...),` would store a 1-tuple, not the key string.
  tavily_key = userdata.get('TAVILY_API_KEY')

In [60]:
# Web pages that make up the retrieval corpus.
urls = [
    "https://medium.com/@muddassir10/machine-learning-based-price-estimation-a-practical-approach-7164b35d10fd",
]
loader = WebBaseLoader(urls)

# Load each URL with its own loader (one list of documents per URL) ...
docs = list(map(lambda page_url: WebBaseLoader(page_url).load(), urls))
# ... then flatten the per-URL lists into a single list of documents.
docs_list = [page for per_url_docs in docs for page in per_url_docs]

# Splitter whose chunk size is measured with a tiktoken encoder: 250 per chunk, no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=250,
)

In [61]:
# Cut the loaded pages into chunks using the splitter configured above.
doc_splits = text_splitter.split_documents(docs_list)

# Embed the chunks with a sentence-transformers model and index them in Chroma.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vectorstore = Chroma.from_documents(
    collection_name="rag-chroma",
    documents=doc_splits,
    embedding=embedding_model,
)

# Retriever view over the vector store, using its default search settings.
retriever = vectorstore.as_retriever()

In [62]:
def reranked_docs(query,docs,k=3,min_score=.4,min_relevant=3):
  """Rank `docs` against `query` with BM25 and return the `k` best matches.

  Args:
    query: Question text; tokenised by whitespace.
    docs: Iterable of plain strings and/or objects exposing `.page_content`.
    k: Maximum number of (doc, score) pairs to return.
    min_score: A document counts as relevant when its BM25 score exceeds this.
    min_relevant: Minimum number of relevant documents required to return a ranking.

  Returns:
    A list of up to `k` `(doc, score)` tuples sorted by descending score, or the
    string "no relevant docs found" when `docs` is empty or fewer than
    `min_relevant` documents score above `min_score`.
  """
  docs = list(docs)
  # BM25Okapi divides by the corpus size, so an empty corpus must be handled up front.
  if not docs:
    return "no relevant docs found"

  # Decide per document how to get its text, so mixed str/Document input works.
  tokenized_corpus = [
      (doc if isinstance(doc, str) else doc.page_content).split() for doc in docs
  ]
  bm25 = BM25Okapi(tokenized_corpus)
  scores = bm25.get_scores(query.split())

  relevant_count = sum(1 for score in scores if score > min_score)
  if relevant_count < min_relevant:
    return "no relevant docs found"

  scored_docs = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
  # Honour the caller's `k` instead of a hard-coded 3.
  return scored_docs[:k]

In [63]:
# Groq-hosted chat model shared by every chain in this notebook.
llm = ChatGroq(
    model='llama-3.2-1b-preview',
    api_key=groq_key,
    temperature=0,
    max_tokens=512,   # cap on the length of each generated answer
    max_retries=3,
    timeout=None,     # no client-side timeout
)
# Both names refer to the same client object.
rag_llm = llm

In [64]:
# Fetch the shared "rlm/rag-prompt" template from the LangChain Hub.
rag_prompt = hub.pull("rlm/rag-prompt")

# Baseline chain: prompt -> chat model -> plain string.
rag_chain = rag_prompt | rag_llm | StrOutputParser()

In [65]:
def _format_context(doc):
  """Flatten retrieved documents into plain text for the prompt's {context} slot."""
  # Strings (including the "no relevant docs found" sentinel) and unknown
  # objects are passed through untouched.
  if isinstance(doc, str) or not isinstance(doc, (list, tuple)):
    return doc
  pieces = []
  for item in doc:
    # reranked_docs yields (document, score) pairs; the score is noise for the LLM.
    if isinstance(item, tuple) and len(item) == 2:
      item = item[0]
    pieces.append(item if isinstance(item, str) else getattr(item, "page_content", str(item)))
  return "\n\n".join(pieces)


def generate(query,doc,prompt):
  """Answer `query` with the shared LLM using `prompt` and the retrieved `doc`.

  Args:
    query: The user question, bound to the prompt's `question` variable.
    doc: Retrieved context. May be a string, or a list of documents / strings /
      (document, score) pairs; lists are rendered as plain text before prompting
      so the model does not see Python reprs, metadata or scores.
    prompt: A prompt runnable exposing `question` and `context` variables.

  Returns:
    The model's answer as parsed by `StrOutputParser`.
  """
  chain = prompt | rag_llm | StrOutputParser()
  return chain.invoke({"context": _format_context(doc), "question": query})

In [66]:
# Candidate prompt templates to compare. Each one is filled with the same two
# variables, {question} and {context}, when passed to generate().

# Variant 1: concise QA, at most three sentences.
prompt1 = '''You are an assistant for question-answering tasks.
                Use the following documents to answer the question.
                If you don't know the answer, just say that you don't know.
                Use three sentences maximum and keep the answer concise:
                Question: {question}
                Context: {context}
                Answer:
'''

# Variant 2: brief bullet-point style answer.
prompt2 = """
You are a helpful assistant.
            Answer the question according to the query and given context:
            Question: {question}
            Context: {context}
            Provide an accurate response in bullet points but don't mention it in the response,
            the answer should be brief (max 5 lines/points).
            Do not hallucinate.
"""

# Variant 3: concise QA that tells the model the context comes from a webpage.
prompt3 ="""
"You are an assistant for question-answering tasks. Use the following context extracted from a webpage to answer the question.
If you don't know the answer, just say that you don't know. Keep the answer concise and brief."
Question: {question}
Context:{context}
Answer:
"""

# Variant 4: minimal instruction, context-only answering.
prompt4 ="""
Given the context provided, respond to the question accurately, using only the information in the context.

Context:
{context}

Question:
{question}

Response:
"""


# rag_prompt is the hub template pulled earlier; the other entries are still raw strings here.
prompts = [rag_prompt,prompt1,prompt2,prompt3,prompt4]

# Wrap every raw template string in a ChatPromptTemplate.
# Entries that are already prompt objects (e.g. the hub prompt) are left alone, which
# also makes this cell safe to re-run. A malformed template now raises here instead of
# being silently skipped and failing later when the chain is built.
for prompt_id, template in enumerate(prompts):
  if isinstance(template, str):
    prompts[prompt_id] = ChatPromptTemplate.from_template(template)


In [70]:
# Question to ask, and the BM25-ranked context drawn from the document chunks.
query = "what are the main processessing steps in a ml project?"
reranked_doc = reranked_docs(query, doc_splits)

# Ask the same question once per prompt variant, showing each answer as it arrives.
responses = []
for prompt in prompts:
  response = generate(query, reranked_doc, prompt)
  print(response)
  responses.append(response)

The main processing steps in a machine learning project include:

1. Data collection and preprocessing
2. Feature engineering
3. Model selection and training
4. Model evaluation and tuning
5. Model deployment and integration

These steps are applied to various machine learning algorithms, such as linear regression, gradient boosting regressor, decision tree regressor, AdaBoost regressor, and stochastic gradient descent, to develop a comprehensive multiple domain pricing estimator.
The main processing steps in a machine learning project are:

1. Data collection and preprocessing
2. Feature engineering
3. Model selection and training
4. Model evaluation and tuning
5. Model deployment and integration

These steps are crucial in building a comprehensive multiple domain pricing estimator.
• Collect and preprocess data from various sources.
• Employ Exploratory Data Analysis (EDA) to identify patterns and outliers.
• Feature engineer and scale the data for algorithmic modeling.
• Apply domai

In [71]:
def get_best_response(responses, question=None):
  """Return the response whose text best matches the question under BM25.

  Args:
    responses: Candidate answer strings; tokenised by whitespace.
    question: Text to score the candidates against. When omitted, the
      module-level `query` variable is used.

  Returns:
    The highest-scoring response; on ties, the earliest one in `responses`.

  Raises:
    ValueError: If `responses` is empty.
  """
  responses = list(responses)
  if not responses:
    raise ValueError("responses must contain at least one candidate")
  if question is None:
    # Fall back to the notebook-level query for existing one-argument callers.
    question = query

  tokenized_corpus = [response.split() for response in responses]
  bm25 = BM25Okapi(tokenized_corpus)
  scores = bm25.get_scores(question.split())
  # max() keeps the first of equal scores, matching a stable descending sort.
  best_response, _ = max(zip(responses, scores), key=lambda pair: pair[1])
  return best_response

In [72]:
# Pick the answer that scores highest against the query and show it.
best_response = get_best_response(responses)
print(best_response)

The main processing steps in a machine learning (ML) project typically involve the following:

1. **Data Collection**: Gathering relevant data from various sources, including datasets, APIs, or user input.
2. **Data Preprocessing**: Cleaning, transforming, and preparing the data for modeling by handling missing values, encoding categorical variables, and scaling/normalizing the data.
3. **Feature Engineering**: Creating new features or transforming existing ones to improve model performance, such as extracting relevant information from text data or creating new variables.
4. **Model Selection**: Choosing the most suitable machine learning algorithm or model type for the problem, considering factors like data type, complexity, and performance requirements.
5. **Model Training**: Training the selected model using the preprocessed data, often involving iterative optimization and hyperparameter tuning.
6. **Model Evaluation**: Assessing the performance of the trained model using metrics li