In [None]:
!pip install transformers sentence-transformers datasets rouge-score streamlit evaluate

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import evaluate # Import evaluate directly
import numpy as np
import streamlit as st  # For optional UI later

In [None]:
# Load the knowledge base: each non-empty line of the text file is one chunk.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
with open('/content/LangChain_Bot_dataset.txt', 'r', encoding='utf-8') as file:
  # Strip surrounding whitespace (including the trailing newline) and drop
  # blank lines so that empty strings are never embedded or retrieved.
  dataset= [line.strip() for line in file if line.strip()]

print(f'Loaded {len(dataset)} lines from file.')

In [None]:
Embedding_model= 'all-MiniLM-L6-v2'
embedder= SentenceTransformer(Embedding_model)

# Encode every chunk in a single batched call instead of one model call per
# chunk; encode() accepts a list of sentences and returns one embedding per
# input, in the same order.
chunk_embeddings= embedder.encode(dataset)

# In-memory "vector database": a list of (chunk_text, embedding) tuples.
# Rebuilt from scratch on every run so the cell is safe to re-execute.
Vector_DB= []
for chunk, embedding in zip(dataset, chunk_embeddings):
  Vector_DB.append((chunk, embedding))
  print(f'Add chunks to DB (shape: {embedding.shape})')

In [None]:
def cosine_similarity(a, b):
  """Return the cosine similarity between vectors ``a`` and ``b``.

  Computed as ``dot(a, b) / (||a|| * ||b||)``.

  Args:
    a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
    b: Second vector, same length as ``a``.

  Returns:
    The cosine similarity, or ``0.0`` when either vector has zero norm.
    Without that guard the division is 0/0, which yields NaN (plus a
    RuntimeWarning); NaN compares unordered and would make any ranking
    based on these scores unreliable.
  """
  denominator = np.linalg.norm(a) * np.linalg.norm(b)
  if denominator == 0:
    return 0.0
  return np.dot(a, b) / denominator

In [None]:
def retrieve(query, top_n= 3):
  """Return the ``top_n`` stored chunks most similar to ``query``.

  The query is embedded with ``embedder`` and scored against every
  ``(chunk, embedding)`` pair in ``Vector_DB`` using ``cosine_similarity``.

  Returns:
    A list of ``[chunk, similarity]`` pairs ordered from most to least
    similar, truncated to ``top_n`` entries.
  """
  query_vector = embedder.encode([query])[0]

  # Score every stored chunk against the query.
  scored = [
    [text, cosine_similarity(query_vector, vector)]
    for text, vector in Vector_DB
  ]

  # Highest similarity first; sorted() is stable, so ties keep DB order.
  ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
  return ranked[:top_n]

In [None]:
# Text-generation model used to produce the final answer.
Generation_model = 'gpt2'
generator = pipeline('text-generation', model= Generation_model)

def generate_ans(query, retrieved_knowledge):
  """Generate an answer to ``query`` from the retrieved chunks.

  Args:
    query: The user's question.
    retrieved_knowledge: Iterable of ``(chunk, similarity)`` pairs; only the
      chunk text is placed into the prompt.

  Returns:
    The text the model generated after the prompt, with surrounding
    whitespace stripped.
  """
  # Strip each chunk so a trailing newline on a chunk does not put blank
  # lines between the bullet points of the context.
  context = '\n'.join([f'- {chunk.strip()}' for chunk, _ in retrieved_knowledge])
  prompt= f'''You are a helpful LangChain documentation assistant.
  Use ONLY the following context to answer the technical question. Be concise, accurate, and code-focused for developers.
  context: {context}
  Question: {query}
  Answer:'''

  # return_full_text=False makes the pipeline return only the newly generated
  # continuation. Splitting the full text on 'Answer:' and keeping the last
  # piece would throw away the real answer whenever the model (or the
  # retrieved context) emits another 'Answer:' marker.
  response = generator(prompt, max_new_tokens= 50, num_return_sequences= 1, do_sample= True, temperature= 0.1, return_full_text= False)[0]['generated_text']
  answer = response.strip()
  return answer

In [None]:
# Interactive demo: ask one question, show the retrieved chunks and the answer.
input_query= input('Ask any Langchain doc Question:').strip()

if input_query:
  retrieved= retrieve(input_query)
  print('Retrieved Info: ')
  for chunk, sim in retrieved:
    # Chunks may still carry a trailing newline from the source file; strip
    # it so the listing has no blank line after every entry.
    print(f'- (similarity {sim:.2f}) {chunk.strip()}')

  generated_ans= generate_ans(input_query, retrieved)
  print('\nGenerated Answer: ', generated_ans)
else:
  # Nothing was typed: skip retrieval and generation instead of embedding
  # an empty string and generating an answer to no question.
  print('No question entered.')

In [None]:
# ROUGE metric used to compare generated answers with reference answers.
rouge = evaluate.load('rouge')

# (question, reference answer) pairs for a small end-to-end evaluation.
test_queries = [
    ("What is LCEL in LangChain?", "LCEL is LangChain Expression Language for composing chains declaratively."),
    ("How do agents work?", "Agents use LLMs for reasoning and tools for actions like ReAct."),
    ("Explain retrieval in LangChain.", "Retrieval enables RAG by embedding and fetching relevant doc chunks."),
    ("What are chains in LangChain?", "Chains are sequences of calls to LLMs or tools combined for a task."),
    ("What is a retriever?", "A retriever is an interface that returns relevant documents given a query."),
    ("How does memory work in LangChain?", "Memory lets chains and agents persist state across interactions."),
    ("What is RAG?", "RAG stands for Retrieval-Augmented Generation, combining retrieval with LLM outputs."),
    ("What are tools in LangChain?", "Tools are external functions or APIs that agents can call to take actions."),
    ("What is LangSmith?", "LangSmith is a platform for debugging, testing, and monitoring LangChain apps."),
    ("How do callbacks work?", "Callbacks let you log, stream, or monitor events during chain or agent execution.")
]

# For each pair: retrieve context, generate an answer, and score it against
# the reference with ROUGE-L (one prediction per compute() call).
scores = [
    rouge.compute(
        predictions=[generate_ans(question, retrieve(question))],
        references=[reference],
    )['rougeL']
    for question, reference in test_queries
]
print(f'Avg ROUGE-L Score: {np.mean(scores):.2f}')

In [None]:
def streamlit_app():
    """Render a minimal Streamlit page for asking questions.

    Shows a title and a text box; once a non-empty question is entered, the
    retrieved chunks and the generated answer are written to the page.
    """
    st.title('RAG LangChain DocBot')
    query = st.text_input('Ask about LangChain:')

    # Nothing to do until the user has typed a question.
    if not query:
        return

    hits = retrieve(query)
    answer = generate_ans(query, hits)
    chunk_texts = [text for text, _ in hits]
    st.write('Retrieved:', chunk_texts)
    st.write('Answer:', answer)


if __name__ == '__main__':
    streamlit_app()