In [68]:
!pip install -Uq langchain faiss-cpu langchain-community langchain-core langchain-groq PyMuPDF tiktoken bs4 fastembed tavily-python rerankers

In [69]:
import os
import time
import warnings
from langchain_community.retrievers import TavilySearchAPIRetriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from tavily.errors import InvalidAPIKeyError
from langchain.schema.runnable import Runnable
from typing import cast

In [70]:
warnings.filterwarnings('ignore')

In [71]:
def search_web(query: str) -> list:
    """Search the web."""
    try:
        tavily_tool = TavilySearchAPIRetriever(
            k=10,
            #api_key=os.getenv("TAVILY_API_KEY"),
            api_key =userdata.get('TAVILY_API_KEY'),
            max_tokens=10000,
            search_depth='advanced'
        )
        results = tavily_tool.invoke(query)
        if not results:
            return ["No relevant context found."]
        return [result.page_content for result in results]
    except InvalidAPIKeyError as e:
        return ["Invalid API key."]
    except Exception as e:
        return [f"Error occurred while fetching context: {str(e)}"]

In [72]:
# groq_key = os.getenv("GROQ_API_KEY")
# tavily_key = os.getenv("TAVILY_API_KEY")
from google.colab import userdata
groq_key = userdata.get('GROQ_API_KEY')


In [73]:
llm = ChatGroq(
        api_key=groq_key,
        model='llama-3.2-1b-preview',
        temperature=0,
        max_retries=3,
        timeout=None,
        max_tokens=1024
    )
llm.invoke("What is AI")

AIMessage(content='Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that typically require human intelligence, such as:\n\n1. Learning: AI systems can learn from data and improve their performance over time.\n2. Problem-solving: AI systems can analyze data, identify patterns, and make decisions.\n3. Reasoning: AI systems can draw conclusions based on data and rules.\n4. Perception: AI systems can interpret and understand data from sensors and other sources.\n5. Natural Language Processing (NLP): AI systems can understand and generate human language.\n\nAI can be categorized into several types, including:\n\n1. Narrow or Weak AI: Designed to perform a specific task, such as facial recognition or language translation.\n2. General or Strong AI: Designed to perform any intellectual task that a human can, such as reasoning, problem-solving, and learning.\n3. Superintelligence: Significantly more intelligent than the best human minds, with cap

In [74]:
prompt = """You are a helpful assistant.

            Answer the question according to the query and given context:
            Question: {question}
            Context: {context}
            Provide an accurate response in bullet points but don't mention it in the response,
            the answer should be brief (max 5 lines/points).
            Do not hallucinate.
    """

prompt_template = ChatPromptTemplate.from_template(prompt)

In [75]:
chain = prompt_template | llm

In [76]:
query = input("")
context = search_web(query)
response = chain.invoke({'question':query,
                         'context':context})
response.content

what is cv


"Here are the key points about CVs:\n\n• A CV is a short written summary of a person's career, qualifications, and education.\n• It is used in various fields, including academia, science, and research.\n• A CV is a detailed list of achievements, education, and work experience.\n• It is typically 1-2 pages long and provides an in-depth review of a person's professional history.\n• A CV is used for job applications in academia, science, and research, as well as in the US and EU."

## Method 1

In [77]:
!pip install rank_bm25



In [78]:
from rank_bm25 import BM25Okapi

tokenized_corpus = [doc.split() for doc in context]
bm25 = BM25Okapi(tokenized_corpus)
scores = bm25.get_scores(query.split())
scored_docs = sorted(zip(context, scores), key=lambda x: x[1], reverse=True)
top3_doc=[doc for doc, _ in scored_docs[:3]]
response = chain.invoke({'question':query,
                         'context':top3_doc})
response.content

'Here are the key points about CVs:\n\n• A CV is a comprehensive document listing academic and professional experience.\n• It is a detailed summary of your life, usually more than two pages long.\n• The main difference between a CV and a resume is its length and purpose.\n• A CV is used for academic purposes, while a resume is for job search.\n• It typically includes information about your education, work experience, and skills.'

## Method 2

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(context)
query_vector = vectorizer.transform([query])
similarity = cosine_similarity(query_vector,doc_vectors).flatten()
scored_docs = sorted(zip(context, similarity), key=lambda x: x[1], reverse=True)
top3_doc = [doc for doc, _ in scored_docs[:3]]

## Method 3

In [80]:
!pip install transformers



In [81]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

inputs = tokenizer(
        [(query, doc) for doc in context],
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}
with torch.no_grad():
      scores = model(**inputs).logits.squeeze(-1)
scored_docs = sorted(zip(context, scores.tolist()), key=lambda x: x[1], reverse=True)
top3_doc = [doc for doc, _ in scored_docs[:3]]

## Method 4

In [82]:
!pip install sentence-transformers



In [83]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
query_embedding = model.encode(query, convert_to_tensor=True)
doc_embeddings = model.encode(context, convert_to_tensor=True)
similarities = util.cos_sim(query_embedding, doc_embeddings)[0].cpu().numpy()
scored_docs = sorted(
        zip(context, similarities), key=lambda x: x[1], reverse=True
    )
top3_doc = [doc for doc, _ in scored_docs[:3]]

## Method 5

In [84]:
!pip install spacy



In [88]:
import spacy

nlp = spacy.load("en_core_web_sm")
query_doc = nlp(query)
scored_docs = sorted(
        context,
        key=lambda doc: query_doc.similarity(nlp(doc)),
        reverse=True,
    )
top3_doc = scored_docs[:3]

['What is a CV? "CV" stands for curriculum vitae. This Latin phrase is the technical meaning of a CV and stands for "course of life." So, what does a CV mean on a job application?It\'s a detailed summary of your academic and professional life — this document usually is more than two pages long.',
 'A CV is a detailed document highlighting your professional and academic history, while a resume is a summary of your skills and experience. Learn when to use a CV, how to format it and see examples of both.',
 'A curriculum vitae (CV) is a detailed list of your achievements, education, and work experience. Learn when and how to use a CV, and see a sample CV for academia.']