In [None]:
!pip install langchain
!pip install langchain_community
!pip install faiss-gpu
!pip install langchain-text-splitters
!pip install sentence-transformers

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader

# Read the CSV into a list of LangChain documents (`data` is consumed by the
# splitting cell further down).
loader = CSVLoader(file_path="/content/2019.csv", encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

# Show a count and a short preview instead of dumping every document into the
# cell output, which bloats the notebook and hides the narrative.
print(f"Loaded {len(data)} documents")
data[:3]

[Document(page_content='Overall rank: 1\nCountry or region: Finland\nScore: 7.769\nGDP per capita: 1.340\nSocial support: 1.587\nHealthy life expectancy: 0.986\nFreedom to make life choices: 0.596\nGenerosity: 0.153\nPerceptions of corruption: 0.393', metadata={'source': '/content/2019.csv', 'row': 0}), Document(page_content='Overall rank: 2\nCountry or region: Denmark\nScore: 7.600\nGDP per capita: 1.383\nSocial support: 1.573\nHealthy life expectancy: 0.996\nFreedom to make life choices: 0.592\nGenerosity: 0.252\nPerceptions of corruption: 0.410', metadata={'source': '/content/2019.csv', 'row': 1}), Document(page_content='Overall rank: 3\nCountry or region: Norway\nScore: 7.554\nGDP per capita: 1.488\nSocial support: 1.582\nHealthy life expectancy: 1.028\nFreedom to make life choices: 0.603\nGenerosity: 0.271\nPerceptions of corruption: 0.341', metadata={'source': '/content/2019.csv', 'row': 2}), Document(page_content='Overall rank: 4\nCountry or region: Iceland\nScore: 7.494\nGDP pe

In [None]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Break the loaded documents into chunks (size 500, overlap 20) before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=500,
)
text_chunks = text_splitter.split_documents(data)

# Sanity check: how many chunks were produced.
print(len(text_chunks))

156


In [None]:
import os

# Load the Sentence Transformers embedding model from Hugging Face
embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

# Convert the text chunks into embeddings and index them in a FAISS vector store
docsearch = FAISS.from_documents(text_chunks, embeddings)

DB_FAISS_PATH = "vectorstore/db_faiss"
# Always persist the index that was just built. Saving only when the directory
# was missing left a stale index on disk after any re-run with changed data or
# a different embedding model.
os.makedirs(DB_FAISS_PATH, exist_ok=True)
docsearch.save_local(DB_FAISS_PATH)

# Vector store-backed retriever


In [None]:
# Wrap the FAISS store as a retriever and run a first query against it.
# NOTE: `docs` is read again by the later cell that builds the QA contexts.
retriever = docsearch.as_retriever()
docs = retriever.get_relevant_documents(
    "what is the score of Finland?"
)
# Number of documents returned for the query.
len(docs)

4

In [None]:
# # Test
# query = "Which country has a score of 5.631?"

# doc = docsearch.similarity_search(query, k=3)

# print("Result", doc)

# MultiQueryRetriever:

In [None]:
import os
from getpass import getpass
# Prompt for the Hugging Face Hub token interactively so that it never appears
# in the notebook source or its saved outputs.
HUGGINGFACEHUB_API_TOKEN = getpass("Hugging Face Hub API token: ")

# Export the token through the environment for the cells below.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
# SECURITY: a literal API token used to be pasted here in a comment. It has been
# removed; treat that token as compromised and revoke it in the account settings.

··········


In [None]:
from langchain import PromptTemplate , HuggingFaceHub , LLMChain
from langchain.retrievers.multi_query import MultiQueryRetriever

# Hosted Mixtral instruct model used to rewrite the user question.
llm = HuggingFaceHub(
    repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1',
    # Ask the endpoint for the completion only. Without this the response
    # echoes the prompt, and MultiQueryRetriever then treats every line of its
    # own instructions as an additional search query.
    model_kwargs={"return_full_text": False},
)

# Retriever that asks the LLM for several rephrasings of the question and
# queries the FAISS store with each of them.
# NOTE: the name `qa_chain` is rebound to an LLMChain in a later cell.
qa_chain = MultiQueryRetriever.from_llm(llm=llm, retriever=docsearch.as_retriever())


In [None]:
# Set logging for the queries
import logging

# Turn on INFO logging for the multi-query retriever's logger.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

# Run the multi-query retriever for the sample question.
question = "What is the score of Finland?"
unique_docs = qa_chain.get_relevant_documents(query=question)


INFO:langchain.retrievers.multi_query:Generated queries: ['You are an AI language model assistant. Your task is ', '    to generate 3 different versions of the given user ', '    question to retrieve relevant documents from a vector  database. ', '    By generating multiple perspectives on the user question, ', '    your goal is to help the user overcome some of the limitations ', '    of distance-based similarity search. Provide these alternative ', '    questions separated by newlines. Original question: What is the score of Finland?', '', '1. What is the current score of the Finnish national team?', '2. Can you tell me the latest result of a match involving Finland?', '3. In what way did Finland perform in their most recent game?']


In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt text for question answering; `{contexts}` and `{query}` are filled in
# when the chain is called.
qa_template = """You are a helpful assistant who answers user queries using the
    contexts provided. If the question cannot be answered using the information
    provided say "I don't know".

    Contexts:
    {contexts}

    Question: {query}"""

QA_PROMPT = PromptTemplate(
    template=qa_template,
    input_variables=["query", "contexts"],
)

# Bind the prompt to the LLM. This rebinds `qa_chain`, which previously held
# the multi-query retriever.
qa_chain = LLMChain(prompt=QA_PROMPT, llm=llm)

In [None]:
# Join the page contents of `docs` into one context string, separated by
# "---" lines, then run the QA chain on the question.
contexts = "\n---\n".join(d.page_content for d in docs)
out = qa_chain(inputs={"query": question, "contexts": contexts})

# Display the chain's text output.
out["text"]

'You are a helpful assistant who answers user queries using the\n    contexts provided. If the question cannot be answered using the information\n    provided say "I don\'t know".\n\n    Contexts:\n    Overall rank: 1\nCountry or region: Finland\nScore: 7.769\nGDP per capita: 1.340\nSocial support: 1.587\nHealthy life expectancy: 0.986\nFreedom to make life choices: 0.596\nGenerosity: 0.153\nPerceptions of corruption: 0.393\n---\nOverall rank: 53\nCountry or region: Latvia\nScore: 5.940\nGDP per capita: 1.187\nSocial support: 1.465\nHealthy life expectancy: 0.812\nFreedom to make life choices: 0.264\nGenerosity: 0.075\nPerceptions of corruption: 0.064\n---\nOverall rank: 55\nCountry or region: Estonia\nScore: 5.893\nGDP per capita: 1.237\nSocial support: 1.528\nHealthy life expectancy: 0.874\nFreedom to make life choices: 0.495\nGenerosity: 0.103\nPerceptions of corruption: 0.161\n---\nOverall rank: 42\nCountry or region: Lithuania\nScore: 6.149\nGDP per capita: 1.238\nSocial support: 

# Contextual compression


In [None]:
# Helper function for printing docs


def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Each entry is rendered as ``Document <n>:`` (numbered from 1), a blank
    line, then the page content. Entries are separated by a line of 100
    dashes. Nothing is returned.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append("Document " + str(position) + ":\n\n" + document.page_content)
    print(divider.join(sections))

In [None]:
# Query the plain retriever again (rebinding `docs`) and print the
# uncompressed results.
docs = retriever.get_relevant_documents("What is the score of Finland?")
pretty_print_docs(docs)

Document 1:

Overall rank: 1
Country or region: Finland
Score: 7.769
GDP per capita: 1.340
Social support: 1.587
Healthy life expectancy: 0.986
Freedom to make life choices: 0.596
Generosity: 0.153
Perceptions of corruption: 0.393
----------------------------------------------------------------------------------------------------
Document 2:

Overall rank: 53
Country or region: Latvia
Score: 5.940
GDP per capita: 1.187
Social support: 1.465
Healthy life expectancy: 0.812
Freedom to make life choices: 0.264
Generosity: 0.075
Perceptions of corruption: 0.064
----------------------------------------------------------------------------------------------------
Document 3:

Overall rank: 55
Country or region: Estonia
Score: 5.893
GDP per capita: 1.237
Social support: 1.528
Healthy life expectancy: 0.874
Freedom to make life choices: 0.495
Generosity: 0.103
Perceptions of corruption: 0.161
----------------------------------------------------------------------------------------------------
Doc

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Hosted Mixtral instruct model used to extract the relevant passages.
llm = HuggingFaceHub(
    repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1',
    # Ask the endpoint for the completion only. Without this the response
    # echoes the extraction prompt, so every "compressed" document contains
    # the full instructions and original context, and the result never
    # consists of just the extracted passage (or the NO_OUTPUT marker).
    model_kwargs={"return_full_text": False},
)

# Wrap the base retriever so each retrieved document is passed through the
# LLM-based extractor before being returned.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.get_relevant_documents(
    "What is the score of Finland?"
)
pretty_print_docs(compressed_docs)





Document 1:

Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: What is the score of Finland?
> Context:
>>>
Overall rank: 1
Country or region: Finland
Score: 7.769
GDP per capita: 1.340
Social support: 1.587
Healthy life expectancy: 0.986
Freedom to make life choices: 0.596
Generosity: 0.153
Perceptions of corruption: 0.393
>>>
Extracted relevant parts:
Finland
Score: 7.769
----------------------------------------------------------------------------------------------------
Document 2:

Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: What is the score of Finland?
> Context:
>>>
Overall rank