In [None]:
!pip install streamlit langchain langchain_community unstructured faiss-cpu

In [52]:
!pip install -qU langchain-groq

In [39]:
!pip install -qU langchain-ollama

In [48]:
import os
import time
import pickle
import langchain
import streamlit as st
from google.colab import userdata
from langchain_groq import ChatGroq
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain

In [49]:
# Read the Groq API key from Colab's user secrets and publish it as an
# environment variable (presumably where ChatGroq looks for it — confirm).
GROQ_API_KEY = userdata.get('GROQ_API_KEY')
os.environ.update(GROQ_API_KEY=GROQ_API_KEY)

In [50]:
# Chat model served through Groq. temperature=0 is passed so that answers are
# as repeatable as the model allows.
llm = ChatGroq(temperature=0, model="llama3-8b-8192")

### (1) Load data

In [53]:
# Download the source pages and parse them into LangChain documents.
loaders = UnstructuredURLLoader(
    urls=[
        "https://en.wikipedia.org/wiki/Elon_Musk",
        "https://en.wikipedia.org/wiki/Cristiano_Ronaldo",
        # Alternative news-article sources (disabled):
        # "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
        # "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html"
    ]
)
data = loaders.load()

# Number of documents loaded (the recorded run shows one per URL).
len(data)

2

### (2) Split data to create chunks

In [54]:
# Chunking settings: chunk_size=1000 with a chunk_overlap of 200, so that
# neighbouring chunks share some text across their boundary.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# `data` already holds Document objects, so split_documents() is used (rather
# than split_text()) to chunk them directly.
docs = text_splitter.split_documents(data)

In [55]:
# Number of chunks produced by the splitter.
len(docs)

519

In [56]:
# Inspect the first chunk. In the recorded output it is the page's list of
# language names rather than article prose.
docs[0]

Document(metadata={'source': 'https://en.wikipedia.org/wiki/Elon_Musk'}, page_content='Elon Musk\n\nAfrikaans\n\nአማርኛ\n\nÆnglisc\n\nالعربية\n\nAragonés\n\nԱրեւմտահայերէն\n\nঅসমীয়া\n\nAsturianu\n\nअवधी\n\nAzərbaycanca\n\nتۆرکجه\n\nBasa Bali\n\nবাংলা\n\n閩南語 / Bân-lâm-gú\n\nБеларуская\n\nБеларуская (тарашкевіца)\n\nभोजपुरी\n\nBikol Central\n\nБългарски\n\nBoarisch\n\nBosanski\n\nBrezhoneg\n\nБуряад\n\nCatalà\n\nЧӑвашла\n\nCebuano\n\nČeština\n\nCymraeg\n\nDansk\n\nالدارجة\n\nDeitsch\n\nDeutsch\n\nडोटेली\n\nEesti\n\nΕλληνικά\n\nEspañol\n\nEsperanto\n\nEuskara\n\nفارسی\n\nFrançais\n\nGaeilge\n\nGaelg\n\nGalego\n\nગુજરાતી\n\n客家語 / Hak-kâ-ngî\n\n한국어\n\nHausa\n\nՀայերեն\n\nहिन्दी\n\nHrvatski\n\nIdo\n\nBahasa Indonesia\n\nInterlingua\n\nInterlingue\n\nIsiZulu\n\nÍslenska\n\nItaliano\n\nעברית\n\nJawa\n\nಕನ್ನಡ\n\nKapampangan\n\nქართული\n\nकॉशुर / کٲشُر\n\nҚазақша\n\nKernowek\n\nKiswahili\n\nKreyòl ayisyen\n\nKurdî\n\nКыргызча\n\nLadin\n\nລາວ\n\nLatina\n\nLatviešu\n\nLëtzebuergesch\n\nLietuvių\n\n

### (3) Create embeddings for these chunks and save them to a FAISS index

In [57]:
# Embedding model for the chunks: the Hugging Face model 'all-MiniLM-L6-v2'
# (loaded through HuggingFaceEmbeddings, not OpenAI embeddings).
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Embed every chunk and build the FAISS vector index from the results.
vector_index = FAISS.from_documents(documents=docs, embedding=embeddings)

In [58]:
# Serialize the FAISS vector index to a local pickle file.
file_path = "wiki_index.pkl"
with open(file_path, "wb") as index_file:
    pickle.dump(vector_index, index_file)

In [59]:
# Reload the pickled vector index from `file_path`.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load index files that this notebook wrote itself.
if not os.path.exists(file_path):
    # Fail here with a clear message instead of silently leaving `vectorIndex`
    # undefined (or stale from an earlier run), which would only surface later
    # as a NameError or as answers drawn from the wrong index.
    raise FileNotFoundError(f"Vector index file not found: {file_path}")

with open(file_path, "rb") as index_file:
    vectorIndex = pickle.load(index_file)

### (4) Retrieve the chunks most similar to a given question and call the LLM to produce the final answer

In [60]:
# Build the question-answering chain: the reloaded FAISS index is exposed as a
# retriever and paired with the Groq LLM.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

# Display the assembled chain for inspection.
chain



In [63]:
# Question to answer from the indexed pages.
query = "which team does cristiano ronaldo play now"
# query = "what are the main features of punch iCNG?"

# Optional: switch on LangChain's global debug flag before running the chain.
# langchain.debug=True

# Run the chain with invoke(); calling the chain object directly
# (Chain.__call__) is deprecated in LangChain. return_only_outputs=True is
# passed so the result holds only the chain's outputs ('answer' and 'sources'
# in the recorded run).
chain.invoke({"question": query}, return_only_outputs=True)

{'answer': 'Cristiano Ronaldo plays for Al-Nassr, a Saudi Pro League club, and also captains the Portugal national team.\n',
 'sources': 'https://en.wikipedia.org/wiki/Cristiano_Ronaldo'}