In [1]:
import os
import random
import requests
import fiass
from bs4 import BeautifulSoup
from google.oauth2 import service_account  
from googleapiclient.discovery import build 
import langchain
from langchain_openai import OpenAI
from langchain.schema import Document
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain

In [2]:
import json

# Notebook settings read from config.json in the working directory.
# `config` stays an empty dict when the file is absent; the problem is
# reported on stdout rather than raised.
config = {}
try:
    file = open("config.json")
except FileNotFoundError:
    print("Error: config.json file not found")
else:
    # Parse only once the file is known to exist; the handle is closed on exit.
    with file:
        config = json.load(file)

In [3]:
# Load the API key from the JSON file
def load_api_key(path='config.json'):
    """Return the value stored under ``"api_key"`` in a JSON config file.

    Args:
        path: Location of the JSON config file. Defaults to ``config.json``
            in the current working directory.

    Returns:
        The value found under the top-level ``"api_key"`` entry.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the file has no ``"api_key"`` entry.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as file:
        settings = json.load(file)
    try:
        return settings["api_key"]
    except KeyError:
        # Same exception type as before, but say which file is missing the key.
        raise KeyError(f'"api_key" entry is missing from {path}') from None

# Read the key once here; later cells pass `api_key` to the OpenAI LLM and
# embeddings clients. Raises if config.json or its "api_key" entry is missing.
api_key = load_api_key()

In [4]:
# Web pages that make up the knowledge base. Each URL is recorded as the
# "source" of the document built from it, so it can be cited in answers.
urls = [
    "https://businessday.ng/news/article/nigerias-biggest-single-local-investment-faces-age-old-threats/",
    "https://en.wikipedia.org/wiki/Tesla,_Inc.",
    "https://www.macrotrends.net/stocks/charts/TSLA/tesla/net-worth",
    "https://edition.cnn.com/2024/07/21/politics/inside-bidens-exit-from-2024-race/index.html",
    "https://edition.cnn.com/politics/live-news/joe-biden-election-drop-out-07-22-24/index.html",
    "https://edition.cnn.com/2024/07/22/world/world-reaction-biden-exit-2024-us-presidential-race/index.html",
    "https://edition.cnn.com/2024/07/21/politics/kamala-harris-biden-endorsement-democratic-nominee/index.html"
]

def fetch_content_from_urls(urls, timeout=30):
    """Download each URL and wrap its extracted text in a ``Document``.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait for each request before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            server.

    Returns:
        A list with one ``Document`` per URL that was fetched successfully,
        in input order. ``page_content`` is the text returned by
        ``BeautifulSoup.get_text()`` and ``metadata`` is ``{"source": url}``.
        URLs that fail (connection error, timeout, or a 4xx/5xx response) are
        reported on stdout and skipped, so one bad page neither aborts the
        run nor gets its error page indexed as knowledge.
    """
    documents = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            # A 403/404 body is an error page, not article text.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Warning: skipping {url}: {exc}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.get_text()

        # Keep the URL alongside the text so answers can cite their source.
        documents.append(
            Document(
                page_content=content,
                metadata={"source": url}
            )
        )
    return documents

In [5]:
# Fetch the knowledge-base pages: one HTTP request per entry in `urls`,
# yielding a list of Documents tagged with their source URL.
data = fetch_content_from_urls(urls)

In [6]:
# Sanity check: how many documents came back from the fetch.
len(data)

7

In [7]:
# Completion model used to answer questions over the retrieved chunks.
# NOTE(review): temperature=0.9 is a high-randomness setting for a factual,
# source-cited QA task; consider lowering it. Left unchanged here.
llm = OpenAI(openai_api_key=api_key, temperature=0.9, max_tokens=500)

# Break each page into overlapping pieces (size 1000, overlap 200) so that
# text cut at a chunk boundary still appears intact in a neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

docs = text_splitter.split_documents(data)

# Number of chunks produced from the fetched pages.
len(docs)

653

In [9]:
# Embedding client used to vectorise every chunk (calls the OpenAI API).
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

# Embed all chunks and load them into a FAISS similarity index.
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

In [10]:
# Expose the FAISS index as a retriever, then wire it to the LLM in a
# question-answering chain that also reports which sources were used.
retriever = vectorindex_openai.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(retriever=retriever, llm=llm)

# Display the assembled chain configuration.
chain



In [12]:
query = "is biden still contesting to be american president?"

# Turn on LangChain's verbose tracing so every intermediate chain/LLM call is
# printed. This produces a very large output; switch it off (or clear the
# cell output) before sharing the notebook.
langchain.debug = True

# Calling the chain object directly (`chain({...})`) is deprecated; `invoke`
# is the supported entry point and accepts the same input and flag.
# With return_only_outputs=True the result holds just "answer" and "sources".
chain.invoke({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "is biden still contesting to be american president?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Read more about the economy under Biden here.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n            Link Copied!\n        \n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAnswers to some key questions about Biden’s decision to exit the race\n\n                    From CNN's Jack Forrest and Danya Gainor\n                \n\n\n\n\n\n\n\n\n\nPresident Joe Biden departs the White House on July 15, in Washington, DC.\n\nKevin Dietsch/Getty Images\n\n\n\n            With just over 100 days until 

{'answer': ' No, Joe Biden announced in July 2024 that he is no longer contesting to be the American president.\n',
 'sources': 'https://edition.cnn.com/politics/live-news/joe-biden-election-drop-out-07-22-24/index.html'}