# Question-Answering with LangChain and GPT-3

## Data

Document source: the content of the web portal [It's FOSS](https://itsfoss.com/), which specializes in open-source technologies, with a particular focus on Linux.

The articles to process are listed in the site's [sitemap-posts.xml file](https://news.itsfoss.com/sitemap-posts.xml), which contains a link to every article.

In [None]:
import os

import pandas as pd
import numpy as np

import xmltodict
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


In [None]:
# Download the sitemap that lists the URL of every article.
# A timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get("https://news.itsfoss.com/sitemap-posts.xml", timeout=30)
# Stop here on an HTTP error instead of handing an error page to the XML parser.
r.raise_for_status()
xml = r.text
# force_list keeps "url" a list even when the sitemap holds a single entry;
# without it xmltodict returns a bare dict there and the comprehension below fails.
rss = xmltodict.parse(xml, force_list=("url",))

# Each <url> entry carries the article address in its <loc> child.
article_links = [entry["loc"] for entry in rss["urlset"]["url"]]

print(f"Total number of articles: {len(article_links)}")

In [None]:
def extract_content(url, timeout=30):
    """Fetch an article page and return the text of its main sections.

    The elements matching ``.c-topper__headline``, ``.c-topper__standfirst``
    and ``.c-content`` are looked up by CSS selector and their text is
    concatenated in that order.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up; passed
            to ``requests.get``.

    Returns:
        The concatenated text of the sections found on the page, or an
        empty string when none of the selectors match.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")

    selectors = (
        ".c-topper__headline",
        ".c-topper__standfirst",
        ".c-content",
    )
    # select_one returns None when a selector matches nothing (for example an
    # article without a standfirst); skip those instead of crashing on
    # None.get_text().
    elements = [soup.select_one(selector) for selector in selectors]

    return "".join(
        element.get_text() for element in elements if element is not None
    )

In [None]:
# Demo only: scrape just the first 10 links from the sitemap
articles = []
for url in tqdm(article_links[:10], desc="Extracting article content"):
    # Keep the URL next to the scraped text so each article stays traceable.
    articles.append({"source": url, "content": extract_content(url)})


In [None]:
# Preview the first scraped article as a (source, content) pair
tuple(articles[0][field] for field in ("source", "content"))

In [None]:
# One row per article, with the dict keys ("source", "content") as columns
articles_df = pd.DataFrame(data=articles)
# Show only the first five rows
articles_df.head(n=5)

## Embedding

### Splitting into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks of at most 1500 characters, with up to 150 characters of overlap
rec_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# web_docs[i] is a chunk of text; meta[i] records which article it came from
web_docs, meta = [], []
for article in tqdm(articles, desc="Splitting articles into chunks"):
    splits = rec_splitter.split_text(article["content"])
    web_docs.extend(splits)
    # Build one fresh dict per chunk. `[{...}] * len(splits)` would repeat a
    # reference to a single dict, so changing one chunk's metadata would
    # silently change it for every chunk of the article.
    meta.extend({"source": article["source"]} for _ in splits)

### Embedding chunks

In [None]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# The OPENAI_API_KEY environment variable must hold your OpenAI key before this
# cell runs; set it outside the notebook rather than hard-coding it here.
# Embed every chunk and index it in FAISS together with its source metadata.
article_store = FAISS.from_texts(
    web_docs,
    OpenAIEmbeddings(),
    metadatas=meta,
)


## Query

In [None]:
from langchain.memory import ConversationBufferMemory

# Conversation buffer whose history is exposed under the "chat_history" name.
# "question" and "answer" name the chain input/output fields to record
# (presumably needed because the chain returns more than one output — confirm).
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
)

In [None]:
from langchain import PromptTemplate

# Prompt text assembled line by line; {context}, {chat_history} and {question}
# are the placeholders declared as input variables below.
template = "\n".join((
    "You are a chatbot having a conversation with a human.",
    "Given the following extracted parts of a long document and a question,",
    "create a final answer.",
    "{context}",
    "{chat_history}",
    "Human: {question}",
    "Chatbot:",
))

# Input variables are listed in the order they appear in the template.
question_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "chat_history", "question"],
)

In [None]:
from langchain import OpenAI, PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain

# Retrieval QA chain over the article index, wired to the conversation memory
# and the custom question prompt.
# NOTE(review): question_prompt presumably drives the per-document step of the
# chain; confirm that {chat_history} is actually supplied to it at run time.
article_chain = RetrievalQAWithSourcesChain.from_llm(
    llm=OpenAI(temperature=0.0),
    # The number of chunks to retrieve has to be passed through search_kwargs;
    # a bare `k=4` keyword is not a retriever setting and never reaches the
    # similarity search.
    retriever=article_store.as_retriever(search_kwargs={"k": 4}),
    memory=memory,
    question_prompt=question_prompt,
    )

# return_only_outputs=True drops the inputs from the returned mapping.
result = article_chain(
    {"question": "What is Skiff?"},
    return_only_outputs=True
)

In [None]:
# Display what the chain returned for the question asked in the previous cell
result