In [21]:
import xmltodict
import requests

r = requests.get("https://news.itsfoss.com/sitemap-posts.xml")
print(r.text)
xml = r.text
rss = xmltodict.parse(xml)

article_links = [entry["loc"] for entry in rss["urlset"]["url"]]



In [22]:
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


def extract_content(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, features="html.parser")

    elements = [
        soup.select_one(".c-topper__headline"),
        soup.select_one(".c-topper__standfirst"),
        soup.select_one(".c-content"),
    ]

    text = "".join([element.get_text() for element in elements])

    return text


articles = []
# Limited the list of > 900 articles to 10 for this example
for url in tqdm(article_links[0:10], desc="Extracting article content"):
    articles.append({"source": url, "content": extract_content(url)})

Extracting article content:   0%|          | 0/10 [00:00<?, ?it/s]

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

rec_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, 
                                              chunk_overlap=150)

web_docs, meta = [], []

for article in tqdm(articles, desc="Splitting articles into chunks"):
    splits = rec_splitter.split_text(article["content"])
    web_docs.extend(splits)
    meta.extend([{"source": article["source"]}] * len(splits))

Splitting articles into chunks:   0%|          | 0/10 [00:00<?, ?it/s]

In [25]:
# from transformers import BertModel, BertTokenizer
# import torch

# class EmbedingWrapper():
#     def __init__(self) -> None:
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         self.model = BertModel.from_pretrained('bert-base-uncased')
#         self.model.eval()

#     def __call__(self, text):
#         inputs = self.tokenizer(text, return_tensors="pt")
#         with torch.no_grad():
#             outputs = self.model(**inputs)
#             return outputs.last_hidden_state[:, 0, :].flatten().tolist()

# Embed = EmbedingWrapper()


In [26]:
# print(len(Embed("hee")))
# print(len(Embed("dsadasda")))
# print(len(Embed("dsadasd")))

# vector_size = 768


In [27]:
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance, for testing, CI/CD

collection_name = "my_collection"

In [28]:
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=rest.VectorParams(
                    size=384,
                    distance=rest.Distance.COSINE,
                ),
)

# for a,b in zip(web_docs,meta):
#     print(a)
#     print("------------")
#     print(b)
#     print("------------")


# for doc, meta,index in tqdm(zip(web_docs, meta,range(len(web_docs))), desc="Indexing documents"):
#     qdrant.upsert(
#         collection_name=collection_name,
#         wait=True,
#         points=[rest.PointStruct(id=index,payload={"source":meta["source"],"doc":doc}, vector=Embed(doc))],
#     )

True

In [29]:
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2", model_kwargs={"device": "cpu"}
    )
qdrant_v = Qdrant(client=qdrant,collection_name=collection_name,embeddings=embeddings)

In [31]:
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    return_messages=True,
)

In [32]:
from langchain import PromptTemplate
template = """You are a chatbot having a conversation with a human.
Given the following extracted parts of a long document and a question, 
create a final answer.
{context}

Human: {question}
Chatbot:"""

question_prompt = PromptTemplate(
    input_variables=[ "question", "context"], template=template
)

In [40]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import Ollama
from langchain.chains import ReduceDocumentsChain


llm = Ollama(model="llama2")

article_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=qdrant_v.as_retriever(),
    memory=memory,
    # question_prompt=question_prompt,
    max_tokens_limit=100,
    reduce_k_below_max_tokens=True
)


result = article_chain({"question": "What is Skiff?"}, 
                        return_only_outputs=True)
print(result)

{'answer': 'The final answer to the question "What did the president say about Michael Jackson?" is that he did not mention Michael Jackson at all in any of the provided texts. Therefore, the source for this question is blank (0-pl).', 'sources': ''}
