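Trying LCEL: answering news questions over CNN RSS articles with a LangChain Expression Language (LCEL) retrieval chain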

In [83]:
pip install langchain bs4 sentence_transformers feedparser newspaper3k --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [84]:
from langchain.llms import Ollama
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.prompts import ChatPromptTemplate

import feedparser
from requests.exceptions import Timeout
import json
from datetime import datetime, timedelta

from newspaper import Article


In [85]:
# RSS feeds to harvest, as [display name, feed URL] pairs.
# Every feed lives under the same CNN RSS root, so only the file name varies.
_CNN_RSS_ROOT = "http://rss.cnn.com/rss/"

feeds = [
    [feed_label, _CNN_RSS_ROOT + rss_file]
    for feed_label, rss_file in (
        ("CNN Top Stories", "cnn_topstories.rss"),
        ("CNN World", "cnn_world.rss"),
        ("CNN US", "cnn_us.rss"),
        ("CNN Business", "money_latest.rss"),
        ("CNN Politics", "cnn_allpolitics.rss"),
        ("CNN Tech", "cnn_tech.rss"),
        ("CNN Health", "cnn_health.rss"),
        ("CNN Entertainment", "cnn_showbiz.rss"),
        ("CNN Travel", "cnn_travel.rss"),
    )
]

In [86]:
def _is_collectable(entry):
    """Decide whether a feed entry should be harvested.

    An entry qualifies when its publish date parses into a ``datetime``; when the
    date is missing or malformed it still qualifies as long as it has a non-empty
    title. The "published within the last day" filter is intentionally disabled,
    so the parsed date is only used as a validity check.
    """
    try:
        datetime(*entry.published_parsed[:6])
        return True
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Missing attribute, None, or out-of-range date fields: fall back to the
        # title. `.get` keeps a title-less entry from raising inside the handler.
        return entry.get("title", "") != ""


count = 0  # kept for compatibility with the original cell; not used by this cell
links = []
seen_links = set()  # the feeds overlap, so remember URLs that were already collected

for feed_name, feed_url in feeds:
    feed = feedparser.parse(feed_url)

    # feedparser reports fetch/parse problems through the `bozo` flag instead of raising.
    if feed.get("bozo") and not feed.entries:
        print("FEED COLLECTION ERROR:", feed_name, feed.get("bozo_exception"))
        continue

    for entry in feed.entries:
        if not _is_collectable(entry):
            continue

        try:
            href = entry.links[0].href
        except (AttributeError, IndexError, KeyError) as e:
            # Entry has no usable link; report it and move on (best effort).
            print("ARTICLE COLLECTION ERROR:", e)
            continue

        if href not in seen_links:
            seen_links.add(href)
            links.append(href)

In [87]:
# Chat model: the "mistral" model accessed through the Ollama wrapper.
model = Ollama(model="mistral")

# Embedding model settings: run on CPU and request normalized embeddings.
embedder_name = "BAAI/bge-base-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

embedder = HuggingFaceBgeEmbeddings(
    model_name=embedder_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Articles are cut into chunks of at most 500 characters with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# Download every collected article, split its text into chunks, and prefix each
# chunk with a small header (title, URL, publish date) so that the source of an
# excerpt travels with the text that gets embedded and retrieved.
split_data = []
for link in links:
    try:
        article = Article(link)
        article.download()
        article.parse()
    except Exception:
        # Best effort: report the failure and skip this link entirely, so no
        # header is built from a missing or half-initialised article.
        print("ERROR DOWNLOADING ARTICLE:", link)
        continue

    if not article.text:
        # Nothing was extracted from the page, so there is nothing to embed.
        continue

    meta_data = f"Source \n Title: {article.title} \n Url: {link} \n Publish Date: {article.publish_date} \n  Excerpt from source: "

    # create_documents already splits the text into chunk-sized Documents;
    # a second split_documents pass over them is not needed.
    article_chunks = text_splitter.create_documents([article.text])
    for chunk in article_chunks:
        chunk.page_content = meta_data + chunk.page_content

    split_data.extend(article_chunks)


# Embed all article chunks and index them in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=split_data,
    embedding=embedder,
)

# First stage of the chain: maps the incoming question to
# {"context": <retriever output>, "question": <the question, passed through>}.
retrieval_steps = {
    "context": vectorstore.as_retriever(),
    "question": RunnablePassthrough(),
}
setup_and_retrieval = RunnableParallel(retrieval_steps)

# Final stage of the chain: turns the model output into a plain string.
output_parser = StrOutputParser()

ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/home/editors-favorite-sustainable-products?iid=CNNUnderscoredHPcontainer
ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/gifts/best-mothers-day-gifts-2023?iid=CNNUnderscoredHPcontainer
ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/fashion/mens-spring-fashion-style-guide?iid=CNNUnderscoredHPcontainer
ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/travel/amazon-travel-products?iid=CNNUnderscoredHPcontainer
ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/money/high-yield-savings-accounts?iid=CNNUnderscoredHPcontainer
ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/money/how-to-file-taxes?iid=CNNUnderscoredHPcontainer
ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/home/how-to-compost-at-home?iid=CNNUnderscoredHPcontainer
ERROR DOWNLOADING ARTICLE: https://www.cnn.com/cnn-underscored/reviews/mmmat-silicone-mats?iid=CNNUnderscoredHPconta

In [79]:
# Prompt for the news question-answering chain. {context} receives the retrieved
# article excerpts and {question} receives the user's question.
template = """
You are a news expert who answers questions about news.  

You must only use the most recent data from this context:
{context}

Do not rely on your historical records.

Answer as concisely as possible, but make sure that your information
lines up with the sources.  

Your answer should be in the following format:
    Your answer to the question here.

    Sources: [article title, source link, publish date]

Cite all Possible sources and put each on a new line.
If you don't have the relevent information to answer the question or a source,
tell the user so.  Err on the side of caution.

If the question or topic is off topic and not about news at all, tell the user so.

Here is the question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_context(inputs):
    """Replace the retrieved Document list with plain text for the prompt.

    ``setup_and_retrieval`` yields ``{"context": [Document, ...], "question": ...}``.
    Formatting that list straight into the template would insert the Python repr
    of the Document objects (``[Document(page_content='...'), ...]``, with escaped
    newlines) rather than readable excerpts.

    Args:
        inputs: mapping with a "context" list of objects exposing ``page_content``
            and any other keys (e.g. "question"), which are passed through unchanged.

    Returns:
        A new dict equal to ``inputs`` except that "context" is the excerpts'
        ``page_content`` joined by blank lines.
    """
    excerpts = "\n\n".join(doc.page_content for doc in inputs["context"])
    return {**inputs, "context": excerpts}


# The plain function is coerced into a runnable step when piped with `|`.
chain = setup_and_retrieval | format_context | prompt | model | output_parser

In [81]:
# Ask one question twice: first show what the retrieval stage alone returns,
# then show the answer produced by the full chain.
sample_question = "What is happening with taiwan and china?"

retrieval_result = setup_and_retrieval.invoke(sample_question)
print(retrieval_result)

chain_answer = chain.invoke(sample_question)
print(chain_answer)

{'context': [Document(page_content='Source \n Title: Three Months After Biden, It’s Xi’s Turn to Court Vietnam \n Url: https://www.nytimes.com/2023/12/12/world/asia/china-vietnam-xi-jinping.html \n Publish Date: 2023-12-12 00:00:00 \n  Excerpt from source: “This is a very delicate dance for Vietnam’s government,” said Alexander Vuving, a professor at the Asia-Pacific Center for Security Studies in Honolulu. “They have to dance on a very thin tightrope, and the tightrope has become even thinner.”\n\nMr. Xi, analysts say, wants to test Vietnam’s intentions, seeking reassurances that it is not siding against China with the United States after Washington and Hanoi agreed in September to form a “comprehensive strategic partnership.”'), Document(page_content='Source \n Title: Three Months After Biden, It’s Xi’s Turn to Court Vietnam \n Url: https://www.nytimes.com/2023/12/12/world/asia/china-vietnam-xi-jinping.html \n Publish Date: 2023-12-12 00:00:00 \n  Excerpt from source: Few nations now