Trying LCEL

In [1]:
pip install langchain bs4 sentence_transformers feedparser newspaper3k --quiet

Note: you may need to restart the kernel to use updated packages.


In [3]:
from langchain.llms import Ollama
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.prompts import ChatPromptTemplate

import feedparser
from requests.exceptions import Timeout
import json
from datetime import datetime, timedelta

from newspaper import Article


In [4]:
# RSS feeds to harvest, one [label, url] pair per feed.
feeds = [["NYT", "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"]]

In [5]:
# Collect article URLs from every configured RSS feed into `links`.
count = 0  # not read in this cell; kept in case another cell relies on it -- TODO confirm
links = []

# Cut-off for an optional recency filter. The filter is currently disabled,
# so entries are kept regardless of age; the value is only computed once here
# instead of once per feed.
one_day_ago = datetime.now() - timedelta(days=1)

for sub_feed in feeds:
    # Each feed is a [label, url] pair; only the URL is needed for parsing.
    feed = feedparser.parse(sub_feed[1])

    recent_items = []
    for entry in feed.entries:
        try:
            # Building the datetime checks that the entry carries a usable
            # publish timestamp (missing -> AttributeError, None -> TypeError,
            # out-of-range fields -> ValueError).
            datetime(*entry.published_parsed[:6])
        except (AttributeError, TypeError, ValueError):
            # No usable timestamp: keep the entry only when it has a title.
            # `.get` avoids a second exception when the title is absent too.
            if not entry.get("title", ""):
                continue
        recent_items.append(entry)

    for item in recent_items:
        try:
            links.append(item.links[0].href)
        except (AttributeError, IndexError, KeyError) as e:
            # Entry without a usable link; report it and move on. No network
            # request happens here, so a Timeout cannot occur at this point.
            print("ARTICLE COLLECTION ERROR:", e)

In [31]:
# Language model used to generate answers: 'mistral' through the Ollama wrapper.
model = Ollama(model='mistral')

# Embedding model settings: run on CPU and normalize the embedding vectors.
embedder_name = "BAAI/bge-base-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

# Embedder that is later handed to the vector store.
embedder = HuggingFaceBgeEmbeddings(
    model_name=embedder_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


# url = "https://en.wikipedia.org/wiki/Cyclone_Michaung"

# Download every collected article, cut its text into overlapping chunks and
# prefix each chunk with a short source header so the model can cite it.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

split_data = []
for link in links:
    article = Article(link)
    try:
        article.download()
        article.parse()
    except Exception as e:
        # Best-effort scraping: one unreachable or unparsable article must not
        # abort the whole run, so report it and continue with the next link.
        print("ARTICLE DOWNLOAD ERROR:", link, e)
        continue

    # Skip articles with no extracted body text (None or empty); otherwise the
    # store would receive chunks that contain nothing but the source header.
    if not article.text:
        continue

    meta_data = f"Source \n Title: {article.title} \n Url: {link} \n Publish Date: {article.publish_date} \n  Excerpt from source: "

    # create_documents splits the text into chunk-sized documents itself, so a
    # further split_documents pass over its output is not needed.
    article_chunks = text_splitter.create_documents([article.text])

    for chunk in article_chunks:
        chunk.page_content = meta_data + chunk.page_content

    split_data += article_chunks


# Index all article chunks in a Chroma vector store using the embedder.
vectorstore = Chroma.from_documents(documents=split_data, embedding=embedder)

# Parser placed at the end of the chain.
output_parser = StrOutputParser()

# First chain step: fetch matching chunks as "context" and forward the raw
# input unchanged as "question".
setup_and_retrieval = RunnableParallel(
    context=vectorstore.as_retriever(),
    question=RunnablePassthrough(),
)

In [32]:
# Prompt text for the news question-answering chain.
# It contains two placeholders, {context} and {question}; these match the
# "context" and "question" keys produced by `setup_and_retrieval`.
# Note: this is a regular (non-raw) string, so the `\n` in the "Sources" format
# line is an escape sequence and becomes a real line break in the prompt.
template = """
You are a news expert who answers questions about news.  

You must only use the most recent data from this context:
{context}

Do not rely on your historical records.

Answer as concisely as possible, but make sure that your information
lines up with the sources.  

You must name any sources at the end of your answer in the following format: 
"Sources: \n[article title, source link, publish date]"

If you don't have the relevent information to answer the question or a source,
tell the user so.  Err on the side of caution.

If the question or topic is not about news, tell the user "Sorry, I can only answer 
questions about news".

Here is the question: {question}
"""

# Turn the template text into a chat prompt.
prompt = ChatPromptTemplate.from_template(template)

# Full pipeline: retrieve context -> fill the prompt -> run the model -> parse output.
chain = (
    setup_and_retrieval
    | prompt
    | model
    | output_parser
)

In [33]:
# Demo query: first show what the retrieval step returns on its own, then the
# answer produced by the complete chain for the same question.
question = "What is the latest new on isreal and gaza?"

retrieval_result = setup_and_retrieval.invoke(question)
print(retrieval_result)

print("----------------------------------")

answer = chain.invoke(question)
print(answer)

{'context': [Document(page_content='Title: How the Israel-Hamas War Tore Apart Public Defenders in the Bronx \n Url: https://www.nytimes.com/2023/12/14/nyregion/bronx-defenders-israel-gaza.html \n Publish Date: 2023-12-14 00:00:00'), Document(page_content=' \n Title: To Fight Hamas, Israel’s Leaders Stopped Fighting One Another. For Now. \n Url: https://www.nytimes.com/2023/12/14/world/middleeast/israel-war-cabinet-gaza-netanyahu.html \n Publish Date: 2023-12-14 00:00:00 \n  Hints of those personal tensions surface from time to time, particularly when one member holds a solo news conference, or when they shift uncomfortably during joint briefings, practically grimacing when reporters ask how they are getting along.'), Document(page_content=' \n Title: To Fight Hamas, Israel’s Leaders Stopped Fighting One Another. For Now. \n Url: https://www.nytimes.com/2023/12/14/world/middleeast/israel-war-cabinet-gaza-netanyahu.html \n Publish Date: 2023-12-14 00:00:00 \n  It has also bought Prime M