# Test for AI news assistant

## Import libraries

In [14]:
import getpass
import os
from typing import Optional

try:
    import feedparser
except:
    !pip install feedparser > /dev/null
    import feedparser

try:
    from langchain_openai import ChatOpenAI
except:
    !pip install langchain-openai > /dev/null
    from langchain_openai import ChatOpenAI


from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser


try:
    from qdrant_client import models, QdrantClient
except:
    !pip install -U "qdrant-client[fastembed]>=1.8.2" > /dev/null
    from qdrant_client import models, QdrantClient

import pandas as pd


## Define functions

In [2]:
def fetch_rss_feed(url):
    """Function that parses an RSS feed given the
    """
    # Parse the RSS feed
    feed = feedparser.parse(url)

    records = []
    # add each entry to list
    for entry in feed.entries:
        new_entry = {}
        new_entry["title"] = entry.title
        new_entry["link"] = entry.link
        # Not all instance have a description
        try:
            new_entry["text"] = f"{entry.title}. {entry.description}"
        except:
            new_entry["text"] = entry.title
        records.append(new_entry)
    return records

## Setup LLM with structured output

In [4]:
llm = ChatOpenAI(
    model="tiiuae/falcon-180B-chat",
    api_key=getpass.getpass(),
    base_url="https://api.ai71.ai/v1/",)

··········


In [5]:
interests = "I want to have summaries of all news involving NASA, space exploration, or Donald Trump"

priorities = "I want to be immediately notified of any news regarding the olympic games"

In [6]:
class FitInterest(BaseModel):
    """Whether the news fit with user's interests."""
    fit_interests: bool = Field(description="Do the news fit the user's interest?")


class FitPriority(BaseModel):
    """Whether the news fit with user's priorities."""
    fit_priorities: bool = Field(description="Do the news fit the user's priorities?")

In [7]:
# Set up a parser

interest_parser = PydanticOutputParser(pydantic_object=FitInterest)

priority_parser = PydanticOutputParser(pydantic_object=FitPriority)


In [8]:
prompt_interests = PromptTemplate.from_template(
    """You are an expert news analyst which needs to select news  \
       according to the user's interests. The user expressed the \
       following interests:

       {interests}

       Please establish whether the following news match the user's interests:

       {news}

       Wrap the output in `json` tags\n{format_instructions}""")

partial_interests = prompt_interests.partial(interests=interests,format_instructions=interest_parser.get_format_instructions())

prompt_priorities = PromptTemplate.from_template(
    """You are an expert news analyst which needs to select news  \
       according to the user's priorities. The user expressed the \
       following priorities:

       {priorities}

       Please establish whether the following news match the user's priorities:

       {news}

       Wrap the output in `json` tags\n{format_instructions}""")

partial_priorities = prompt_priorities.partial(priorities=priorities,format_instructions=priority_parser.get_format_instructions())


In [9]:
interests_chain = partial_interests | llm | interest_parser
priorities_chain = partial_priorities | llm | priority_parser

## Read RSS feeds

In [10]:
bbc_europe = "https://feeds.bbci.co.uk/news/world/europe/rss.xml"

cnn = "http://rss.cnn.com/rss/edition.rss"

In [11]:
bbc_feeds = fetch_rss_feed(bbc_europe)

In [12]:
cnn_feeds = fetch_rss_feed(cnn)

## Analyze RSS feed

Test on one feed

In [116]:
test = priorities_chain.invoke(input={"news":bbc_feeds[1].get("text")})

In [117]:
test.fit_priorities

True

Loop over all feeds

In [126]:
analyzed_feeds = []

for feed in bbc_feeds + cnn_feeds:
    new_feed = {
        **feed,
        "interest": None,
        "interest_parsing_err": False,
        "priority": None,
        "priority_parsing_err": False,
        "summarized": False # prepopulate this field for insertion in DB
    }
    # Output parsing is not very solid...
    # Need try-except logic
    try:
        new_interest = interests_chain.invoke(input={"news":feed.get("text")})
    except:
        new_interest = FitInterest(fit_interests = False)
        new_feed["interest_parsing_err"] = True
    try:
        new_priority = priorities_chain.invoke(input={"news":feed.get("text")})
    except:
        new_priority = FitPriority(fit_priorities = False)
        new_feed["priority_parsing_err"] = True

    new_feed["interest"] = new_interest.fit_interests
    new_feed["priority"] = new_priority.fit_priorities

    analyzed_feeds.append(new_feed)


Export output to excel for checking

In [127]:
data_df = pd.DataFrame(analyzed_feeds)

data_df.to_excel("feeds_test.xlsx", index=False)

In [128]:
data_df["priority_parsing_err"].value_counts()

Unnamed: 0_level_0,count
priority_parsing_err,Unnamed: 1_level_1
False,93
True,6


In [129]:
data_df["interest_parsing_err"].value_counts()

Unnamed: 0_level_0,count
interest_parsing_err,Unnamed: 1_level_1
False,92
True,7


In [24]:
data_df["interest"].value_counts()

Unnamed: 0_level_0,count
interest,Unnamed: 1_level_1
True,80
False,19


In [25]:
data_df["priority"].value_counts()


Unnamed: 0_level_0,count
priority,Unnamed: 1_level_1
True,82
False,17


Around 6-7% error rate for classification. Way too many news classified as interesting.

From now on we could perform the actions for each record (saving in DB, sending email) according to the values of "interest" and "priority" in the dictionary.

Need to decide how to handle fields with errors. Since the output parser seems quite weak, we could consider using an agent-based workflow with CrewAI as improvement and implement tools to save to DB.

Import data from excel, avoid rerunning the LLM

In [15]:
data_df = pd.read_excel("feeds_test.xlsx")

analyzed_feeds = data_df.to_dict(orient="records")

## Save into Qdrant

In [16]:
client = QdrantClient(":memory:") # To be changed with Qdrant cloud service
client.set_model("sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

In [26]:
# In production better to test if collection exists, and make sure ids are not repeated
# This code needs to be rewritten
client.recreate_collection(
    collection_name="news",
    vectors_config=client.get_fastembed_vector_params(),
    # comment this line to use dense vectors only
    # sparse_vectors_config=client.get_fastembed_sparse_vector_params(),
)

  client.recreate_collection(


True

In [27]:
# Loop to build list of documents to be uploaded in Qdrant
metadata = []
documents = []

for feed in analyzed_feeds:
    # Only append interesting news
    if feed.get("interest"):
        metadata.append(feed)
        documents.append(feed.get("text"))

In [28]:
client.add(
    collection_name="news",
    documents=documents,
    metadata=metadata,
    parallel=0,  # Use all available CPU cores to encode data.
    # Requires wrapping code into if __name__ == '__main__' block
)

['baf2baeb6ee742a69774c2eabad3db22',
 '3305d9f610cd4852a672181d0a13c6a4',
 'd1f9c5b09d6b47aaaa67338b58217e63',
 'e934d0718a03431b8c88cea3818f65a2',
 '0cef909a312f4061b0989201b60b1366',
 '3afbeb1f39b34bea9b0dea5e9d3c9db2',
 'e44b157548c245f3a339d9b76928cb20',
 '3b4623398b634ba5883fffb85c343106',
 '4de9f6c2a51b4264819e2f2ad7e0259a',
 'd7fef33b065d4b02a9c33871270c1519',
 'd996e307f8874461a7456ed22245f11f',
 '3c1f6b39879443d9bd3d48992fecfc70',
 'be0dd65bcd124967a19c3e88cbcde240',
 '3536443294594f1583a33436e1de3efc',
 'e730545ef1f149b09e3e87362fb07c8b',
 'e45849b9bf014be2ae87ec909c7575fc',
 '5f6a73da46214d3d9b89887c3a34dfe3',
 '6634a847a08b41b4a9f2ceb55d85b36f',
 '7e0fdfa2fb3a42079b60a77e07d02006',
 'a6c4e3848f62465782a7469e6728545a',
 '55100524e7524f8383f69ff98779a846',
 '55ef648557874537862d20402b17f260',
 'cb1807130e094bf68f2f2f3e565b66cc',
 'e7ca01edc3ca4300bcb1e3fb04954a5b',
 'cf91935f298a4538b57695a91772c45e',
 '97a2d386e7964521b58b7135023e84bf',
 'e54863eb7146446386663a068fd6e672',
 