In [1]:
from numpy import *
import re
import feedparser
import operator

In [2]:
def createVocabList(dataset):
    vocab = set([])
    for document in dataset:
        vocab = vocab | set(document)
    return list(vocab)

In [3]:
def trainNaiveBayes(training_mat, labels):
    num_docs = len(training_mat)
    num_words = len(training_mat[0])
    p_feed = sum(labels)/float(num_docs)
    p0_num = ones(num_words)
    p1_num = ones(num_words)
    p0denom = 2.0
    p1denom = 2.0
    for i in range(num_docs):
        if labels[i] == 1:
            p1_num += training_mat[i]
            p1denom += sum(training_mat[i])
        else:
            p0_num += training_mat[i]
            p0denom += sum(training_mat[i])
    p1vect = log(p1_num/p1denom)
    p0vect = log(p0_num/p0denom)
    return p0vect, p1vect, p_feed
            

In [4]:
def classify(vec_to_classify, p0vec, p1vec, pclass1):
    p1 = sum(vec_to_classify*p1vec) + log(pclass1)
    p0 = sum(vec_to_classify*p0vec) + log(1.0 - pclass1)
    if p1 > p0:
        return 1
    return 0

In [5]:
def bagofWordstoVec(vocab, test_set):
    returnVec = [0]*len(vocab)
    for word in test_set:
        if word in vocab:
            returnVec[vocab.index(word)] += 1
    return returnVec   

In [6]:
def calcMostFreq(vocab, full_text, top_x):
    freq = {}
    for token in vocab:
        freq[token] = full_text.count(token)
    sorted_freq = sorted(freq.items(), key = operator.itemgetter(1), reverse = True)
    return sorted_freq[:top_x]

In [7]:
def textParse(given_string):
    tokens_list = re.split(r'\W+', given_string)
    return [t.lower() for t in tokens_list if len(t) > 2]

In [8]:
def localwords(feed0, feed1, top_x):
    if len(feed0['entries']) == 0 or len(feed1['entries']) == 0:
        print("One or both RSS feeds are empty!")
        return
    docList = []
    classList = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(min_len):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        full_text.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        full_text.extend(wordList)
        classList.append(0)
    vocab_list = createVocabList(docList)
    top30words = calcMostFreq(vocab_list, full_text, top_x)
    for w in top30words:
        if w[0] in vocab_list:
            vocab_list.remove(w[0])

    trainingSet = list(range(2*min_len))
    testSet = []
    for i in range(20):
        random_ind = int(random.uniform(0, len(trainingSet)))
        if random_ind in trainingSet:
            testSet.append(trainingSet[random_ind])
            del(trainingSet[random_ind])


    training_matrix = []
    training_class = []
    for i in trainingSet:
        training_matrix.append(bagofWordstoVec(vocab_list, docList[i]))
        training_class.append(classList[i])

    p0V, p1V, pSpam = trainNaiveBayes(array(training_matrix), array(training_class))
    error = 0
    for i in testSet:
        word_vector = bagofWordstoVec(vocab_list, docList[i])
        if classify(array(word_vector), p0V, p1V, pSpam) != classList[i]:
            error += 1
    print(f"The error rate is {float(error)/len(testSet)}")
    return vocab_list, p0V, p1V
        

In [9]:
def getTopWords(feed0, feed1, top_x):
    vocab_list, p0V, p1V = localwords(feed0,feed1,top_x)
    top_f0 = []
    top_f1 = []
    for i in range(len(p0V)):
        if p0V[i] > -4.5:
            top_f0.append((vocab_list[i], p0V[i]))
        if p1V[i] > -4.5:
            top_f1.append((vocab_list[i], p1V[i]))
    sorted_f1 = sorted(top_f1, key = lambda pair: pair[1], reverse = True)
    print("----------- FEED 1 -----------")
    for item in sorted_f1:
        print(item[0])

    sorted_f0 = sorted(top_f0, key = lambda pair: pair[1], reverse = True)
    print("----------- FEED 0 -----------")
    for item in sorted_f0:
        print(item[0])

In [10]:
nyt = feedparser.parse('https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml')
bbc = feedparser.parse('https://feeds.bbci.co.uk/news/world/rss.xml')

In [11]:
nyt

{'bozo': False,
 'entries': [{'title': 'Citing New Rules, Meta Says It Will End Political Ads in E.U.',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
    'value': 'Citing New Rules, Meta Says It Will End Political Ads in E.U.'},
   'links': [{'rel': 'alternate',
     'type': 'text/html',
     'href': 'https://www.nytimes.com/2025/07/25/business/meta-eu-political-ad-ban.html'},
    {'href': 'https://www.nytimes.com/2025/07/25/business/meta-eu-political-ad-ban.html',
     'rel': 'standout',
     'type': 'text/html'}],
   'link': 'https://www.nytimes.com/2025/07/25/business/meta-eu-political-ad-ban.html',
   'id': 'https://www.nytimes.com/2025/07/25/business/meta-eu-political-ad-ban.html',
   'guidislink': False,
   'summary': 'Meta said political advertising would end in October, citing a forthcoming E.U. regulation that it said presented “unworkable requirements.”',
   'summary_detail': {'type':

In [12]:
bbc

{'bozo': False,
 'entries': [{'title': 'EU and US agree trade deal, with 15% tariffs for European exports to America',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'https://feeds.bbci.co.uk/news/world/rss.xml',
    'value': 'EU and US agree trade deal, with 15% tariffs for European exports to America'},
   'summary': 'US President Donald Trump and EU chief Ursula von der Leyen shake on it after "tough negotiations" in Scotland.',
   'summary_detail': {'type': 'text/html',
    'language': None,
    'base': 'https://feeds.bbci.co.uk/news/world/rss.xml',
    'value': 'US President Donald Trump and EU chief Ursula von der Leyen shake on it after "tough negotiations" in Scotland.'},
   'links': [{'rel': 'alternate',
     'type': 'text/html',
     'href': 'https://www.bbc.com/news/articles/cx2xylk3d07o?at_medium=RSS&at_campaign=rss'}],
   'link': 'https://www.bbc.com/news/articles/cx2xylk3d07o?at_medium=RSS&at_campaign=rss',
   'id': 'https://www.bbc.com/news/a

In [13]:
vocab_list, p0V, p1V =localwords(nyt,bbc,30)

The error rate is 0.4444444444444444


In [14]:
vocab_list, p0V, p1V = localwords(nyt,bbc,50)

The error rate is 0.21428571428571427


In [15]:
vocab_list, p0V, p1V = localwords(nyt,bbc,100)

The error rate is 0.4666666666666667


In [16]:
getTopWords(nyt,bbc,10)

The error rate is 0.4
----------- FEED 1 -----------
say
were
countries
most
prosecutors
smaller
claims
greece
darkly
armed
often
harvard
matcha
citizenship
people
injured
martin
police
wife
worshippers
critics
hand
mathematician
free
died
tariff
his
stopped
coldplay
investment
opponents
tea
heart
songs
prices
officers
spike
demand
heatwaves
settlement
officials
ilham
offer
thailand
little
island
such
surging
pause
during
chris
remnants
urban
kiss
which
threatened
video
negotiations
capital
employee
released
political
motive
connotations
nations
was
azerbaijan
region
wildfires
while
unearthed
archaeological
humorous
aliyev
cam
battling
train
among
200
japan
driver
men
temperatures
crackdown
dead
five
night
stormed
scandal
another
europe
pushing
part
competing
passenger
southern
historical
000
donald
incident
turkey
site
tariffs
with
keeladi
wrote
featuring
taking
vigil
until
behind
policy
hostilities
lehrer
trump
rail
crops
trained
fuelling
one
given
investigating
due
----------- FEED 