# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [177]:
import feedparser
import re

### 1. Use feedparser to parse the following RSS feed URL.

In [178]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [179]:
content = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [180]:
content.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [181]:
content['feed'].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [182]:
data = []

data.append(content['feed']['title']) 
data.append(content['feed']['subtitle'])
# data.append(content['feed']) # author is missing from feed
data.append(content['feed']['link'])
data

['Radar',
 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology',
 'https://www.oreilly.com/radar']

### 5. Count the number of entries that are contained in this RSS feed.

In [183]:
len(content['entries'])

60

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [184]:
content['entries'][0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [185]:
titles = [entry['title'] for entry in content['entries']]
titles

['Four short links: 28 Oct 2020',
 'Our Favorite Questions',
 'Four short links: 21 Oct 2020',
 'Four Short Links: 16 October 2020',
 'Four short links: 14 Oct 2020',
 'AI Product Management After Deployment',
 'Four short links: 9 October 2020',
 'AI and Creativity',
 'Four short links: 6 October 2020',
 'Four short links: 2 October 2020',
 'Radar trends to watch: October 2020',
 'Four short links: 29 Sep 2020',
 'Four short links: 25 September 2020',
 'Four short links: 18 Sep 2020',
 'Four short links: 16 Sep 2020',
 'How to Set AI Goals',
 'Four short links: 11 Sep 2020',
 'Four short links: 9 Sep 2020',
 'Pair Programming with AI',
 'Four short links: 4 September 2020',
 'Four short links: 2 September 2020',
 'Radar trends to watch: September 2020',
 'Four short links: 28 August 2020',
 'An Agent of Change',
 'Four short links: 25 August 2020',
 'Four short links: 21 August 2020',
 'Four Short Links: 19 August 2020',
 'Why Best-of-Breed is a Better Choice than All-in-One Platforms

### 8. Calculate the percentage of "Four short links" entry titles.

In [186]:
x = 0
for title in titles:
    if 'Four short links' in title:
        x += 1

p = 100*x/len(titles)
print(f'{round(p,2)} %')

68.33 %


### 9. Create a Pandas data frame from the feed's entries.

In [187]:
import pandas as pd
from pandas.io.json import json_normalize

In [188]:
df = pd.json_normalize(content['entries'])
df.head()

Unnamed: 0,title,links,link,comments,published,published_parsed,authors,author,tags,id,...,feedburner_origlink,title_detail.type,title_detail.language,title_detail.base,title_detail.value,author_detail.name,summary_detail.type,summary_detail.language,summary_detail.base,summary_detail.value
0,Four short links: 28 Oct 2020,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 28 Oct 2020 11:39:13 +0000","(2020, 10, 28, 11, 39, 13, 2, 302, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13382,...,https://www.oreilly.com/radar/four-short-links...,text/plain,,http://feeds.feedburner.com/oreilly/radar/atom,Four short links: 28 Oct 2020,Nat Torkington,text/html,,http://feeds.feedburner.com/oreilly/radar/atom,"Phantom of the ADAS &#8212; In this paper, we ..."
1,Our Favorite Questions,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/our-favorite-que...,"Thu, 22 Oct 2020 14:33:17 +0000","(2020, 10, 22, 14, 33, 17, 3, 296, 0)","[{'name': 'Q Ethan McCallum, Chris Butler and ...","Q Ethan McCallum, Chris Butler and Shane Glynn","[{'term': 'AI & ML', 'scheme': None, 'label': ...",https://www.oreilly.com/radar/?p=13374,...,https://www.oreilly.com/radar/our-favorite-que...,text/plain,,http://feeds.feedburner.com/oreilly/radar/atom,Our Favorite Questions,"Q Ethan McCallum, Chris Butler and Shane Glynn",text/html,,http://feeds.feedburner.com/oreilly/radar/atom,"&#8220;On peut interroger n&#8217;importe qui,..."
2,Four short links: 21 Oct 2020,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 21 Oct 2020 11:34:42 +0000","(2020, 10, 21, 11, 34, 42, 2, 295, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13376,...,https://www.oreilly.com/radar/four-short-links...,text/plain,,http://feeds.feedburner.com/oreilly/radar/atom,Four short links: 21 Oct 2020,Nat Torkington,text/html,,http://feeds.feedburner.com/oreilly/radar/atom,Justice Department Antitrust Filing Against Go...
3,Four Short Links: 16 October 2020,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 16 Oct 2020 11:21:43 +0000","(2020, 10, 16, 11, 21, 43, 4, 290, 0)",[{}],,,https://www.oreilly.com/radar/?p=13371,...,https://www.oreilly.com/radar/four-short-links...,text/plain,,http://feeds.feedburner.com/oreilly/radar/atom,Four Short Links: 16 October 2020,,text/html,,http://feeds.feedburner.com/oreilly/radar/atom,Automerge &#8212; (Github) Data structure libr...
4,Four short links: 14 Oct 2020,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 14 Oct 2020 11:46:08 +0000","(2020, 10, 14, 11, 46, 8, 2, 288, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13367,...,https://www.oreilly.com/radar/four-short-links...,text/plain,,http://feeds.feedburner.com/oreilly/radar/atom,Four short links: 14 Oct 2020,Nat Torkington,text/html,,http://feeds.feedburner.com/oreilly/radar/atom,Data Organization in Spreadsheets &#8212; Focu...


### 10. Count the number of entries per author and sort them in descending order.

In [189]:
df['author'].value_counts() # Not valid

Nat Torkington                                      42
Mike Loukides                                        9
Justin Norman, Peter Skomoroch and Mike Loukides     1
Justin Norman and Mike Loukides                      1
Q Ethan McCallum and Mike Loukides                   1
Sarah Gold                                           1
Matthew Rocklin and Hugo Bowne-Anderson              1
Q Ethan McCallum, Chris Butler and Shane Glynn       1
Mike Loukides and Steve Swoyer                       1
Alex Castrounis                                      1
                                                     1
Name: author, dtype: int64

In [190]:
authors = df['author_detail.name'].astype('str').tolist()

In [191]:
a = []
row = []
for entry in authors:
    row = []
    for i in entry.split(', '):
        row.extend(i.split(' and '))
    a.extend(row)
# a_set = set([i for i in a if i != 'nan'])
a_set = set(a)

In [192]:
c = []
for author in a_set:
    c.append((author,a.count(author)))
c.sort(key = lambda x: x[1], reverse = True)

c = pd.DataFrame(c)
c.columns = ['Author','Appearances']
c.set_index('Author')

Unnamed: 0_level_0,Appearances
Author,Unnamed: 1_level_1
Nat Torkington,42
Mike Loukides,13
Justin Norman,2
Q Ethan McCallum,2
Chris Butler,1
Matthew Rocklin,1
,1
Sarah Gold,1
Alex Castrounis,1
Shane Glynn,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [193]:
df['title'] = df['title'].astype('str')

In [194]:
df['title_length'] = df['title'].apply(lambda x: len(x))

In [195]:
df2 = df[['title','author','title_length']]

In [196]:
df3 = df2.sort_values(by='title_length', ascending = False)
df3.head()

Unnamed: 0,title,author,title_length
27,Why Best-of-Breed is a Better Choice than All-...,Matthew Rocklin and Hugo Bowne-Anderson,79
54,Automated Coding and the Future of Programming,Mike Loukides,46
5,AI Product Management After Deployment,Justin Norman and Mike Loukides,38
21,Radar trends to watch: September 2020,Mike Loukides,37
29,The Least Liked Programming Languages,Mike Loukides,37


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [197]:
df['summary_detail.value'] = df['summary_detail.value'].astype('str')

In [198]:
# cleaning strange strings like &#8561;
df['summary_detail.value'] = df['summary_detail.value'].apply(lambda x: re.sub('.{2}\d*;','',x))

In [199]:
df4 = df.loc[df['summary_detail.value'].apply(lambda x: 'machine learning' in x.lower())]

In [200]:
lista = df4['title'].tolist()
lista

['Four short links: 21 August 2020', 'Four short links: 8 July 2020']