# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [3]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [4]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [5]:
#extracting and assigning this feed to "d" 
d = feedparser.parse(url)

In [None]:
d

### 2. Obtain a list of components (keys) that are available for this feed.

In [6]:
type(d)

feedparser.FeedParserDict

In [7]:
#display all the keys of "d", which is similar to a dictionary
list(d.keys())

['feed',
 'entries',
 'bozo',
 'headers',
 'updated',
 'updated_parsed',
 'href',
 'status',
 'encoding',
 'version',
 'namespaces']

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [8]:
# d['feed'] seems as a dict considering the format
type(d['feed'])

feedparser.FeedParserDict

In [None]:
d['feed']

In [9]:
#.... so it's possible to create list of keys
list(d['feed'].keys())

['title',
 'title_detail',
 'links',
 'link',
 'subtitle',
 'subtitle_detail',
 'updated',
 'updated_parsed',
 'language',
 'sy_updateperiod',
 'sy_updatefrequency',
 'generator_detail',
 'generator',
 'feedburner_info',
 'geo_lat',
 'geo_long',
 'feedburner_emailserviceid',
 'feedburner_feedburnerhostname']

### 4. Extract and print the feed title, subtitle, author, and link.

In [10]:
print(d['feed']['title'])
print(d['feed']['subtitle'])
print(d['feed']['link'])
#Author isn't in this 'feed' key

Radar
Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology
https://www.oreilly.com/radar


### 5. Count the number of entries that are contained in this RSS feed.

In [11]:
len(d['entries'])
#d['entries']

18

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [12]:
#this isn't a dictionary, it's a list
type(d['entries'])

list

In [None]:
d['entries']

In [13]:
print(list(d['entries'][0].keys()))
#I've chosen the first one to print a list of component in one element inside an entry

['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink']


### 7. Extract a list of entry titles.

In [14]:
#It's is a list of dictionaries, so I'll use a list comprehension to print the first element in heach dict
list_title = [ x['title'] for x in d['entries'] ]
list_title

['The road to Software 2.0',
 'Four short links: 10 December 2019',
 'Four short links: 9 December 2019',
 'Four short links: 6 December 2019',
 'Radar trends to watch: December 2019',
 'Four short links: 5 December 2019',
 'Four short links: 4 December 2019',
 'Use your people as competitive advantage',
 'Four short links: 3 December 2019',
 'A 5G future',
 'Four short links: 2 December 2019',
 'Four short links: 29 November 2019',
 'Four short links: 28 November 2019',
 'Four short links: 27 November 2019',
 'Moving AI and ML from research into production',
 'Four short links: 26 November 2019',
 'Four short links: 25 November 2019',
 'Four short links: 22 November 2019']

### 8. Calculate the percentage of "Four short links" entry titles.

In [18]:
import re

In [19]:
fsl = [ x for x in list_title if re.search('Four short links',str(x))!=None ]
fsl

['Four short links: 10 December 2019',
 'Four short links: 9 December 2019',
 'Four short links: 6 December 2019',
 'Four short links: 5 December 2019',
 'Four short links: 4 December 2019',
 'Four short links: 3 December 2019',
 'Four short links: 2 December 2019',
 'Four short links: 29 November 2019',
 'Four short links: 28 November 2019',
 'Four short links: 27 November 2019',
 'Four short links: 26 November 2019',
 'Four short links: 25 November 2019',
 'Four short links: 22 November 2019']

In [20]:
fsl_porc = len(fsl) / len(list_title)
fsl_porc

0.7222222222222222

### 9. Create a Pandas data frame from the feed's entries.

In [21]:
import pandas as pd

In [28]:
df = pd.DataFrame(d['entries'])
df.head(5)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,The road to Software 2.0,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/the-road-to-soft...,"Tue, 10 Dec 2019 11:00:00 +0000","(2019, 12, 10, 11, 0, 0, 1, 344, 0)",[{'name': 'Mike Loukides and Ben Lorica'}],Mike Loukides and Ben Lorica,{'name': 'Mike Loukides and Ben Lorica'},"[{'term': 'AI & ML', 'scheme': None, 'label': ...",https://www.oreilly.com/radar/?p=11155,False,"Roughly a year ago, we wrote “What machine lea...","{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/the-road-to-soft...,0,https://www.oreilly.com/radar/the-road-to-soft...
1,Four short links: 10 December 2019,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Tue, 10 Dec 2019 05:01:00 +0000","(2019, 12, 10, 5, 1, 0, 1, 344, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=11192,False,The Hidden Worries of Facial Recognition Techn...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
2,Four short links: 9 December 2019,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Mon, 09 Dec 2019 05:01:00 +0000","(2019, 12, 9, 5, 1, 0, 0, 343, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=11151,False,Learning from Incidents &#8212; super useful a...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
3,Four short links: 6 December 2019,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 06 Dec 2019 05:01:00 +0000","(2019, 12, 6, 5, 1, 0, 4, 340, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=11147,False,Declarative Assembly of Web Applications From ...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
4,Radar trends to watch: December 2019,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/radar-trends-to-...,"Thu, 05 Dec 2019 12:00:00 +0000","(2019, 12, 5, 12, 0, 0, 3, 339, 0)",[{'name': 'Mike Loukides'}],Mike Loukides,{'name': 'Mike Loukides'},"[{'term': 'Radar Trends', 'scheme': None, 'lab...",https://www.oreilly.com/radar/?p=11118,False,Privacy and security trends DNS over HTTPS is ...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/radar-trends-to-...,0,https://www.oreilly.com/radar/radar-trends-to-...


### 10. Count the number of entries per author and sort them in descending order.

In [40]:
#df.groupby('author').count().title
df.groupby('author').count().sort_values(by=['title'],ascending=False).title

author
Nat Torkington                  13
Mike Loukides                    2
Jenn Webb                        1
Mike Loukides and Ben Lorica     1
Pamela Rucker                    1
Name: title, dtype: int64

### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

### 12. Create a list of entry titles whose summary includes the phrase "machine learning."