# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [17]:
import feedparser
#pip install --user feedparser in cmd.exe

### 1. Use feedparser to parse the following RSS feed URL.

In [18]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [19]:
oreilly = feedparser.parse(url)

##### 2. Obtain a list of components (keys) that are available for this feed.

In [20]:
oreilly.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [21]:
oreilly.feed

{'title': "All - O'Reilly Media",
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
  'value': "All - O'Reilly Media"},
 'id': 'https://www.oreilly.com',
 'guidislink': True,
 'link': 'https://www.oreilly.com',
 'updated': '2019-07-22T14:34:24Z',
 'updated_parsed': time.struct_time(tm_year=2019, tm_mon=7, tm_mday=22, tm_hour=14, tm_min=34, tm_sec=24, tm_wday=0, tm_yday=203, tm_isdst=0),
 'subtitle': 'All of our Ideas and Learning material from all of our topics.',
 'subtitle_detail': {'type': 'text/plain',
  'language': None,
  'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
  'value': 'All of our Ideas and Learning material from all of our topics.'},
 'links': [{'href': 'https://www.oreilly.com',
   'rel': 'alternate',
   'type': 'text/html'},
  {'rel': 'self',
   'type': 'application/atom+xml',
   'href': 'http://feeds.feedburner.com/oreilly/radar/atom'},
  {'rel': 'hub',
   'href': 'http://pubsubhubbub.a

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [22]:
oreilly.feed.keys()

dict_keys(['title', 'title_detail', 'id', 'guidislink', 'link', 'updated', 'updated_parsed', 'subtitle', 'subtitle_detail', 'links', 'authors', 'author_detail', 'author', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [23]:
oreilly.feed.title

"All - O'Reilly Media"

In [24]:
oreilly.feed.subtitle

'All of our Ideas and Learning material from all of our topics.'

In [25]:
oreilly.feed.author

"O'Reilly Media"

In [26]:
oreilly.feed.link

'https://www.oreilly.com'

### 5. Count the number of entries that are contained in this RSS feed.

In [27]:
print(len(oreilly['entries']))

60


### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [28]:
oreilly.entries[0].keys()

dict_keys(['title', 'title_detail', 'updated', 'updated_parsed', 'id', 'guidislink', 'link', 'content', 'summary', 'links', 'authors', 'author_detail', 'author', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [29]:
titles = [oreilly.entries[i].title for i in range(len(oreilly.entries))]
print('Here is list of entry titles:', titles)

Here is list of entry titles: ['Four short links: 22 July 2019', 'Four short links: 19 July 2019', 'The war for the soul of open source', "O'Reilly Open Source and Frank Willison Awards", 'O’Reilly Radar: Open source technology trends—What our users tell us', 'Ask not what Brands™ can do for you', 'Managing machines', 'Acquiring and sharing high-quality data', 'Four short links: 18 July 2019', 'The role of open source in mitigating natural disasters', "Highlights from the O'Reilly Open Source Software Conference in Portland 2019", 'Better living through software', 'Why Amazon cares about open source', 'Built to last: Building and growing open source communities', 'The next age of open innovation', 'Four short links: 17 July 2019', 'Four short links: 16 July 2019', 'Managing machine learning in the enterprise: Lessons from banking and health care', 'Four short links: 15 July 2019', 'Four short links: 12 July 2019', 'Four short links: 11 July 2019', 'Four short links: 10 July 2019', 'Fou

In [30]:
print(oreilly.version)

atom10


In [15]:
#for post in oreilly.entries:
#    print(post.title + ": " + post.link + "")

### 8. Calculate the percentage of "Four short links" entry titles.

In [66]:
import re

title_list=[title for title in titles if title.startswith('Four short links')]
part=len(title_list)
whole=len(titles)
percentage=part/whole*100
print(percentage)

43.333333333333336


### 9. Create a Pandas data frame from the feed's entries.

In [33]:
import pandas as pd

In [79]:
df = pd.DataFrame(oreilly.entries)
df.head(2)

Unnamed: 0,author,author_detail,authors,content,feedburner_origlink,guidislink,id,link,links,summary,title,title_detail,updated,updated_parsed
0,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-07-22:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Game Source, Procurement Graph, Data Mo...",Four short links: 22 July 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-07-22T14:30:00Z,"(2019, 7, 22, 14, 30, 0, 0, 203, 0)"
1,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-07-19:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Journal Mining, API Use, Better Convers...",Four short links: 19 July 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-07-19T17:05:00Z,"(2019, 7, 19, 17, 5, 0, 4, 200, 0)"


### 10. Count the number of entries per author and sort them in descending order.

In [78]:
authors = df.groupby('author', as_index=False).agg({'title':'count'})
authors.columns = ['author', 'entries']
authors.sort_values('entries', ascending=False).head(2)

Unnamed: 0,author,entries
18,Nat Torkington,26
5,Ben Lorica,5


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [69]:
df['title_length'] = df['title'].apply(len)
df[['title', 'author', 'title_length']].sort_values('title_length', ascending=False).head(3)


Unnamed: 0,title,author,title_length
41,RISELab’s AutoPandas hints at automation tech ...,Ben Lorica,97
17,Managing machine learning in the enterprise: L...,"Ben Lorica, Harish Doddi, David Talby",81
24,Highlights from the O'Reilly Artificial Intell...,Jenn Webb,79


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [90]:
df_machine= df[['title', 'summary']]
df_machine.head(2)

Unnamed: 0,title,summary
0,Four short links: 22 July 2019,"<p><em>Game Source, Procurement Graph, Data Mo..."
1,Four short links: 19 July 2019,"<p><em>Journal Mining, API Use, Better Convers..."


In [124]:
dataframe=df[df['summary'].str.contains('machine learning')]

In [110]:
[title for title in dataframe.title]

['Acquiring and sharing high-quality data',
 "Highlights from the O'Reilly Open Source Software Conference in Portland 2019",
 'Managing machine learning in the enterprise: Lessons from banking and health care',
 "Highlights from the O'Reilly Artificial Intelligence Conference in Beijing 2019",
 'The future of machine learning is tiny',
 'Tools for machine learning development',
 'New live online training courses',
 'RISELab’s AutoPandas hints at automation tech that will change the nature of software development',
 'AI and machine learning will require retraining your entire organization',
 'Enabling end-to-end machine learning pipelines in real-world applications',
 'What are model governance and model operations?',
 'The quest for high-quality data']

In [128]:
import re

machine_list=[summary for summary in df_machine if summary.find('machine learning')]
machine_list

['title', 'summary']