# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [20]:
import feedparser
# If feedparser is not installed, run `pip install --user feedparser` in a terminal (e.g. cmd.exe).

### 1. Use feedparser to parse the following RSS feed URL.

In [21]:
# Address of the O'Reilly feed that the rest of the notebook parses.
url = "http://feeds.feedburner.com/oreilly/radar/atom"

In [22]:
# Download and parse the feed at `url`; every later cell reads from `oreilly`.
# NOTE(review): the result exposes a 'bozo' key, presumably flagging a
# malformed or unreachable feed -- confirm against the feedparser docs.
oreilly = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [24]:
# Top-level keys of the parsed result.
oreilly.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [25]:
# Feed-level metadata (title, subtitle, links, update time, ...).
oreilly.feed

{'title': "All - O'Reilly Media",
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
  'value': "All - O'Reilly Media"},
 'id': 'https://www.oreilly.com',
 'guidislink': True,
 'link': 'https://www.oreilly.com',
 'updated': '2019-08-15T06:30:05Z',
 'updated_parsed': time.struct_time(tm_year=2019, tm_mon=8, tm_mday=15, tm_hour=6, tm_min=30, tm_sec=5, tm_wday=3, tm_yday=227, tm_isdst=0),
 'subtitle': 'All of our Ideas and Learning material from all of our topics.',
 'subtitle_detail': {'type': 'text/plain',
  'language': None,
  'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
  'value': 'All of our Ideas and Learning material from all of our topics.'},
 'links': [{'href': 'https://www.oreilly.com',
   'rel': 'alternate',
   'type': 'text/html'},
  {'rel': 'self',
   'type': 'application/atom+xml',
   'href': 'http://feeds.feedburner.com/oreilly/radar/atom'},
  {'rel': 'hub',
   'href': 'http://pubsubhubbub.app

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [26]:
# Keys available on the feed-level metadata.
oreilly.feed.keys()

dict_keys(['title', 'title_detail', 'id', 'guidislink', 'link', 'updated', 'updated_parsed', 'subtitle', 'subtitle_detail', 'links', 'authors', 'author_detail', 'author', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [28]:
# Feed title.
oreilly.feed.title

"All - O'Reilly Media"

In [29]:
# Feed subtitle.
oreilly.feed.subtitle

'All of our Ideas and Learning material from all of our topics.'

In [30]:
# Feed author.
oreilly.feed.author

"O'Reilly Media"

In [31]:
# Link to the site the feed belongs to.
oreilly.feed.link

'https://www.oreilly.com'

### 5. Count the number of entries that are contained in this RSS feed.

In [32]:
# Number of entries in the parsed feed.
entry_count = len(oreilly.entries)
print(entry_count)

60


### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [33]:
# Keys of the first entry, taken as a sample of what an entry provides.
oreilly.entries[0].keys()

dict_keys(['title', 'title_detail', 'updated', 'updated_parsed', 'id', 'guidislink', 'link', 'content', 'summary', 'links', 'authors', 'author_detail', 'author', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [34]:
# Collect the title of every entry, in feed order, by iterating over the
# entries directly rather than indexing them by position.
titles = [entry.title for entry in oreilly.entries]
print('Here is list of entry titles:', titles)

Here is list of entry titles: ['Four short links: 14 August 2019', 'Four short links: 13 August 2019', 'Four short links: 12 August 2019', 'Blockchain solutions in enterprise', 'Four short links: 9 August 2019', 'Got speech? These guidelines will help you get started building voice applications', 'Four short links: 8 August 2019', 'New live online training courses', 'Four short links: 7 August 2019', 'Four short links: 6 August 2019', 'Four short links: 5 August 2019', 'Four short links: 2 August 2019', 'Make data science more useful', 'Taming chaos: Preparing for your next incident', 'Four short links: 1 August 2019', 'Learning from adversaries', 'Four short links: 31 July 2019', 'Four short links: 30 July 2019', 'Four short links: 29 July 2019', 'Four short links: 26 July 2019', 'Four short links: 25 July 2019', 'One simple graphic: Researchers love PyTorch and TensorFlow', 'Four short links: 24 July 2019', 'Four short links: 23 July 2019', 'Four short links: 22 July 2019', 'Four sho

In [35]:
# Feed format/version string reported for the parsed feed.
print(oreilly.version)

atom10


In [36]:
# Optional (left disabled): print every entry as "title: link".
#for post in oreilly.entries:
#    print(post.title + ": " + post.link + "")

### 8. Calculate the percentage of "Four short links" entry titles.

In [37]:
import re  # NOTE(review): unused in this cell; kept in case other cells rely on it.

# Share (in percent) of entry titles that start with "Four short links".
title_list = [title for title in titles if title.startswith('Four short links')]
part = len(title_list)
whole = len(titles)
# An empty feed (e.g. after a failed download) would otherwise raise
# ZeroDivisionError; report 0.0 in that case.
percentage = part / whole * 100 if whole else 0.0
print(percentage)

48.333333333333336


### 9. Create a Pandas data frame from the feed's entries.

In [38]:
import pandas as pd

In [39]:
# Build a data frame with one row per feed entry; entry keys become columns.
df = pd.DataFrame(oreilly.entries)
# Preview the first two rows.
df.head(2)

Unnamed: 0,author,author_detail,authors,content,feedburner_origlink,guidislink,id,link,links,summary,title,title_detail,updated,updated_parsed
0,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-14:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Hardware Deplatforming, Hiring Groupthi...",Four short links: 14 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-14T11:00:00Z,"(2019, 8, 14, 11, 0, 0, 2, 226, 0)"
1,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-13:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Recognizing Fact, YouTube &amp; Brazil,...",Four short links: 13 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-13T10:55:00Z,"(2019, 8, 13, 10, 55, 0, 1, 225, 0)"


### 10. Count the number of entries per author and sort them in descending order.

In [40]:
# Count entries per author (non-null titles), then rank authors by that count.
grouped_by_author = df.groupby(by='author', as_index=False)
authors = grouped_by_author.agg({'title': 'count'})
authors.columns = ['author', 'entries']

# Show the two most prolific authors.
ranked_authors = authors.sort_values(by='entries', ascending=False)
ranked_authors.head(2)

Unnamed: 0,author,entries
19,Nat Torkington,29
6,Ben Lorica,3


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [41]:
# Character count of each title. The vectorised .str.len() is used instead of
# .apply(len) so that a missing title yields NaN rather than raising TypeError.
df['title_length'] = df['title'].str.len()

# Longest titles first; show the top three with their authors.
df[['title', 'author', 'title_length']].sort_values('title_length', ascending=False).head(3)


Unnamed: 0,title,author,title_length
5,Got speech? These guidelines will help you get...,"Ben Lorica, Yishay Carmiel",82
42,Managing machine learning in the enterprise: L...,"Ben Lorica, Harish Doddi, David Talby",81
49,Highlights from the O'Reilly Artificial Intell...,Jenn Webb,79


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [42]:
# Keep only the two columns needed to inspect summaries next to their titles.
title_summary_columns = ['title', 'summary']
df_machine = df[title_summary_columns]
df_machine.head(n=8)

Unnamed: 0,title,summary
0,Four short links: 14 August 2019,"<p><em>Hardware Deplatforming, Hiring Groupthi..."
1,Four short links: 13 August 2019,"<p><em>Recognizing Fact, YouTube &amp; Brazil,..."
2,Four short links: 12 August 2019,"<p><em>Retro Hacking, Explaining AI, Teacher R..."
3,Blockchain solutions in enterprise,<p><img src='https://d3ucjech6zwjp8.cloudfront...
4,Four short links: 9 August 2019,"<p><em>Shadow Ban Patent, Abusing Unix Tools, ..."
5,Got speech? These guidelines will help you get...,<p><img src='https://d3ucjech6zwjp8.cloudfront...
6,Four short links: 8 August 2019,"<p><em>Counterfeit Security, Poses in Art, Con..."
7,New live online training courses,<p><img src='https://d3ucjech6zwjp8.cloudfront...


In [43]:
# Rows whose summary mentions "machine learning". The character class [Mm]
# makes the regex accept both "Machine learning" and "machine learning".
# na=False treats entries without a summary as non-matching, so the boolean
# mask never contains NaN (which would make the row selection fail).
dataframe = df[df['summary'].str.contains('[Mm]achine learning', regex=True, na=False)]

In [47]:
# Titles of the entries selected by the "machine learning" filter.
list(dataframe['title'])

['Got speech? These guidelines will help you get started building voice applications',
 'New live online training courses',
 'Four short links: 5 August 2019',
 'Learning from adversaries',
 'One simple graphic: Researchers love PyTorch and TensorFlow',
 'Acquiring and sharing high-quality data',
 "Highlights from the O'Reilly Open Source Software Conference in Portland 2019",
 'Managing machine learning in the enterprise: Lessons from banking and health care',
 "Highlights from the O'Reilly Artificial Intelligence Conference in Beijing 2019",
 'The future of machine learning is tiny']

In [59]:
# Sanity check for the filter: extract the matched phrase from each summary.
# The 10 titles listed by the filter should correspond to the 10 non-null
# rows produced here.
# expand=False returns a Series (not a one-column DataFrame), which is the
# natural type to assign to a single column.
df['Extract'] = df['summary'].str.extract('([Mm]achine learning)', expand=False)

In [57]:
# Same extraction restricted to the capitalised spelling "Machine learning".
# expand=False returns a Series (not a one-column DataFrame), which is the
# natural type to assign to a single column.
df['Extract1'] = df['summary'].str.extract('(Machine learning)', expand=False)

In [58]:
# Show the capitalised-only matches; NaN marks summaries without a match.
df['Extract1']

0                  NaN
1                  NaN
2                  NaN
3                  NaN
4                  NaN
5                  NaN
6                  NaN
7                  NaN
8                  NaN
9                  NaN
10                 NaN
11                 NaN
12                 NaN
13                 NaN
14                 NaN
15                 NaN
16                 NaN
17                 NaN
18                 NaN
19                 NaN
20                 NaN
21                 NaN
22                 NaN
23                 NaN
24                 NaN
25                 NaN
26                 NaN
27                 NaN
28                 NaN
29                 NaN
30                 NaN
31                 NaN
32                 NaN
33                 NaN
34                 NaN
35                 NaN
36                 NaN
37                 NaN
38                 NaN
39                 NaN
40                 NaN
41                 NaN
42    Machine learning
43         

In [60]:
# Show the case-tolerant matches; NaN marks summaries without a match.
df['Extract']

0                  NaN
1                  NaN
2                  NaN
3                  NaN
4                  NaN
5     machine learning
6                  NaN
7     machine learning
8                  NaN
9                  NaN
10    machine learning
11                 NaN
12                 NaN
13                 NaN
14                 NaN
15    machine learning
16                 NaN
17                 NaN
18                 NaN
19                 NaN
20                 NaN
21    machine learning
22                 NaN
23                 NaN
24                 NaN
25                 NaN
26                 NaN
27                 NaN
28                 NaN
29                 NaN
30                 NaN
31                 NaN
32    machine learning
33                 NaN
34    machine learning
35                 NaN
36                 NaN
37                 NaN
38                 NaN
39                 NaN
40                 NaN
41                 NaN
42    machine learning
43         

In [63]:
# Column names after the helper columns added above.
df.columns

Index(['author', 'author_detail', 'authors', 'content', 'feedburner_origlink',
       'guidislink', 'id', 'link', 'links', 'summary', 'title', 'title_detail',
       'updated', 'updated_parsed', 'title_length', 'Extract', 'Extract1'],
      dtype='object')