Skip to content

Commit

Permalink
Use feedparser for RSS parsing (#1362)
Browse files Browse the repository at this point in the history
Fixes #1171
Fixes #870 (probably; would need to test against a Wallabag Atom file to confirm)
Fixes #135
Fixes #123
Fixes #106
  • Loading branch information
pirate committed Mar 14, 2024
2 parents 3512dc7 + 0f402df commit 099f7d0
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 53 deletions.
48 changes: 20 additions & 28 deletions archivebox/parsers/generic_rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,41 @@


from typing import IO, Iterable
from datetime import datetime
from time import mktime
from feedparser import parse as feedparser

from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
enforce_types
)

@enforce_types
def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
# NOTE(review): this is a rendered diff without +/- markers and without
# indentation, so the earlier string-splitting implementation and the
# feedparser-based one both appear below (note the duplicated timestamp=
# and tags= arguments in the Link(...) call).

rss_file.seek(0)
# String-splitting implementation: cut the raw text on '<item>' and pick
# individual rows out of each item by their opening tag.
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>

trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')

# Returns the first row starting with '<key>'; the [0] raises IndexError
# when the item has no such row.
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
# feedparser-based implementation: let the library parse the document and
# read link/title/updated_parsed/tags off each entry.
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
title = item.title
# NOTE(review): time.mktime() interprets a struct_time as local time, while
# feedparser documents its *_parsed values as UTC -- confirm whether
# calendar.timegm() is what is wanted here.
time = mktime(item.updated_parsed)

# Join every tag term with commas; an AttributeError (presumably raised when
# the entry has no tags) falls back to an empty tag string.
try:
tags = ','.join(map(lambda tag: tag.term, item.tags))
except AttributeError:
tags = ''

# NOTE(review): `item.link` above is read via attribute access, which
# presumably raises before this check when the entry has no link --
# confirm this branch is reachable.
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue

yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=None,
tags=tags,
sources=[rss_file.name],
)

Expand Down
41 changes: 16 additions & 25 deletions archivebox/parsers/pinboard_rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,41 @@


from typing import IO, Iterable
from datetime import datetime, timezone

from xml.etree import ElementTree
from time import mktime
from feedparser import parse as feedparser

from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
enforce_types
)


@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""

rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore

url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
# title will start with "[priv] " if pin was marked private. useful?
title = item.title
time = mktime(item.updated_parsed)

# all tags are in one entry.tags with spaces in it. annoying!
try:
tags = item.tags[0].term.replace(' ', ',')
except AttributeError:
tags = ''

if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue

# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]

if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now(timezone.utc)

yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies = [
"dateparser>=1.0.0",
"django-extensions>=3.0.3",
"django>=3.1.3,<3.2",
"feedparser>=6.0.11",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",
Expand Down
24 changes: 24 additions & 0 deletions tests/mock_server/templates/example.atom
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<feed
xml:lang="en"
xmlns="http://www.w3.org/2005/Atom"
>
<id>http://www.example.com/</id>
<title>Example of an Atom feed</title>
<link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
<link rel="alternate" type="text/html" href="http://www.example.com/" />
<author>
<name>Jim Winstead</name>
</author>
<updated>2024-02-26T03:18:26Z</updated>
<entry>
<title>Example</title>
<link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
<id>tag:example.com,2024-02-25:3319</id>
<updated>2024-02-26T03:18:26Z</updated>
<published>2024-02-25T19:18:25-08:00</published>
<category term="Tag1" scheme="http://example.com/archive" />
<category term="Tag2" scheme="http://example.com/archive" />
<content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
</entry>
</feed>
32 changes: 32 additions & 0 deletions tests/mock_server/templates/example.rss
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<channel>
<title>Sample Feed</title>
<link>http://example.org/</link>
<description>For documentation only</description>
<dc:language>en-us</dc:language>
<dc:creator>Nobody (nobody@example.org)</dc:creator>
<dc:rights>Public domain</dc:rights>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<admin:generatorAgent rdf:resource="http://www.example.org/"/>
<admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>

<item>
<title>First!</title>
<link>http://127.0.0.1:8080/static/example.com.html</link>
<guid isPermaLink="false">just-an@example.org</guid>
<description>
This has a description.
</description>
<dc:subject>Tag1 Tag2</dc:subject>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<content:encoded><![CDATA[
This has a <b>description</b>.]]>
</content:encoded>
</item>
</channel>
</rss>
68 changes: 68 additions & 0 deletions tests/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,71 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags

def test_generic_rss(tmp_path, process, disable_extractors_dict):
    """Import the sample RSS 2.0 feed through the generic ``rss`` parser.

    Pipes ``example.rss`` into ``archivebox add --index-only --parser=rss`` on
    stdin, then reads the snapshot URLs and tag names back out of
    ``index.sqlite3`` and checks that the item was indexed.

    ``tmp_path`` and ``process`` are not referenced in the body; they are
    requested as pytest fixtures (presumably to set up the archive directory
    the command runs in).
    """
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    try:
        # Read-only queries: no commit needed, but always release the handle,
        # even if one of the SELECTs raises.
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://purl.org/dc/elements/1.1/" not in urls

    # The fixture's single <dc:subject>Tag1 Tag2</dc:subject> is expected to
    # come through as one tag with this parser.
    assert "Tag1 Tag2" in tags

def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
    """Import the sample RSS feed through the ``pinboard_rss`` parser.

    Pipes ``example.rss`` into
    ``archivebox add --index-only --parser=pinboard_rss`` on stdin, then reads
    the tag names back out of ``index.sqlite3``. The fixture carries its tags
    as one space-separated ``<dc:subject>Tag1 Tag2</dc:subject>``, and this
    test expects them to be stored as two separate tags.

    ``tmp_path`` and ``process`` are not referenced in the body; they are
    requested as pytest fixtures (presumably to set up the archive directory
    the command runs in).
    """
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    try:
        # Read-only query: no commit needed, but always release the handle,
        # even if the SELECT raises.
        c = conn.cursor()
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "Tag1" in tags
    assert "Tag2" in tags

def test_atom(tmp_path, process, disable_extractors_dict):
    """Import the sample Atom feed through the generic ``rss`` parser.

    Pipes ``example.atom`` into ``archivebox add --index-only --parser=rss`` on
    stdin, then reads the snapshot URLs and tag names back out of
    ``index.sqlite3``. The entry's two ``<category term=...>`` elements are
    expected to be stored as the tags ``Tag1`` and ``Tag2``.

    ``tmp_path`` and ``process`` are not referenced in the body; they are
    requested as pytest fixtures (presumably to set up the archive directory
    the command runs in).
    """
    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    try:
        # Read-only queries: no commit needed, but always release the handle,
        # even if one of the SELECTs raises.
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://www.w3.org/2005/Atom" not in urls

    assert "Tag1" in tags
    assert "Tag2" in tags

0 comments on commit 099f7d0

Please sign in to comment.