Skip to content

Commit

Permalink
Implement Web scraping "HTML + XPath" (#4220)
Browse files Browse the repository at this point in the history
* More PHP type hints for Fever
Follow-up of #4201
Related to #4200

* Detail

* Draft

* Progress

* More draft

* Fix thumbnail PHP type hint
#4215

* More types

* A bit more

* Refactor FreshRSS_Entry::fromArray

* Progress

* Starts to work

* Categories

* Functional

* Layout update

* Fix relative URLs

* Cache system

* Forgotten files

* Remove a debug line

* Automatic form validation of XPath expressions

* data-leave-validation

* Fix reload action

* Simpler examples

* Fix column type for PostgreSQL

* Enforce HTTP encoding

* Readme

* Fix get full content

* target="_blank"

* gitignore

* htmlspecialchars_utf8

* Implement HTML <base>
And fix/revert `xml:base` support in SimplePie simplepie/simplepie@e49c578

* SimplePie upstream PR merged
simplepie/simplepie#723
  • Loading branch information
Alkarex committed Feb 28, 2022
1 parent fa23ae7 commit 1fe66ad
Show file tree
Hide file tree
Showing 56 changed files with 1,567 additions and 155 deletions.
2 changes: 2 additions & 0 deletions README.fr.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ Il y a une API pour les clients (mobiles), ainsi qu’une [interface en ligne de
Grâce au standard [WebSub](https://www.w3.org/TR/websub/) (anciennement [PubSubHubbub](https://github.com/pubsubhubbub/PubSubHubbub)),
FreshRSS est capable de recevoir des notifications push instantanées depuis les sources compatibles, telles [Mastodon](https://joinmastodon.org), [Friendica](https://friendi.ca), [WordPress](https://wordpress.org/plugins/pubsubhubbub/), Blogger, FeedBurner, etc.

FreshRSS supporte nativement le moissonnage du Web (Web Scraping) basique, basé sur [XPath](https://www.w3.org/TR/xpath-10/), pour les sites Web sans flux RSS / Atom.

Enfin, il permet l’ajout d’[extensions](#extensions) pour encore plus de personnalisation.

Les demandes de fonctionnalités, rapports de bugs, et autres contributions sont les bienvenues. Privilégiez pour cela des [demandes sur GitHub](https://github.com/FreshRSS/FreshRSS/issues).
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ There is an API for (mobile) clients, and a [Command-Line Interface](cli/README.
Thanks to the [WebSub](https://www.w3.org/TR/websub/) standard (formerly [PubSubHubbub](https://github.com/pubsubhubbub/PubSubHubbub)),
FreshRSS is able to receive instant push notifications from compatible sources, such as [Mastodon](https://joinmastodon.org), [Friendica](https://friendi.ca), [WordPress](https://wordpress.org/plugins/pubsubhubbub/), Blogger, FeedBurner, etc.

FreshRSS natively supports basic Web scraping, based on [XPath](https://www.w3.org/TR/xpath-10/), for Web sites not providing any RSS / Atom feed.

Finally, it supports [extensions](#extensions) for further tuning.

Feature requests, bug reports, and other contributions are welcome. The best way to contribute is to [open an issue on GitHub](https://github.com/FreshRSS/FreshRSS/issues).
Expand Down
48 changes: 43 additions & 5 deletions app/Controllers/feedController.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public function firstAction() {
* @throws FreshRSS_Feed_Exception
* @throws Minz_FileNotExistException
*/
public static function addFeed($url, $title = '', $cat_id = 0, $new_cat_name = '', $http_auth = '', $attributes = array()) {
public static function addFeed($url, $title = '', $cat_id = 0, $new_cat_name = '', $http_auth = '', $attributes = array(), $kind = FreshRSS_Feed::KIND_RSS) {
FreshRSS_UserDAO::touch();
@set_time_limit(300);

Expand Down Expand Up @@ -67,10 +67,19 @@ public static function addFeed($url, $title = '', $cat_id = 0, $new_cat_name = '
$cat_id = $cat == null ? FreshRSS_CategoryDAO::DEFAULTCATEGORYID : $cat->id();

$feed = new FreshRSS_Feed($url); //Throws FreshRSS_BadUrl_Exception
$feed->_kind($kind);
$feed->_attributes('', $attributes);
$feed->_httpAuth($http_auth);
$feed->load(true); //Throws FreshRSS_Feed_Exception, Minz_FileNotExistException
$feed->_category($cat_id);
switch ($kind) {
case FreshRSS_Feed::KIND_RSS:
case FreshRSS_Feed::KIND_RSS_FORCED:
$feed->load(true); //Throws FreshRSS_Feed_Exception, Minz_FileNotExistException
break;
case FreshRSS_Feed::KIND_HTML_XPATH:
$feed->_website($url);
break;
}

$feedDAO = FreshRSS_Factory::createFeedDao();
if ($feedDAO->searchByUrl($feed->url())) {
Expand All @@ -85,8 +94,9 @@ public static function addFeed($url, $title = '', $cat_id = 0, $new_cat_name = '

$values = array(
'url' => $feed->url(),
'kind' => $feed->kind(),
'category' => $feed->category(),
'name' => $title != '' ? $title : $feed->name(),
'name' => $title != '' ? $title : $feed->name(true),
'website' => $feed->website(),
'description' => $feed->description(),
'lastUpdate' => 0,
Expand Down Expand Up @@ -184,8 +194,25 @@ public function addAction() {
$timeout = intval(Minz_Request::param('timeout', 0));
$attributes['timeout'] = $timeout > 0 ? $timeout : null;

$feed_kind = Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS);
if ($feed_kind == FreshRSS_Feed::KIND_HTML_XPATH) {
$xPathSettings = [];
if (Minz_Request::param('xPathFeedTitle', '') != '') $xPathSettings['feedTitle'] = Minz_Request::param('xPathFeedTitle', '', true);
if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true);
if (Minz_Request::param('xPathItemTitle', '') != '') $xPathSettings['itemTitle'] = Minz_Request::param('xPathItemTitle', '', true);
if (Minz_Request::param('xPathItemContent', '') != '') $xPathSettings['itemContent'] = Minz_Request::param('xPathItemContent', '', true);
if (Minz_Request::param('xPathItemUri', '') != '') $xPathSettings['itemUri'] = Minz_Request::param('xPathItemUri', '', true);
if (Minz_Request::param('xPathItemAuthor', '') != '') $xPathSettings['itemAuthor'] = Minz_Request::param('xPathItemAuthor', '', true);
if (Minz_Request::param('xPathItemTimestamp', '') != '') $xPathSettings['itemTimestamp'] = Minz_Request::param('xPathItemTimestamp', '', true);
if (Minz_Request::param('xPathItemThumbnail', '') != '') $xPathSettings['itemThumbnail'] = Minz_Request::param('xPathItemThumbnail', '', true);
if (Minz_Request::param('xPathItemCategories', '') != '') $xPathSettings['itemCategories'] = Minz_Request::param('xPathItemCategories', '', true);
if (!empty($xPathSettings)) {
$attributes['xpath'] = $xPathSettings;
}
}

try {
$feed = self::addFeed($url, '', $cat, '', $http_auth, $attributes);
$feed = self::addFeed($url, '', $cat, '', $http_auth, $attributes, $feed_kind);
} catch (FreshRSS_BadUrl_Exception $e) {
// Given url was not a valid url!
Minz_Log::warning($e->getMessage());
Expand Down Expand Up @@ -264,6 +291,14 @@ public function truncateAction() {
}
}

/**
* @param int $feed_id
* @param string $feed_url
* @param bool $force
* @param SimplePie|null $simplePiePush
* @param bool $noCommit
* @param int $maxFeeds
*/
public static function actualizeFeed($feed_id, $feed_url, $force, $simplePiePush = null, $noCommit = false, $maxFeeds = 10) {
@set_time_limit(300);

Expand Down Expand Up @@ -338,6 +373,8 @@ public static function actualizeFeed($feed_id, $feed_url, $force, $simplePiePush
try {
if ($simplePiePush) {
$simplePie = $simplePiePush; //Used by WebSub
} elseif ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) {
$simplePie = $feed->loadHtmlXpath(false, $isNewFeed);
} else {
$simplePie = $feed->load(false, $isNewFeed);
}
Expand Down Expand Up @@ -377,6 +414,7 @@ public static function actualizeFeed($feed_id, $feed_url, $force, $simplePiePush

$oldGuids = array();
// Add entries in database if possible.
/** @var FreshRSS_Entry $entry */
foreach ($entries as $entry) {
if (isset($newGuids[$entry->guid()])) {
continue; //Skip subsequent articles with same GUID
Expand Down Expand Up @@ -765,7 +803,7 @@ public function reloadAction() {

//Re-fetch articles as if the feed was new.
$feedDAO->updateFeed($feed->id(), [ 'lastUpdate' => 0 ]);
self::actualizeFeed($feed_id, null, false, null, true);
self::actualizeFeed($feed_id, '', false);

//Extract all feed entries from database, load complete content and store them back in database.
$entries = $entryDAO->listWhere('f', $feed_id, FreshRSS_Entry::STATE_ALL, 'DESC', 0);
Expand Down
2 changes: 1 addition & 1 deletion app/Controllers/indexController.php
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ public function rssAction() {
}

// No layout for RSS output.
$this->view->url = PUBLIC_TO_INDEX_PATH . '/' . (empty($_SERVER['QUERY_STRING']) ? '' : '?' . $_SERVER['QUERY_STRING']);
$this->view->rss_url = PUBLIC_TO_INDEX_PATH . '/' . (empty($_SERVER['QUERY_STRING']) ? '' : '?' . $_SERVER['QUERY_STRING']);
$this->view->rss_title = FreshRSS_Context::$name . ' | ' . FreshRSS_View::title();
$this->view->_layout(false);
header('Content-Type: application/rss+xml; charset=utf-8');
Expand Down
18 changes: 18 additions & 0 deletions app/Controllers/subscriptionController.php
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,26 @@ public function feedAction() {

$feed->_filtersAction('read', preg_split('/[\n\r]+/', Minz_Request::param('filteractions_read', '')));

$feed_kind = Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS);
if ($feed_kind == FreshRSS_Feed::KIND_HTML_XPATH) {
$xPathSettings = [];
if (Minz_Request::param('xPathFeedTitle', '') != '') $xPathSettings['feedTitle'] = Minz_Request::param('xPathFeedTitle', '', true);
if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true);
if (Minz_Request::param('xPathItemTitle', '') != '') $xPathSettings['itemTitle'] = Minz_Request::param('xPathItemTitle', '', true);
if (Minz_Request::param('xPathItemContent', '') != '') $xPathSettings['itemContent'] = Minz_Request::param('xPathItemContent', '', true);
if (Minz_Request::param('xPathItemUri', '') != '') $xPathSettings['itemUri'] = Minz_Request::param('xPathItemUri', '', true);
if (Minz_Request::param('xPathItemAuthor', '') != '') $xPathSettings['itemAuthor'] = Minz_Request::param('xPathItemAuthor', '', true);
if (Minz_Request::param('xPathItemTimestamp', '') != '') $xPathSettings['itemTimestamp'] = Minz_Request::param('xPathItemTimestamp', '', true);
if (Minz_Request::param('xPathItemThumbnail', '') != '') $xPathSettings['itemThumbnail'] = Minz_Request::param('xPathItemThumbnail', '', true);
if (Minz_Request::param('xPathItemCategories', '') != '') $xPathSettings['itemCategories'] = Minz_Request::param('xPathItemCategories', '', true);
if (!empty($xPathSettings)) {
$feed->_attributes('xpath', $xPathSettings);
}
}

$values = array(
'name' => Minz_Request::param('name', ''),
'kind' => $feed_kind,
'description' => sanitizeHTML(Minz_Request::param('description', '', true)),
'website' => checkUrl(Minz_Request::param('website', '')),
'url' => checkUrl(Minz_Request::param('url', '')),
Expand Down
96 changes: 49 additions & 47 deletions app/Models/Entry.php
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,38 @@ public function __construct(int $feedId = 0, string $guid = '', string $title =
$this->_guid($guid);
}

/**
 * Build a FreshRSS_Entry from an associative array.
 *
 * Keys read: `id_feed`, `guid`, `title`, `author`, `content`, `link`, `date`,
 * `is_read`, `is_favorite`, `tags`, plus the optional `id`, `thumbnail`,
 * `timestamp` and `categories`. Missing keys fall back to neutral defaults.
 *
 * - A non-empty `thumbnail` is appended to the content as an enclosure image.
 * - A non-empty `timestamp` is parsed with strtotime() and overrides `date`.
 * - Non-empty `categories` override `tags`.
 *
 * @param array<string,mixed> $dao
 */
public static function fromArray(array $dao): FreshRSS_Entry {
	if (!isset($dao['content'])) {
		$dao['content'] = '';
	}
	// Only add an enclosure when there is an actual URL: an empty string would
	// otherwise produce a useless <img src="" /> element.
	// NOTE(review): the URL is inserted as-is; presumably it is already HTML-escaped upstream — confirm.
	if (!empty($dao['thumbnail'])) {
		$dao['content'] .= '<p class="enclosure-content"><img src="' . $dao['thumbnail'] . '" alt="" /></p>';
	}
	$entry = new FreshRSS_Entry(
		$dao['id_feed'] ?? 0,
		$dao['guid'] ?? '',
		$dao['title'] ?? '',
		$dao['author'] ?? '',
		$dao['content'],
		$dao['link'] ?? '',
		$dao['date'] ?? 0,
		$dao['is_read'] ?? false,
		$dao['is_favorite'] ?? false,
		$dao['tags'] ?? ''
	);
	if (isset($dao['id'])) {
		$entry->_id($dao['id']);
	}
	if (!empty($dao['timestamp'])) {
		// strtotime() returns false for unparseable input; pass an integer in that case.
		$entry->_date(strtotime($dao['timestamp']) ?: 0);
	}
	if (!empty($dao['categories'])) {
		$entry->_tags($dao['categories']);
	}
	return $entry;
}

public function id(): string {
return $this->id;
}
Expand All @@ -83,6 +115,7 @@ public function content(): string {
return $this->content;
}

/** @return array<array<string,string>> */
public function enclosures(bool $searchBodyImages = false): array {
$results = [];
try {
Expand All @@ -97,11 +130,20 @@ public function enclosures(bool $searchBodyImages = false): array {
if ($searchEnclosures) {
$enclosures = $xpath->query('//div[@class="enclosure"]/p[@class="enclosure-content"]/*[@src]');
foreach ($enclosures as $enclosure) {
$results[] = [
$result = [
'url' => $enclosure->getAttribute('src'),
'type' => $enclosure->getAttribute('data-type'),
'medium' => $enclosure->getAttribute('data-medium'),
'length' => $enclosure->getAttribute('data-length'),
];
if (empty($result['medium'])) {
switch (strtolower($enclosure->nodeName)) {
case 'img': $result['medium'] = 'image'; break;
case 'video': $result['medium'] = 'video'; break;
case 'audio': $result['medium'] = 'audio'; break;
}
}
$results[] = $result;
}
}
if ($searchBodyImages) {
Expand Down Expand Up @@ -432,52 +474,12 @@ public function isDay(int $day, int $today): bool {
}
}

public static function getContentByParsing(string $url, string $path, array $attributes = array(), int $maxRedirs = 3): string {
$limits = FreshRSS_Context::$system_conf->limits;
$feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);

if (FreshRSS_Context::$system_conf->simplepie_syslog_enabled) {
syslog(LOG_INFO, 'FreshRSS GET ' . SimplePie_Misc::url_remove_credentials($url));
}

$ch = curl_init();
curl_setopt_array($ch, [
CURLOPT_URL => $url,
CURLOPT_REFERER => SimplePie_Misc::url_remove_credentials($url),
CURLOPT_HTTPHEADER => array('Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
CURLOPT_CONNECTTIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
CURLOPT_TIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
//CURLOPT_FAILONERROR => true;
CURLOPT_MAXREDIRS => 4,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_ENCODING => '', //Enable all encodings
]);

curl_setopt_array($ch, FreshRSS_Context::$system_conf->curl_options);

if (isset($attributes['curl_params']) && is_array($attributes['curl_params'])) {
curl_setopt_array($ch, $attributes['curl_params']);
}

if (isset($attributes['ssl_verify'])) {
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $attributes['ssl_verify'] ? 2 : 0);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $attributes['ssl_verify'] ? true : false);
if (!$attributes['ssl_verify']) {
curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
}
}
$html = curl_exec($ch);
$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$c_error = curl_error($ch);
curl_close($ch);

if ($c_status != 200 || $c_error != '') {
Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
}

if (is_string($html) && strlen($html) > 0) {
/**
* @param array<string,mixed> $attributes
*/
public static function getContentByParsing(string $url, string $path, array $attributes = [], int $maxRedirs = 3): string {
$html = getHtml($url, $attributes);
if (strlen($html) > 0) {
require_once(LIB_PATH . '/lib_phpQuery.php');
/**
* @var phpQueryObject @doc
Expand Down
31 changes: 7 additions & 24 deletions app/Models/EntryDAO.php
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ public function commitNewEntries() {
)
SELECT @rank:=@rank+1 AS id, guid, title, author, content_bin, link, date, `lastSeen`, hash, is_read, is_favorite, id_feed, tags
FROM `_entrytmp`
ORDER BY date;
ORDER BY date, id;
DELETE FROM `_entrytmp` WHERE id <= @rank;
SQL;
Expand Down Expand Up @@ -658,6 +658,7 @@ public function selectAll() {
}
}

/** @return FreshRSS_Entry|null */
public function searchByGuid($id_feed, $guid) {
// a GUID is unique for a given feed
$sql = 'SELECT id, guid, title, author, '
Expand All @@ -669,9 +670,10 @@ public function searchByGuid($id_feed, $guid) {
$stm->bindParam(':guid', $guid);
$stm->execute();
$res = $stm->fetchAll(PDO::FETCH_ASSOC);
return isset($res[0]) ? self::daoToEntry($res[0]) : null;
return isset($res[0]) ? FreshRSS_Entry::fromArray($res[0]) : null;
}

/** @return FreshRSS_Entry|null */
public function searchById($id) {
$sql = 'SELECT id, guid, title, author, '
. ($this->isCompressed() ? 'UNCOMPRESS(content_bin) AS content' : 'content')
Expand All @@ -681,7 +683,7 @@ public function searchById($id) {
$stm->bindParam(':id', $id, PDO::PARAM_INT);
$stm->execute();
$res = $stm->fetchAll(PDO::FETCH_ASSOC);
return isset($res[0]) ? self::daoToEntry($res[0]) : null;
return isset($res[0]) ? FreshRSS_Entry::fromArray($res[0]) : null;
}

public function searchIdByGuid($id_feed, $guid) {
Expand Down Expand Up @@ -1061,7 +1063,7 @@ public function listWhere($type = 'a', $id = '', $state = FreshRSS_Entry::STATE_
$stm = $this->listWhereRaw($type, $id, $state, $order, $limit, $firstId, $filters, $date_min);
if ($stm) {
while ($row = $stm->fetch(PDO::FETCH_ASSOC)) {
yield self::daoToEntry($row);
yield FreshRSS_Entry::fromArray($row);
}
} else {
yield false;
Expand Down Expand Up @@ -1092,7 +1094,7 @@ public function listByIds($ids, $order = 'DESC') {
$stm = $this->pdo->prepare($sql);
$stm->execute($ids);
while ($row = $stm->fetch(PDO::FETCH_ASSOC)) {
yield self::daoToEntry($row);
yield FreshRSS_Entry::fromArray($row);
}
}

Expand Down Expand Up @@ -1251,23 +1253,4 @@ public function countUnreadReadFavorites() {
$unread = empty($res[1]) ? 0 : intval($res[1]);
return array('all' => $all, 'unread' => $unread, 'read' => $all - $unread);
}

/**
 * Convert one database row into a FreshRSS_Entry.
 *
 * The row must provide `id_feed`, `guid`, `title`, `author`, `content`, `link`,
 * `date`, `is_read` and `is_favorite`; `tags` defaults to an empty string and
 * `id` is only applied when present.
 *
 * @param array<string,mixed> $dao
 * @return FreshRSS_Entry
 */
public static function daoToEntry($dao) {
	$tags = $dao['tags'] ?? '';
	$result = new FreshRSS_Entry(
		$dao['id_feed'],
		$dao['guid'],
		$dao['title'],
		$dao['author'],
		$dao['content'],
		$dao['link'],
		$dao['date'],
		$dao['is_read'],
		$dao['is_favorite'],
		$tags
	);
	if (!isset($dao['id'])) {
		return $result;
	}
	$result->_id($dao['id']);
	return $result;
}
}
4 changes: 2 additions & 2 deletions app/Models/EntryDAOPGSQL.php
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ public function commitNewEntries() {
BEGIN
INSERT INTO `_entry`
(id, guid, title, author, content, link, date, `lastSeen`, hash, is_read, is_favorite, id_feed, tags)
(SELECT rank + row_number() OVER(ORDER BY date) AS id, guid, title, author, content,
(SELECT rank + row_number() OVER(ORDER BY date, id) AS id, guid, title, author, content,
link, date, `lastSeen`, hash, is_read, is_favorite, id_feed, tags
FROM `_entrytmp` AS etmp
WHERE NOT EXISTS (
SELECT 1 FROM `_entry` AS ereal
WHERE (etmp.id = ereal.id) OR (etmp.id_feed = ereal.id_feed AND etmp.guid = ereal.guid))
ORDER BY date);
ORDER BY date, id);
DELETE FROM `_entrytmp` WHERE id <= maxrank;
END $$;';
$hadTransaction = $this->pdo->inTransaction();
Expand Down
Loading

0 comments on commit 1fe66ad

Please sign in to comment.