Skip to content

Commit

Permalink
Refactor parser data fetching, fix data fetching with redirects
Browse files Browse the repository at this point in the history
- refer to OntoWiki issue #101
- data fetching was refactored and put into the RDF parser class
- adapters do not fetch RDF anymore
- up to 10 redirects are now followed
- Virtuoso store adapter does not rely on Virtuoso data fetching anymore, since it
  does not follow redirects (at least 302)
- tests added for both the Virtuoso adapter (integration test) and the RDF parser class
  • Loading branch information
Philipp Frischmuth committed Sep 12, 2012
1 parent c07e7c6 commit 12c2526
Show file tree
Hide file tree
Showing 10 changed files with 6,571 additions and 115 deletions.
52 changes: 44 additions & 8 deletions library/Erfurt/Store/Adapter/Virtuoso.php
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ class Erfurt_Store_Adapter_Virtuoso implements Erfurt_Store_Adapter_Interface, E

private $_isOpenSourceVersion = true;

private $_httpClientAdapter = null;

// ------------------------------------------------------------------------
// --- Magic Methods ------------------------------------------------------
// ------------------------------------------------------------------------
Expand Down Expand Up @@ -560,23 +562,30 @@ public function importRdf($graphUri, $data, $type, $locator)
break;
}

$parser = $this->_rdfParser($type);
switch ($locator) {
case Erfurt_Syntax_RdfParser::LOCATOR_FILE:
$importSql = $this->_getImportSql('file', $data, $type, $graphUri);
break;

case Erfurt_Syntax_RdfParser::LOCATOR_URL:
// do some type guesswork
if (
substr($data, -2) == 'n3' ||
substr($data, -2) == 'nt' ||
substr($data, -3) == 'ttl'
) {
if (substr($data, -2) == 'n3' || substr($data, -2) == 'nt' || substr($data, -3) == 'ttl') {
$type = 'n3';
}
$importSql = $this->_getImportSql('url', $data, $type, $graphUri);
break;

$dataString = $parser->fetchDataFromUrl($data);
if (!$dataString) {
throw new Erfurt_Store_Adapter_Exception(
'Error importing statements: Failed to retrieve data for URL: ' . $data
);
}

$importSql = $this->_getImportSql('string', $dataString, $type, $graphUri);
break;
case Erfurt_Syntax_RdfParser::LOCATOR_DATASTRING:
$importSql = $this->_getImportSql('string', $data, $type, $graphUri);
break;
default:
throw new Erfurt_Store_Adapter_Exception("Locator '$locator' not supported by Virtuoso.");
break;
Expand All @@ -587,7 +596,6 @@ public function importRdf($graphUri, $data, $type, $locator)
$rid = $this->_execSql($importSql);

// parse namespace prefixes
$parser = Erfurt_Syntax_RdfParser::rdfParserWithFormat($type);
$namespacePrefixes = $parser->parseNamespaces($data, $locator);
$namespaces = Erfurt_App::getInstance()->getNamespaces();

Expand Down Expand Up @@ -1147,6 +1155,14 @@ protected function _getImportSql($method, $data, $type, $graphUri, $baseUri = nu
$baseUri,
$graphUri
);
} else if ($method === 'string') {
$importSql = sprintf(
"CALL DB.DBA.%s('%s', '%s', '%s')",
$importFunc,
addslashes($data),
$baseUri,
$graphUri
);
} else {
// import using internal Virtuoso/PL function
$importSql = sprintf(
Expand Down Expand Up @@ -1282,4 +1298,24 @@ private function _xyz()

return $logo;
}

/**
 * Builds an RDF parser for the given format.
 *
 * When an HTTP client adapter has been set on this store adapter (see
 * setHttpClientAdapter()), it is handed on to the newly created parser.
 *
 * @param string $type Format identifier passed to Erfurt_Syntax_RdfParser::rdfParserWithFormat().
 * @return Erfurt_Syntax_RdfParser
 */
private function _rdfParser($type)
{
    $rdfParser = Erfurt_Syntax_RdfParser::rdfParserWithFormat($type);

    // Nothing injected: return the parser untouched.
    if ($this->_httpClientAdapter === null) {
        return $rdfParser;
    }

    $rdfParser->setHttpClientAdapter($this->_httpClientAdapter);

    return $rdfParser;
}

/**
* Stores an HTTP client adapter that _rdfParser() hands on to each RDF parser
* it creates. Intended for testing purposes.
*
* @param mixed $httpClientAdapter Passed unchanged to Erfurt_Syntax_RdfParser::setHttpClientAdapter();
*                                 presumably a Zend_Http_Client adapter -- TODO confirm accepted types.
* @return void
*/
public function setHttpClientAdapter($httpClientAdapter)
{
$this->_httpClientAdapter = $httpClientAdapter;
}
}
111 changes: 102 additions & 9 deletions library/Erfurt/Syntax/RdfParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,16 @@ class Erfurt_Syntax_RdfParser
const LOCATOR_URL = 10;
const LOCATOR_FILE = 20;
const LOCATOR_DATASTRING = 30;

protected $_parserAdapter = null;

/**
* @var Erfurt_Syntax_RdfParser_Adapter_Interface
*/
private $_parserAdapter = null;

private $_httpClient = null;
private $_httpClientAdapter = null;

private $_dataCache = array();

public static function rdfParserWithFormat($format)
{
Expand Down Expand Up @@ -65,26 +73,57 @@ public function reset()
}

/**
 * Parses RDF data referenced by a URL, a filename or given directly as a string.
 *
 * For LOCATOR_URL the data is retrieved exactly once through fetchDataFromUrl()
 * and then handed to the parser adapter as a data string; the adapter is not
 * asked to fetch the URL itself.
 *
 * @param string $dataPointer E.g. a filename, a url or the data to parse itself.
 * @param int $pointerType One of the supported pointer types (self::LOCATOR_*).
 * @param string|null $baseUri Base URI handed to the adapter for URL and data string input.
 * @return array Returns an RDF/PHP array.
 * @throws Erfurt_Syntax_RdfParserException If no data could be fetched from the URL or the
 *                                          pointer type is not one of the LOCATOR_* constants.
 */
public function parse($dataPointer, $pointerType, $baseUri = null)
{
    if ($pointerType === self::LOCATOR_URL) {
        // A previous adapter-side parseFromUrl() call was dropped here: its result was
        // overwritten below and it caused a second fetch that bypassed fetchDataFromUrl().
        $dataString = $this->fetchDataFromUrl($dataPointer);
        if (!$dataString) {
            throw new Erfurt_Syntax_RdfParserException('Failed to fetch data from URL:' . $dataPointer);
        }
        $result = $this->_parserAdapter->parseFromDataString($dataString, $baseUri);
    } else if ($pointerType === self::LOCATOR_FILE) {
        $result = $this->_parserAdapter->parseFromFilename($dataPointer);
    } else if ($pointerType === self::LOCATOR_DATASTRING) {
        $result = $this->_parserAdapter->parseFromDataString($dataPointer, $baseUri);
    } else {
        require_once 'Erfurt/Syntax/RdfParserException.php';
        throw new Erfurt_Syntax_RdfParserException('Type of data pointer not valid.');
    }

    return $result;
}


/**
 * Fetches the body of the resource at the given URL, caching the outcome per URL.
 *
 * A response with status 200 is cached as its body string; any other status is
 * cached as boolean true so that the URL is not requested again by this parser.
 *
 * @param string $url URL to fetch; all whitespace is stripped before use.
 * @return string|false The response body, or false if no body is cached for the URL.
 */
public function fetchDataFromUrl($url)
{
    // replace all whitespaces (prevent possible CRLF Injection attacks)
    // http://www.acunetix.com/websitesecurity/crlf-injection.htm
    $url = preg_replace('/\\s+/', '', $url);

    if (!isset($this->_dataCache[$url])) {
        $response = $this->_httpClient($url)->request();

        // true marks the URL as already fetched without usable data
        $this->_dataCache[$url] = ($response->getStatus() === 200)
            ? $response->getBody()
            : true;
    }

    $cachedEntry = $this->_dataCache[$url];

    return is_string($cachedEntry) ? $cachedEntry : false;
}

/**
* Call this method after parsing only. The function parseToStore will add namespaces automatically.
* This method is just for situations where the namespaces are needed after an in-memory parsing.
Expand All @@ -103,7 +142,12 @@ public function getNamespaces()
public function parseNamespaces($dataPointer, $pointerType)
{
if ($pointerType === self::LOCATOR_URL) {
$result = $this->_parserAdapter->parseNamespacesFromUrl($dataPointer);
$dataString = $this->fetchDataFromUrl($dataPointer);
if (!$dataString) {
throw new Erfurt_Syntax_RdfParserException('Failed to fetch data from URL:' . $dataPointer);
}

$result = $this->_parserAdapter->parseNamespacesFromDataString($dataString);
} else if ($pointerType === self::LOCATOR_FILE) {
$result = $this->_parserAdapter->parseNamespacesFromFilename($dataPointer);
} else if ($pointerType === self::LOCATOR_DATASTRING) {
Expand All @@ -124,7 +168,12 @@ public function getBaseUri()
public function parseToStore($dataPointer, $pointerType, $modelUri, $useAc = true, $baseUri = null)
{
if ($pointerType === self::LOCATOR_URL) {
$result = $this->_parserAdapter->parseFromUrlToStore($dataPointer, $modelUri, $useAc);
$dataString = $this->fetchDataFromUrl($dataPointer);
if (!$dataString) {
throw new Erfurt_Syntax_RdfParserException('Failed to fetch data from URL:' . $dataPointer);
}

$result = $this->_parserAdapter->parseFromDataStringToStore($dataString, $modelUri, $useAc, $baseUri);
} else if ($pointerType === self::LOCATOR_FILE) {
$result = $this->_parserAdapter->parseFromFilenameToStore($dataPointer, $modelUri, $useAc);
} else if ($pointerType === self::LOCATOR_DATASTRING) {
Expand All @@ -136,4 +185,48 @@ public function parseToStore($dataPointer, $pointerType, $modelUri, $useAc = tru

return $result;
}

/**
 * Returns the HTTP client used to retrieve remote data, creating it on first use.
 *
 * The client is created once with 'maxredirects' => 10 and 'timeout' => 30 and,
 * if set, the adapter injected via setHttpClientAdapter(). An Accept header that
 * matches the parser adapter (RDF/XML, Turtle or RDF/JSON, each with text/plain
 * as an alternative) is set at creation time.
 *
 * On every later call with a non-null $url the cached client is pointed at that
 * URL. Without this, the client would keep the URI of the first request (or of
 * the last redirect target) and fetch the wrong resource.
 *
 * @param string|null $url URL the next request should go to.
 * @return Zend_Http_Client
 */
private function _httpClient($url = null)
{
    if (null === $this->_httpClient) {
        $options = array(
            'maxredirects' => 10,
            'timeout' => 30
        );

        if (null !== $this->_httpClientAdapter) {
            $options['adapter'] = $this->_httpClientAdapter;
        }

        $this->_httpClient = Erfurt_App::getInstance()->getHttpClient(
            $url,
            $options
        );

        // Ask for the serialization the configured parser adapter can read.
        if ($this->_parserAdapter instanceof Erfurt_Syntax_RdfParser_Adapter_RdfXml) {
            $this->_httpClient->setHeaders('Accept', 'application/rdf+xml, text/plain');
        } else if ($this->_parserAdapter instanceof Erfurt_Syntax_RdfParser_Adapter_Turtle) {
            $this->_httpClient->setHeaders('Accept', 'text/turtle, text/plain');
        } else if ($this->_parserAdapter instanceof Erfurt_Syntax_RdfParser_Adapter_RdfJson) {
            $this->_httpClient->setHeaders('Accept', 'application/rdf+json, text/plain');
        }
    } else if (null !== $url) {
        // Reused client: retarget it, otherwise it still points at the previous URI.
        $this->_httpClient->setUri($url);
    }

    return $this->_httpClient;
}

/**
* For testing purposes the adapter of the HTTP client used for retrieving remote data can be overwritten.
*
* The stored value is passed as the 'adapter' option when _httpClient() creates the client.
* That option is only read at creation time, so this setter has no effect on a client that
* has already been created by an earlier URL fetch.
*
* @param mixed $httpClientAdapter Value for the client's 'adapter' option; presumably a
*                                 Zend_Http_Client adapter -- TODO confirm accepted types.
* @return void
*/
public function setHttpClientAdapter($httpClientAdapter)
{
$this->_httpClientAdapter = $httpClientAdapter;
}
}
7 changes: 2 additions & 5 deletions library/Erfurt/Syntax/RdfParser/Adapter/Interface.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,12 @@ interface Erfurt_Syntax_RdfParser_Adapter_Interface
{
public function parseFromDataString($dataString);
public function parseFromFilename($filename);
public function parseFromUrl($url);


public function parseFromDataStringToStore($dataString, $graphUri, $useAc = true);
public function parseFromFilenameToStore($filename, $graphUri, $useAc = true);
public function parseFromUrlToStore($filename, $graphUri, $useAc = true);

public function parseNamespacesFromDataString($dataString);
public function parseNamespacesFromFilename($filename);
public function parseNamespacesFromUrl($url);


public function getBaseUri();
}
46 changes: 3 additions & 43 deletions library/Erfurt/Syntax/RdfParser/Adapter/RdfJson.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
* @license http://opensource.org/licenses/gpl-license.php GNU General Public License (GPL)
*/
class Erfurt_Syntax_RdfParser_Adapter_RdfJson extends Erfurt_Syntax_RdfParser_Adapter_Base
{

{
public function parseFromDataString($dataString, $baseUri = null, $type = null)
{
//because this method is reused internally we got to have this $type switch
Expand Down Expand Up @@ -54,31 +53,7 @@ public function parseFromFilename($filename)

return $this->parseFromDataString($dataString, $filename, self::TYPE_FILE);
}

public function parseFromUrl($url)
{
// replace all whitespaces (prevent possible CRLF Injection attacks)
// http://www.acunetix.com/websitesecurity/crlf-injection.htm
$url = preg_replace('/\\s+/', '', $url);

$handle = fopen($url, 'r');

if ($handle === false) {
require_once 'Erfurt/Syntax/RdfParserException.php';
throw new Erfurt_Syntax_RdfParserException("Failed to open file at url '$url'");
}

$dataString = '';

while(!feof($handle)) {
$dataString .= fread($handle, 1024);
}

fclose($handle);

return $this->parseFromDataString($dataString, $url, self::TYPE_URL);
}

public function parseFromDataStringToStore($dataString, $graphUri, $useAc = true)
{
$triples = $this->parseFromDataString($dataString, $graphUri);
Expand All @@ -100,18 +75,7 @@ public function parseFromFilenameToStore($filename, $graphUri, $useAc = true)

return true;
}

public function parseFromUrlToStore($url, $graphUri, $useAc = true)
{
$triples = $this->parseFromUrl($url);

$store = Erfurt_App::getInstance()->getStore();

$store->addMultipleStatements($graphUri, $triples, $useAc);

return true;
}


public function parseNamespacesFromDataString($dataString)
{
return array();
Expand All @@ -121,9 +85,5 @@ public function parseNamespacesFromFilename($filename)
{
return array();
}

public function parseNamespacesFromUrl($url)
{
return array();
}
}

Loading

0 comments on commit 12c2526

Please sign in to comment.