Skip to content

Commit

Permalink
Fix grabber of url when url is on second level dir. More phpunits
Browse files Browse the repository at this point in the history
  • Loading branch information
eldy committed Dec 10, 2017
1 parent 02d9e93 commit e1f0483
Show file tree
Hide file tree
Showing 6 changed files with 328 additions and 30 deletions.
36 changes: 35 additions & 1 deletion htdocs/core/lib/geturl.lib.php
Expand Up @@ -123,7 +123,8 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
$request = curl_getinfo($ch, CURLINFO_HEADER_OUT); // Reading of request must be done after sending request

dol_syslog("getURLContent request=".$request);
dol_syslog("getURLContent response=".$response);
//dol_syslog("getURLContent response =".response); // This may contains binary data, so we dont output it
dol_syslog("getURLContent response size =".strlen($response)); // This may contains binary data, so we dont output it

$rep=array();
if (curl_errno($ch))
Expand Down Expand Up @@ -173,5 +174,38 @@ function getDomainFromURL($url)
$tmpdomain = preg_replace('/\/.*$/i', '', $tmpdomain); // Remove part after domain
$tmpdomain = preg_replace('/\.[^\.]+$/', '', $tmpdomain); // Remove first level domain (.com, .net, ...)
$tmpdomain = preg_replace('/^[^\.]+\./', '', $tmpdomain); // Remove part www. before domain name

return $tmpdomain;
}

/**
 * Return the root URL (scheme and host, without trailing slash) of a full URL.
 * For example: https://www.abc.mydomain.com/dir/page.html returns 'https://www.abc.mydomain.com'
 * For example: http://www.abc.mydomain.com/ returns 'http://www.abc.mydomain.com'
 * For example: https://www.abc.mydomain.com?param=1 returns 'https://www.abc.mydomain.com'
 *
 * @param	string	$url	Full URL.
 * @return	string			Root URL. The http:// or https:// prefix is kept as given; if $url has
 *							no such prefix, the part before the first '/', '?' or '#' is returned as is.
 */
function getRootURLFromURL($url)
{
	$prefix = '';
	$tmpurl = $url;

	// Remember the scheme exactly as it was provided so that it can be restored on the result
	if (preg_match('/^(https?:\/\/)/i', $tmpurl, $reg)) $prefix = $reg[1];

	$tmpurl = preg_replace('/^https?:\/\//i', '', $tmpurl);		// Remove http(s)://
	// The host part ends at the first '/', '?' or '#': a URL such as https://host?x=1 has no
	// slash after the host, so cutting on '/' only would leave the query string in the result.
	$tmpurl = preg_replace('/[\/?#].*$/', '', $tmpurl);			// Remove part after domain

	return $prefix.$tmpurl;
}

/**
 * Remove HTML comments (<!-- ... -->) from a content.
 * Comments may span several lines and may contain '-' characters.
 *
 * @param	string	$content	Text content
 * @return	string				Text without HTML comments. If the regex engine fails, the content is returned unchanged.
 */
function removeHtmlComment($content)
{
	// Non-greedy match with the "s" (dotall) modifier: each comment stops at its own first "-->",
	// can span several lines and can contain dashes (a [^\-]+ body would skip "<!-- a-b -->").
	$cleaned = preg_replace('/<!--.*?-->/s', '', $content);

	// preg_replace returns null on PCRE error; in that case keep the content untouched instead of losing it
	if ($cleaned === null) return $content;

	return $cleaned;
}

37 changes: 33 additions & 4 deletions htdocs/core/lib/website.lib.php
Expand Up @@ -225,7 +225,16 @@ function getAllImages($object, $objectpage, $urltograb, &$tmp, &$action, $modify
{
if (preg_match('/^data:image/i', $regs[2][$key])) continue; // We do nothing for such images

$urltograbbis = $urltograb.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^\//', $regs[2][$key]))
{
$urltograbdirrootwithoutslash = getRootURLFromURL($urltograb);
$urltograbbis = $urltograbdirrootwithoutslash.$regs[2][$key]; // We use dirroot
}
else
{
$urltograbbis = $urltograb.'/'.$regs[2][$key]; // We use dir of grabbed file
}

$linkwithoutdomain = $regs[2][$key];
$filetosave = $conf->medias->multidir_output[$conf->entity].'/image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^http/', $regs[2][$key]))
Expand All @@ -251,7 +260,13 @@ function getAllImages($object, $objectpage, $urltograb, &$tmp, &$action, $modify
if ($tmpgeturl['curl_error_no'])
{
$error++;
setEventMessages($tmpgeturl['curl_error_msg'], null, 'errors');
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['curl_error_msg'], null, 'errors');
$action='create';
}
elseif ($tmpgeturl['http_code'] != '200')
{
$error++;
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['http_code'], null, 'errors');
$action='create';
}
else
Expand Down Expand Up @@ -281,7 +296,15 @@ function getAllImages($object, $objectpage, $urltograb, &$tmp, &$action, $modify
{
if (preg_match('/^data:image/i', $regs[2][$key])) continue; // We do nothing for such images

$urltograbbis = $urltograb.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^\//', $regs[2][$key]))
{
$urltograbdirrootwithoutslash = getRootURLFromURL($urltograb);
$urltograbbis = $urltograbdirrootwithoutslash.$regs[2][$key]; // We use dirroot
}
else
{
$urltograbbis = $urltograb.'/'.$regs[2][$key]; // We use dir of grabbed file
}

$linkwithoutdomain = $regs[2][$key];
$filetosave = $conf->medias->multidir_output[$conf->entity].'/image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
Expand Down Expand Up @@ -309,7 +332,13 @@ function getAllImages($object, $objectpage, $urltograb, &$tmp, &$action, $modify
if ($tmpgeturl['curl_error_no'])
{
$error++;
setEventMessages($tmpgeturl['curl_error_msg'], null, 'errors');
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['curl_error_msg'], null, 'errors');
$action='create';
}
elseif ($tmpgeturl['http_code'] != '200')
{
$error++;
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['http_code'], null, 'errors');
$action='create';
}
else
Expand Down
2 changes: 2 additions & 0 deletions htdocs/langs/en_US/website.lang
Expand Up @@ -54,6 +54,8 @@ OrEnterPageInfoManually=Or create empty page from scratch...
FetchAndCreate=Fetch and Create
ExportSite=Export site
IDOfPage=Id of page
Banner=Banner
BlogPost=Blog post
WebsiteAccount=Web site account
WebsiteAccounts=Web site accounts
AddWebsiteAccount=Create web site account
Expand Down
89 changes: 64 additions & 25 deletions htdocs/website/index.php
Expand Up @@ -255,6 +255,8 @@

if ($urltograb)
{
include_once DOL_DOCUMENT_ROOT.'/core/lib/geturl.lib.php';

// Clean url to grab, so url can be
// http://www.example.com/ or http://www.example.com/dir1/ or http://www.example.com/dir1/aaa
$urltograbwithoutdomainandparam = preg_replace('/^https?:\/\/[^\/]+\/?/i', '', $urltograb);
Expand All @@ -263,24 +265,35 @@
{
$urltograb.='/';
}
$urltograbdirwithoutslash = dirname($urltograb.'.');

include_once DOL_DOCUMENT_ROOT.'/core/lib/geturl.lib.php';
$urltograbdirwithoutslash = dirname($urltograb.'.');
$urltograbdirrootwithoutslash = getRootURLFromURL($urltograbdirwithoutslash);
// Example: $urltograbdirwithoutslash is now https://www.dolimed.com/screenshots
// and $urltograbdirrootwithoutslash is https://www.dolimed.com

$tmp = getURLContent($urltograb);
if ($tmp['curl_error_no'])
{
$error++;
setEventMessages($tmp['curl_error_msg'], null, 'errors');
setEventMessages('Error getting '.$urltograb.': '.$tmp['curl_error_msg'], null, 'errors');
$action='create';
}
elseif ($tmp['http_code'] != '200')
{
$error++;
setEventMessages('Error getting '.$urltograb.': '.$tmp['http_code'], null, 'errors');
$action='create';
}
else
{
// Remove comments
$tmp['content'] = removeHtmlComment($tmp['content']);

preg_match('/<head>(.*)<\/head>/is', $tmp['content'], $reg);
$head = $reg[1];

$objectpage->type_container = 'page';
$objectpage->pageurl = dol_sanitizeFileName(preg_replace('/[\/\.]/','-',$urltograbwithoutdomainandparam));
$objectpage->pageurl = dol_sanitizeFileName(preg_replace('/[\/\.]/','-', preg_replace('/\/+$/', '', $urltograbwithoutdomainandparam)));
if (empty($objectpage->pageurl))
{
$tmpdomain = getDomainFromURL($urltograb);
Expand Down Expand Up @@ -336,10 +349,17 @@
preg_match_all('/<script([^\.>]+)src=["\']([^"\'>]+)["\']([^>]*)><\/script>/i', $objectpage->htmlheader, $regs);
foreach ($regs[0] as $key => $val)
{
dol_syslog("We will grab the resource ".$regs[2][$key]);
dol_syslog("We will grab the resource found into script tag ".$regs[2][$key]);

$linkwithoutdomain = $regs[2][$key];
$urltograbbis = $urltograbdirwithoutslash.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^\//', $regs[2][$key]))
{
$urltograbbis = $urltograbdirrootwithoutslash.$regs[2][$key]; // We use dirroot
}
else
{
$urltograbbis = $urltograbdirwithoutslash.'/'.$regs[2][$key]; // We use dir of grabbed file
}

//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^http/', $regs[2][$key]))
Expand All @@ -362,10 +382,16 @@
if ($tmpgeturl['curl_error_no'])
{
$error++;
setEventMessages($tmpgeturl['curl_error_msg'], null, 'errors');
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['curl_error_msg'], null, 'errors');
$action='create';
}
else
elseif ($tmpgeturl['http_code'] != '200')
{
$error++;
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['http_code'], null, 'errors');
$action='create';
}
else
{
dol_mkdir(dirname($filetosave));
Expand All @@ -389,10 +415,17 @@
preg_match_all('/<link([^\.>]+)href=["\']([^"\'>]+\.css[^"\'>]*)["\']([^>]*)>/i', $objectpage->htmlheader, $regs);
foreach ($regs[0] as $key => $val)
{
dol_syslog("We will grab the resource ".$regs[2][$key]);
dol_syslog("We will grab the resource found into link tag ".$regs[2][$key]);

$linkwithoutdomain = $regs[2][$key];
$urltograbbis = $urltograbdirwithoutslash.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^\//', $regs[2][$key]))
{
$urltograbbis = $urltograbdirrootwithoutslash.$regs[2][$key]; // We use dirroot
}
else
{
$urltograbbis = $urltograbdirwithoutslash.'/'.$regs[2][$key]; // We use dir of grabbed file
}

//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^http/', $regs[2][$key]))
Expand All @@ -414,28 +447,34 @@
if ($tmpgeturl['curl_error_no'])
{
$error++;
setEventMessages($tmpgeturl['curl_error_msg'], null, 'errors');
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['curl_error_msg'], null, 'errors');
$action='create';
}
elseif ($tmpgeturl['http_code'] != '200')
{
$error++;
setEventMessages('Error getting '.$urltograbbis.': '.$tmpgeturl['http_code'], null, 'errors');
$action='create';
}
else
{
//dol_mkdir(dirname($filetosave));
//dol_mkdir(dirname($filetosave));

//$fp = fopen($filetosave, "w");
//fputs($fp, $tmpgeturl['content']);
//fclose($fp);
//if (! empty($conf->global->MAIN_UMASK))
// @chmod($file, octdec($conf->global->MAIN_UMASK));
}
//$fp = fopen($filetosave, "w");
//fputs($fp, $tmpgeturl['content']);
//fclose($fp);
//if (! empty($conf->global->MAIN_UMASK))
// @chmod($file, octdec($conf->global->MAIN_UMASK));

// $filename = 'image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
$pagecsscontent.='/* Content of file '.$urltograbbis.' */'."\n";
// $filename = 'image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
$pagecsscontent.='/* Content of file '.$urltograbbis.' */'."\n";

getAllImages($object, $objectpage, $urltograbbis, $tmpgeturl['content'], $action, 1);
getAllImages($object, $objectpage, $urltograbbis, $tmpgeturl['content'], $action, 1);

$pagecsscontent.=$tmpgeturl['content']."\n";
$pagecsscontent.=$tmpgeturl['content']."\n";

$objectpage->htmlheader = preg_replace('/'.preg_quote($regs[0][$key],'/').'\n*/ims', '', $objectpage->htmlheader);
$objectpage->htmlheader = preg_replace('/'.preg_quote($regs[0][$key],'/').'\n*/ims', '', $objectpage->htmlheader);
}
}

$pagecsscontent.='</style>'."\n";
Expand Down Expand Up @@ -1790,7 +1829,7 @@

if ($action != 'create')
{
print '<tr><td class="titlefield">';
print '<tr><td class="titlefield fieldrequired">';
print $langs->trans('IDOfPage');
print '</td><td>';
print $pageid;
Expand Down Expand Up @@ -1828,7 +1867,7 @@
print '<tr><td class="titlefield fieldrequired">';
print $langs->trans('WEBSITE_TYPE_CONTAINER');
print '</td><td>';
$arrayoftype=array('page'=>$langs->trans("Page"), 'banner'=>$langs->trans("Banner"), 'blogpost'=>$langs->trans("BlogPost"));
$arrayoftype=array('page'=>$langs->trans("Page"), 'banner'=>$langs->trans("Banner"), 'blogpost'=>$langs->trans("BlogPost"), 'other'=>$langs->trans("Other"));
print $form->selectarray('WEBSITE_TYPE_CONTAINER', $arrayoftype, $type_container);
print '</td></tr>';

Expand Down
2 changes: 2 additions & 0 deletions test/phpunit/AllTests.php
Expand Up @@ -93,6 +93,8 @@ public static function suite()
$suite->addTestSuite('MarginsLibTest');
require_once dirname(__FILE__).'/FilesLibTest.php';
$suite->addTestSuite('FilesLibTest');
require_once dirname(__FILE__).'/GetUrlLibTest.php';
$suite->addTestSuite('GetUrlLibTest');
require_once dirname(__FILE__).'/JsonLibTest.php';
$suite->addTestSuite('JsonLibTest');
require_once dirname(__FILE__).'/ImagesLibTest.php';
Expand Down

0 comments on commit e1f0483

Please sign in to comment.