Skip to content

Commit

Permalink
Add exclude functionality to filter links based on array of patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
terwilligergreen committed Jun 5, 2012
1 parent a4d6976 commit cf4684e
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 17 deletions.
31 changes: 21 additions & 10 deletions config.php
Expand Up @@ -8,10 +8,10 @@
/**
* MySQL Connection Settings
*/
$mysql_server = '';
$mysql_user = '';
$mysql_pass = '';
$mysql_db = '';
$mysql_server = 'localhost';
$mysql_user = 'root';
$mysql_pass = 'galgren';
$mysql_db = 'fcc_crawler';

/**
* Local Timezone
Expand All @@ -21,7 +21,7 @@
/**
* Script Timeout (seconds)
*/
$timeout = 180;
$timeout = 380;

/**
*
Expand All @@ -30,7 +30,18 @@
* Example: "www.fcc.gov, broadband.com"
*
*/
$domains = "www.fcc.gov, broadband.com";
$domains = "www.fcc.gov";

/**
*
* Array of regular-expression patterns; a link that matches any of them is excluded from the crawl and the link count.
*
* Example: "/fontsize=/i"
*
*/
$excluded_array = array("/fontsize=/", "/contrast=/","/page=[2-9]+/","/page=1[0-9]+/",
"/related-rss/","/document/","/print\/node/","/gov\/related\//","/gov\/reports\//",
"/gov\/Bureaus/","/gov\/events\/[a-zA-Z0-9]+/","/blog\/[0-9]+/","/ecfs\/comment\//","/fcc-bin\/bye/");

/**
*
Expand All @@ -39,12 +50,12 @@
* Example: "mainsite"
*
*/
$crawl_tag = "mainsites";
$crawl_tag = "mainsite";

/**
* Settings to save html of page into database
*/
$store_local = True; // Set to False to not store
$store_local = True; // Set to False to not store


/**
Expand All @@ -56,9 +67,9 @@
* Check to ensure settings are not defaults
*/

if ($mysql_server == ''|$mysql_user == ''|$mysql_pass==''|$mysql_db=='') die('You must enter MySQL information in config.php before continuing');
if ($mysql_server == ''|$mysql_user == ''|$mysql_pass==''|$mysql_db=='') die('You must enter MySQL information in config.php before continuing');

if ($domains == '') die('You must enter one or more domains in config.php before continuing');
if ($domains == '') die('You must enter one or more domains in config.php before continuing');

/**
* Initiate database connection
Expand Down
7 changes: 6 additions & 1 deletion crawl.php
Expand Up @@ -90,7 +90,12 @@
* Verify that the link target is within our array of domains
*/
if (out_of_domain($link)) continue;


/**
* Verify that the link target is not excluded by a regular-expression pattern match
*/
if (exclude_by_pattern($link)) continue;

/**
* Verify that the link is not a mailto: link
*/
Expand Down
38 changes: 32 additions & 6 deletions includes/functions.php
Expand Up @@ -112,19 +112,45 @@ function is_image($link) {
}

/**
 * Checks whether a link's host is outside the domain/host whitelist.
 *
 * The host is taken from the start of the link: an optional http:// or
 * https:// scheme followed by everything up to the first "/", "?" or "#".
 * That host is then compared, case-insensitively and ignoring surrounding
 * whitespace, against each entry of the global $domain_array.
 *
 * NOTE(review): $domain_array is populated outside this function; it is
 * presumably built from the $domains setting in config.php -- confirm.
 *
 * @param string $link target link
 * @return bool true if out of domain, false if the host is on the whitelist
 */
function out_of_domain($link) {
    global $domain_array;

    // Without a whitelist nothing can be in-domain.
    if (!is_array($domain_array)) {
        return TRUE;
    }

    // Extract the host. Links with no host part (for example a path that
    // starts with "/") do not match and are reported as out of domain.
    if (!preg_match('/^(https?:\/\/)?([^\/?#]+)/i', (string) $link, $matches)) {
        return TRUE;
    }
    $host = $matches[2];

    foreach ($domain_array as $domain) {
        // Whole-host comparison: the whitelisted name must be the host itself,
        // not merely appear somewhere inside the URL.
        if (strcasecmp(trim($domain), $host) === 0) {
            return FALSE;
        }
    }

    return TRUE;
}

/**
 * Checks whether a link matches any pattern in the exclude list.
 *
 * Each entry of the global $excluded_array is used as a PCRE pattern and
 * matched against the URL-decoded link. On the first match a short HTML
 * notice naming the pattern and the decoded link is echoed and the function
 * returns true.
 *
 * @param string $link target link
 * @return bool true if the link matches an exclude pattern, false otherwise
 */
function exclude_by_pattern($link) {
    global $excluded_array;

    // No exclude list configured: nothing can be excluded.
    if (!is_array($excluded_array)) {
        return FALSE;
    }

    // Decode once; every pattern is tested against the same decoded string.
    $decoded = urldecode($link);

    foreach ($excluded_array as $pattern) {
        if (preg_match($pattern, $decoded)) {
            // The link originates from crawled pages, so escape it (and the
            // pattern) before writing it into the HTML output.
            $safe_pattern = htmlspecialchars($pattern, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $safe_link = htmlspecialchars($decoded, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            echo "<p>matched exclude pattern <b>$safe_pattern</b> in $safe_link</p>";
            return TRUE;
        }
    }

    return FALSE;
}

/**
Expand Down

0 comments on commit cf4684e

Please sign in to comment.