Permalink
Browse files

Add exclude functionality to filter links based on array of patterns

  • Loading branch information...
1 parent a4d6976 commit cf4684e7637d4d133c64df32fac08604b4650a69 Greg Elin committed Jun 5, 2012
Showing with 59 additions and 17 deletions.
  1. +21 −10 config.php
  2. +6 −1 crawl.php
  3. +32 −6 includes/functions.php
View
@@ -8,10 +8,10 @@
/**
* MySQL Connection Settings
*/
- $mysql_server = '';
- $mysql_user = '';
- $mysql_pass = '';
- $mysql_db = '';
+ // Connection settings come from the environment so that real credentials
+ // (in particular the database password) are never committed with the source.
+ // getenv() returns false for an unset variable, in which case the fallback on
+ // the right of ?: is used. User and password have no fallback on purpose: an
+ // empty value makes the settings check further down in this file stop the script.
+ $mysql_server = getenv('MYSQL_SERVER') ?: 'localhost';
+ $mysql_user = getenv('MYSQL_USER') ?: '';
+ $mysql_pass = getenv('MYSQL_PASS') ?: '';
+ $mysql_db = getenv('MYSQL_DB') ?: 'fcc_crawler';
/**
* Local Timezone
@@ -21,7 +21,7 @@
/**
* Script Timeout (seconds)
*/
- $timeout = 180;
+ $timeout = 380;
/**
*
@@ -30,7 +30,18 @@
* Example: "www.fcc.gov, broadband.com"
*
*/
-$domains = "www.fcc.gov, broadband.com";
+$domains = "www.fcc.gov";
+
+/**
+ *
+ * Array of regular-expression patterns; a link that matches any of them is excluded from the crawl and the link count.
+ *
+ * Example: "/fontsize=/i"
+ *
+ */
+$excluded_array = array("/fontsize=/", "/contrast=/","/page=[2-9]+/","/page=1[0-9]+/",
+ "/related-rss/","/document/","/print\/node/","/gov\/related\//","/gov\/reports\//",
+ "/gov\/Bureaus/","/gov\/events\/[a-zA-Z0-9]+/","/blog\/[0-9]+/","/ecfs\/comment\//","/fcc-bin\/bye/");
/**
*
@@ -39,12 +50,12 @@
* Example: "mainsite"
*
*/
-$crawl_tag = "mainsites";
+$crawl_tag = "mainsite";
/**
* Settings to save html of page into database
*/
- $store_local = True; // Set to False to not store
+$store_local = True; // Set to False to not store
/**
@@ -56,9 +67,9 @@
* Check to ensure settings are not defaults
*/
- if ($mysql_server == ''|$mysql_user == ''|$mysql_pass==''|$mysql_db=='') die('You must enter MySQL information in config.php before continuing');
+// Stop early when any MySQL setting is still empty. Logical || is used instead of
+// bitwise |: the result is the same, but the intent is explicit and evaluation
+// stops at the first empty setting.
+if ($mysql_server == '' || $mysql_user == '' || $mysql_pass == '' || $mysql_db == '') die('You must enter MySQL information in config.php before continuing');
- if ($domains == '') die('You must enter one or more domains in config.php before continuing');
+// Stop early when no domain has been configured.
+if ($domains == '') die('You must enter one or more domains in config.php before continuing');
/**
* Initiate database connection
View
@@ -90,7 +90,12 @@
* Verify that the link target is within our array of domains
*/
if (out_of_domain($link)) continue;
-
+
+ /**
+ * Verify that the link target does not match one of the exclude patterns
+ */
+ if (exclude_by_pattern($link)) continue;
+
/**
* Verify that the link is not a mailto: link
*/
View
@@ -112,19 +112,45 @@ function is_image($link) {
}
/**
- * Checks to see that a given link is within the domain whitelist
+ * Checks to see that a given link is within the domain/host whitelist
*
- * Note to self: this can be rewritten using a single regex command
+ * The host is extracted from the link with a regular expression and compared,
+ * case-insensitively, against every entry of the global $domain_array. Any
+ * scheme (http, https, ...) or a scheme-relative "//" prefix is accepted; a
+ * port, path, query string or fragment after the host is ignored. A link
+ * without a recognisable host (empty string, relative path) is reported as
+ * out of domain.
*
* @param string $link target link
* @return bool true if out of domain, false if on domain whitelist
*/
function out_of_domain($link) {
global $domain_array;
- foreach ($domain_array as $domain) {
- if (stripos($link,trim($domain)) != FALSE) return false;
- }
- return true;
+
+    // Capture the host: skip an optional "scheme://" or "//" prefix, then take
+    // everything up to the first "/", "?", "#" or ":" (port separator).
+    // A pattern that only knows "http://" would read "https:" as the host and
+    // reject every https link.
+    if (!preg_match('#^(?:(?:[a-z][a-z0-9+.-]*:)?//)?([^/?\#:]+)#i', trim($link), $matches)) {
+        // Nothing that looks like a host, so the link cannot be on the whitelist.
+        return TRUE;
+    }
+    $host = $matches[1];
+
+    foreach ($domain_array as $domain) {
+        // Host names are case-insensitive, and whitelist entries may carry
+        // surrounding whitespace (the removed implementation trimmed them too).
+        if (strcasecmp(trim($domain), $host) === 0) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+/**
+ * Checks whether a given link matches a pattern in the exclude list
+ *
+ * Every entry of the global $excluded_array is used as a complete PCRE
+ * pattern (delimiters included) and tested against the URL-decoded link.
+ * The first match is reported in the HTML output and ends the search.
+ *
+ * @param string $link target link
+ * @return bool true if the link matches an exclude pattern, false if no match
+ */
+function exclude_by_pattern($link) {
+    global $excluded_array;
+
+    // No exclude list configured (e.g. a config.php without this setting):
+    // nothing can be excluded.
+    if (empty($excluded_array) || !is_array($excluded_array)) {
+        return FALSE;
+    }
+
+    // Decode once, outside the loop, so percent-encoded text such as "%3D"
+    // for "=" is matched by the patterns as well.
+    $decoded = urldecode($link);
+    foreach ($excluded_array as $pattern) {
+        if (preg_match($pattern, $decoded)) {
+            // The link is external data harvested by the crawler, so it is
+            // escaped before being written into the HTML output.
+            echo "<p>matched exclude pattern <b>" . htmlspecialchars($pattern, ENT_QUOTES, 'UTF-8') . "</b> in "
+                . htmlspecialchars($decoded, ENT_QUOTES, 'UTF-8') . "</p>";
+            return TRUE;
+        }
+    }
+    return FALSE;
}
/**

0 comments on commit cf4684e

Please sign in to comment.