<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array"/>
  <modified type="array">
    <modified>
      <diff>@@ -1,2 +1,2 @@
 
--record(task, {url}).
+-record(task, {url,sandboxRegex=&quot;&quot;,depth=-1}).</diff>
      <filename>include/task.hrl</filename>
    </modified>
    <modified>
      <diff>@@ -109,17 +109,21 @@ code_change(_OldVsn, State, _Extra) -&gt;
 
 process_task(Task) -&gt;
     Url = Task#task.url,
+
     io:format(&quot;~p Processing ~p~n&quot;, [self(), Url]),
 
-    {http, Host, _Port, _File} = parse(Url),
+    {http, Host, Port, File} = parse(Url),
 
     case http:request(get, {Url, ?HEADERS(Host)},
                       [], [{body_format, string}]) of
         {ok, {{_Version, 200, _Reason}, _Headers, Body}} -&gt;
             Parsed = mochiweb_html:parse(Body),
 	  
-            Links = extract_document_links(Parsed),
+	    % Extracts all links from the document and ensures they
+	    % are within the sandbox
+            Links = filter_regex(extract_document_links(Parsed, {http, Host, Port, File}), Task#task.sandboxRegex),
 
+	   
             %DocumentText = clean_document(Parsed),
             %TermFrequencies = clustering:term_frequencies(DocumentText),
 
@@ -134,11 +138,11 @@ process_task(Task) -&gt;
             #result{status=failure, code=Other}
     end.
 
-extract_document_links(Html) -&gt;
+extract_document_links(Html, {http, Host, Port, File}) -&gt;
     BinaryLinks = lists:flatten(extract_links(Html)),
     StringLinks = lists:map(fun(X) -&gt; binary_to_list(X) end,
                             BinaryLinks),
-    CleanedLinks = clean_links(StringLinks),
+    CleanedLinks = clean_links(StringLinks, {http, Host, Port, File}),
     
     lists:filter(fun(dud) -&gt; false;
                     (_X) -&gt; true
@@ -164,16 +168,19 @@ extract_links([_Head|Tail]) -&gt; extract_links(Tail);
 extract_links(X) when is_binary(X) -&gt; [];
 extract_links([]) -&gt; [];
 extract_links(X) -&gt;
-    io:format(&quot;DEBUG: extract_links(~p)~n&quot;, [X]),
+%    io:format(&quot;DEBUG: extract_links(~p)~n&quot;, [X]),
     [].
 
-clean_links(Links) -&gt;
+clean_links(Links, {http, Host, Port, File}) -&gt;
     lists:map(fun(&quot;&quot;) -&gt; dud; % Probably an AJAX link.
                  (&quot;javascript:&quot; ++ _Tail) -&gt; dud; % Don't care about JS
                  (&quot;http:&quot;) -&gt; dud; % These appear on Google pages sometimes...
                  (&quot;#&quot;) -&gt; dud; % Ignore page-local links
-%                 (&quot;/&quot; ++ Tail) -&gt; % Site-relative URL, convert to absolute
-%                      &quot;http://&quot; ++ Host ++ &quot;/&quot; ++ Tail;
+                 (&quot;/&quot; ++ Tail) -&gt; % Site-relative URL, convert to absolute
+		      case Port of
+			  80 -&gt;&quot;http://&quot; ++ Host ++ &quot;/&quot; ++ Tail;
+			  _ -&gt;&quot;http://&quot; ++Host++&quot;:&quot;+ integer_to_list(Port) ++ &quot;/&quot; ++ Tail
+		      end;   
                  (&quot;mailto:&quot; ++ _Tail) -&gt; dud; % Mail links... don't care!
                  (&quot;http://&quot; ++ Tail) -&gt; &quot;http://&quot; ++ Tail;
                  (Other) -&gt;
@@ -235,3 +242,16 @@ scan_for_slash(X) -&gt;
 	$/ -&gt; lists:reverse(Rest);
 	_  -&gt; scan_for_slash(Rest)
     end.
+
+
+% Filters a list for all items that match a given regular expression. 
+% Items with no match are discarded
+filter_regex(ItemList, Regex) -&gt;
+    lists:filter(fun(Item) -&gt;
+		   case regexp:first_match(Item, Regex) of
+		       {match, _, _} -&gt; true;
+		       _ -&gt; false
+		   end
+	   end,
+	   ItemList).
+</diff>
      <filename>src/fetcher.erl</filename>
    </modified>
    <modified>
      <diff>@@ -102,5 +102,5 @@ report_stats() -&gt;
     io:format(&quot;~nSTATS: l(WorkerQueue) = ~p, l(TaskQueue) = ~p&quot;,
               [WorkerQueueLength, TaskQueueLength]),
 
-    timer:apply_after(1000, ?MODULE, report_stats, []),
+    timer:apply_after(10000, ?MODULE, report_stats, []),
     ok.</diff>
      <filename>src/stats_collector.erl</filename>
    </modified>
    <modified>
      <diff>@@ -100,9 +100,12 @@ handle_cast({post_result, Task, Result}, State) -&gt;
     lists:foreach(fun(T) -&gt;
                           case get(T) of
                               undefined -&gt;
-                                  insert_task(T),
-                                  put(T, new);
-                              
+				  % Insert a new task with each URL, retaining other params
+                                  if Task#task.depth /= 1 -&gt;
+					  insert_task(Task#task{url=T, depth=Task#task.depth - 1}),
+					  put(T, new);
+				      true -&gt; ok
+				  end;
                               _Other -&gt;
                                   % Already got this task
                                   ok
@@ -118,18 +121,19 @@ handle_cast({post_result, Task, Result}, State) -&gt;
 
 handle_cast({insert_task, Task}, State) -&gt;
     case queue:is_empty(State#state.worker_queue) of
-        true -&gt;
-            {noreply,
-             State#state{task_queue=queue:in_r(Task,
-                                               State#state.task_queue)}};
-        false -&gt;
-            {{value, Worker}, NewWorkerQueue} =
-             queue:out(State#state.worker_queue),
-
-            gen_server:cast(Worker, {task, Task}),
-            {noreply, State#state{worker_queue=NewWorkerQueue}}
+	true -&gt;
+	    {noreply,
+	     State#state{task_queue=queue:in_r(Task,
+					       State#state.task_queue)}};
+	false -&gt;
+	    {{value, Worker}, NewWorkerQueue} =
+		queue:out(State#state.worker_queue),
+	    
+	    gen_server:cast(Worker, {task, Task}),
+	    {noreply, State#state{worker_queue=NewWorkerQueue}}
     end.
-             
+
+    
 
 %%--------------------------------------------------------------------
 %% Function: handle_info(Info, State) -&gt; {noreply, State} |</diff>
      <filename>src/task_master.erl</filename>
    </modified>
  </modified>
  <removed type="array"/>
  <parents type="array">
    <parent>
      <id>9c7cfc7c9495f63912e2f506305b139056ca3d51</id>
    </parent>
  </parents>
  <author>
    <name>xanados</name>
    <email>bjterry@Benjamin-Terrys-MacBook-Pro-15.local</email>
  </author>
  <url>http://github.com/michaelmelanson/spider/commit/83c8512d2a8d8295401e1e218545479fff6b5c16</url>
  <id>83c8512d2a8d8295401e1e218545479fff6b5c16</id>
  <committed-date>2008-09-06T17:36:34-07:00</committed-date>
  <authored-date>2008-09-06T17:36:34-07:00</authored-date>
  <message>Added support for control of search depth and sandboxing based on regular expression. To test the features, use the following commands:

rr(&quot;include/task.hrl&quot;). % You need this to access task records from the erl shell
task_master:insert_task(#task{url=&quot;http://www.example.com/&quot;, sandboxRegex=&quot;example.com&quot;, depth=5}).

By default, no sandbox is specified (sandboxRegex=&quot;&quot;) and the search is unbounded (depth=-1). A depth of 1 means only the initial page is grabbed.

Also added partial support for host-relative links of the form &quot;/link&quot; (as opposed to directory-relative links). Supporting directory-relative links will probably require full parsing of the URL.

I also reduced how often the status output is printed: it now updates every 10 seconds instead of every second.

X</message>
  <tree>ea59502a9e4829cde2835960802f7813093db509</tree>
  <committer>
    <name>xanados</name>
    <email>bjterry@Benjamin-Terrys-MacBook-Pro-15.local</email>
  </committer>
</commit>
