<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array">
    <added>
      <filename>src/.DS_Store</filename>
    </added>
    <added>
      <filename>src/url_parse.erl</filename>
    </added>
  </added>
  <modified type="array">
    <modified>
      <diff>@@ -7,6 +7,7 @@
   &quot;src/task_master&quot;,
   &quot;src/stats_collector&quot;,
   &quot;src/mochiweb_html&quot;,
-  &quot;src/mochiweb_charref&quot;],
+  &quot;src/mochiweb_charref&quot;,
+  &quot;src/url_parse&quot;],
  [{i, &quot;include&quot;},
   {outdir, &quot;ebin&quot;}]}.</diff>
      <filename>Emakefile</filename>
    </modified>
    <modified>
      <diff>@@ -7,7 +7,8 @@
              fetcher_sup,
              fetcher,
              stats_collector,
-             task_master]},
+             task_master,
+	     url_parse]},
   {registered, [spider_sup,
                 fetcher_sup,
                 stats_collector,</diff>
      <filename>spider.app</filename>
    </modified>
    <modified>
      <diff>@@ -16,6 +16,9 @@
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
          terminate/2, code_change/3]).
 
+%% Internal API
+-export([clean_links/2]).
+
 -include(&quot;task.hrl&quot;).
 -include(&quot;result.hrl&quot;).
 
@@ -112,16 +115,18 @@ process_task(Task) -&gt;
 
     io:format(&quot;~p Processing ~p~n&quot;, [self(), Url]),
 
-    {http, Host, Port, File} = parse(Url),
+    {http, Host, Port, File} = url_parse:parse(Url),
 
     case http:request(get, {Url, ?HEADERS(Host)},
                       [], [{body_format, string}]) of
         {ok, {{_Version, 200, _Reason}, _Headers, Body}} -&gt;
             Parsed = mochiweb_html:parse(Body),
-	  
+	    
+	    % io:format(&quot;testing&quot;),
+
 	    % Extracts all links from the document and ensures they
 	    % are within the sandbox
-            Links = filter_regex(extract_document_links(Parsed, {http, Host, Port, File}), Task#task.sandboxRegex),
+            Links = filter_regex(extract_document_links(Parsed, Task#task.url), Task#task.sandboxRegex),
 
 	   
             %DocumentText = clean_document(Parsed),
@@ -138,11 +143,12 @@ process_task(Task) -&gt;
             #result{status=failure, code=Other}
     end.
 
-extract_document_links(Html, {http, Host, Port, File}) -&gt;
+extract_document_links(Html, URL) -&gt;
     BinaryLinks = lists:flatten(extract_links(Html)),
     StringLinks = lists:map(fun(X) -&gt; binary_to_list(X) end,
                             BinaryLinks),
-    CleanedLinks = clean_links(StringLinks, {http, Host, Port, File}),
+    % io:format(&quot;here&quot;),
+    CleanedLinks = clean_links(StringLinks, URL),
     
     lists:filter(fun(dud) -&gt; false;
                     (_X) -&gt; true
@@ -168,25 +174,26 @@ extract_links([_Head|Tail]) -&gt; extract_links(Tail);
 extract_links(X) when is_binary(X) -&gt; [];
 extract_links([]) -&gt; [];
 extract_links(X) -&gt;
-%    io:format(&quot;DEBUG: extract_links(~p)~n&quot;, [X]),
+%%    io:format(&quot;DEBUG: extract_links(~p)~n&quot;, [X]),
     [].
 
-clean_links(Links, {http, Host, Port, File}) -&gt;
-    lists:map(fun(&quot;&quot;) -&gt; dud; % Probably an AJAX link.
-                 (&quot;javascript:&quot; ++ _Tail) -&gt; dud; % Don't care about JS
-                 (&quot;http:&quot;) -&gt; dud; % These appear on Google pages sometimes...
-                 (&quot;#&quot;) -&gt; dud; % Ignore page-local links
-                 (&quot;/&quot; ++ Tail) -&gt; % Site-relative URL, convert to absolute
-		      case Port of
-			  80 -&gt;&quot;http://&quot; ++ Host ++ &quot;/&quot; ++ Tail;
-			  _ -&gt;&quot;http://&quot; ++Host++&quot;:&quot;+ integer_to_list(Port) ++ &quot;/&quot; ++ Tail
-		      end;   
-                 (&quot;mailto:&quot; ++ _Tail) -&gt; dud; % Mail links... don't care!
-                 (&quot;http://&quot; ++ Tail) -&gt; &quot;http://&quot; ++ Tail;
-                 (Other) -&gt;
-                      io:format(&quot;Dud link: ~p~n&quot;, [Other]),
-                      dud
-              end, Links).
+
+%% Updated to more cleanly handle link filtering. Specifically excludes
+%% fully qualified links that don't use the http scheme and calls 'qualify/2'
+%% to mediate domain absolute, relative, and fully qualified links. There
+%% are still many sources of false positives, such as links with anchor 
+%% fragments, and perhaps I'll add more special casing, but ultimately
+%% we need a full and complete url parser
+clean_links(Links, SourceLink) -&gt; 
+    lists:map(fun(Link) -&gt; 
+	IsColonFree = string:chr(Link,$:) == 0,
+	case string:rstr(Link,&quot;http://&quot;) of
+	    0 when IsColonFree -&gt; url_parse:qualify(SourceLink, Link);
+	    1 -&gt; Link;
+	    _ -&gt; dud
+	end
+    end,Links).
+
 
 clean_document(List) when is_list(List) -&gt;
     lists:map(fun clean_document/1, List);
@@ -194,56 +201,6 @@ clean_document({_Tag, _Attrs, Contents}) -&gt; clean_document(Contents);
 clean_document(Text) when is_binary(Text) -&gt; Text;
 clean_document(_) -&gt; [].
 
-parse([$h,$t,$t,$p,$:,$/,$/|T]) -&gt;  parse_http(T);
-parse([$f,$t,$p,$:,$/,$/|_T])    -&gt;  {error, no_ftp};
-parse([$f,$i,$l,$e,$:,$/,$/|F]) -&gt;  {file, F};
-parse(_X)                        -&gt;  {error, unknown_url_type}.
-
-parse_http(X) -&gt;
-    case string:chr(X, $/) of
-        0 -&gt;
-            %% not terminated by &quot;/&quot; (sigh)
-            %% try again
-            parse_http(X ++ &quot;/&quot;);
-        N -&gt;
-            %% The Host is up to the first &quot;/&quot;
-            %% The file is everything else
-            Host = string:substr(X, 1, N-1),
-            File = string:substr(X, N, length(X)),
-            %% Now check to see if the host name contains a colon
-            %% i.e. there is an explicit port address in the hostname
-            case string:chr(Host, $:) of
-                0 -&gt;
-                    %% no colon
-                    Port = 80,
-                    {http, Host, Port, File};
-                M -&gt;
-                    Site = string:substr(Host,1,M-1),
-                    case (catch list_to_integer(
-                                  string:substr(Host, M+1, length(Host)))) of
-                        {'EXIT', _} -&gt;
-                            {http, Site, 80, File};
-                        Port -&gt;
-                            {http, Site, Port, File}
-                    end
-            end
-    end.
-
-% In order to assemble fully qualified links from relative ones, we need
-% to figure out the base href of a document, given a link; this is a naive
-% and buggy skeleton I'm committing at 2:41a :)
-get_base_href(Url) -&gt; 
-    X = lists:reverse(Url),
-    scan_for_slash(X).
-
-scan_for_slash(X) -&gt;
-    [FirstChar|Rest] = X,
-    case FirstChar of
-	$/ -&gt; lists:reverse(Rest);
-	_  -&gt; scan_for_slash(Rest)
-    end.
-
-
 % Filters a list for all items that match a given regular expression. 
 % Items with no match are discarded
 filter_regex(ItemList, Regex) -&gt;</diff>
      <filename>src/fetcher.erl</filename>
    </modified>
  </modified>
  <removed type="array"/>
  <parents type="array">
    <parent>
      <id>83c8512d2a8d8295401e1e218545479fff6b5c16</id>
    </parent>
  </parents>
  <author>
    <name>Michael Terry</name>
    <email>formido@gmail.com</email>
  </author>
  <url>http://github.com/michaelmelanson/spider/commit/00d5dd985113fb0731b20c3ea8d6b5380351043e</url>
  <id>00d5dd985113fb0731b20c3ea8d6b5380351043e</id>
  <committed-date>2008-09-07T13:28:06-07:00</committed-date>
  <authored-date>2008-09-07T13:28:06-07:00</authored-date>
  <message>Updated to handle common cases of URL parsing better

Added code that attempts to match only valid http links. This is more useful at this stage of development, although we may allow more schemes once a more complete URL parser is in place. Added the url_parse module, borrowing and slightly expanding on http_tools by Joe Armstrong, which attempts to create fully qualified links from relative and domain-absolute paths. It still does not deal with fragment anchors or dots ('.') in URLs, for example; for those we need a URL parsing library that does canonicalization.</message>
  <tree>a5930110fc229cfffd8c709c6e1fcd2521e5f34d</tree>
  <committer>
    <name>Michael Terry</name>
    <email>formido@gmail.com</email>
  </committer>
</commit>
