<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array"/>
  <modified type="array">
    <modified>
      <diff>@@ -202,7 +202,7 @@ class Fetcher(object):
         if not self.is_typechecked:
             data = open(self.filename, 'r').read()
             if data:
-                if not filetype.has_urls(data):
+                if not filetype.has_urls(data, self.url):
                     self.throw_type_error()
                 self.is_typechecked = True
 </diff>
      <filename>fetch.py</filename>
    </modified>
    <modified>
      <diff>@@ -17,10 +17,10 @@ def is_html(data):
     if data and re.match(_html_re, data):
         return True
 
-def has_urls(data):
+def has_urls(data, url):
     if data: 
         try:
-            spider.findall(data).next()
+            spider.findall(data, url).next()
             return True
         except StopIteration:
             pass</diff>
      <filename>filetype.py</filename>
    </modified>
    <modified>
      <diff>@@ -7,10 +7,9 @@ import urllib
 
 import io
 import shcolor
+import urlrewrite
 
 
-SPIDER_SCHEMES = [&quot;ftp&quot;, &quot;http&quot;, &quot;https&quot;]
-
 testcases = &quot;&quot;&quot;\
 &lt;a href=&quot;http://1host/path&quot;&gt;
 &lt;a href=&quot;http://2host/path&quot; &gt;
@@ -40,9 +39,22 @@ IMG = re.compile(_img)
 _uri_match = &quot;&quot;&quot;(?ims)(?P&lt;url&gt;[a-z][a-z0-9+.-]{1,120}:\/\/(([a-z0-9$_.+!*,;\/?:@&amp;~(){}\[\]=-])|%[a-f0-9]{2}){1,333}([a-z0-9][a-z0-9 $_.+!*,;\/?:@&amp;~(){}\[\]=%-]{0,1000})?)&quot;&quot;&quot;
 URI_MATCH = re.compile(_uri_match)
 
+#-rw-r--r--    1 1042     1042     28620269 Apr 19  2007 stage1-x86-2007.0.tar.bz2
+_ftp_listing = &quot;&quot;&quot;.[^ ]{9}(?:\s+[^ ]+){7}\s+(?P&lt;url&gt;.*)$&quot;&quot;&quot;
+FTP_LISTING = re.compile(_ftp_listing)
+
 def find_with_r(r, s):
     return re.finditer(r, s)
 
+def spider_ftp(s):
+    lines = s.splitlines()
+    filler = &quot;&quot;
+    for line in lines:
+        it = re.finditer(FTP_LISTING, filler+line)
+        filler += (2+len(line))*&quot; &quot;
+        for match in it:
+            yield match
+
 def spider(s):
     for it in [find_with_r(r, s) for r in (LINK, FRAME, IMG)]:
         for match in it:
@@ -51,8 +63,10 @@ def spider(s):
 def harvest(s):
     return find_with_r(URI_MATCH, s)
 
-def findall(s):
+def findall(s, url):
     its = [spider(s), harvest(s)]
+    if urlrewrite.get_scheme(url) == &quot;ftp&quot;:
+        its.append(spider_ftp(s))
     for (idx, it) in enumerate(its):
         for match in it: 
             yield match
@@ -62,7 +76,7 @@ def unbox_it_to_ss(it):
         yield match.group('url')
 
 def group_by_regex(s):
-    its = [spider(s), harvest(s)]
+    its = [spider(s), harvest(s), spider_ftp(s)]
     for (idx, it) in enumerate(its):
         for match in it: 
             yield (idx, match)
@@ -133,11 +147,12 @@ if __name__ == &quot;__main__&quot;:
         if opts.test:
             data = testcases
         else:
-            data = urllib.urlopen(args[0]).read()
+            url = args[0]
+            data = urllib.urlopen(url).read()
 
         if opts.dump:
-            for url in unique(unbox_it_to_ss(findall(data))):
-                print url
+            for u in unique(unbox_it_to_ss(findall(data, url))):
+                print u
         else:
             print colorize_shell(data)
     except IndexError:</diff>
      <filename>spider.py</filename>
    </modified>
    <modified>
      <diff>@@ -117,7 +117,7 @@ def process_records(queue, rule, wb):
 
             if record.get(&quot;mode&quot;) == fetch.Fetcher.SPIDER:
                 data = open(filename, 'r').read()
-                urls = spider.unbox_it_to_ss(spider.findall(data))
+                urls = spider.unbox_it_to_ss(spider.findall(data, url))
                 urls = urlrewrite.rewrite_urls(url, urls)
 
                 (newqueue, wb) = qualify_urls(url, urls, rule, newqueue, wb)</diff>
      <filename>spiderfetch.py</filename>
    </modified>
    <modified>
      <diff>@@ -4,10 +4,10 @@ import os
 import re
 import urlparse
 
-import spider
 
+SCHEMES = [&quot;ftp&quot;, &quot;http&quot;, &quot;https&quot;]
 
-_scheme = &quot;(?P&lt;scheme&gt;%s)$&quot; % &quot;&quot;.join(reduce(lambda x, y: &quot;%s|%s&quot; % (x, y), spider.SPIDER_SCHEMES))
+_scheme = &quot;(?P&lt;scheme&gt;%s)$&quot; % &quot;&quot;.join(reduce(lambda x, y: &quot;%s|%s&quot; % (x, y), SCHEMES))
 scheme_regex = re.compile(_scheme)
 
 class InvalidUrl(Exception): pass
@@ -32,6 +32,10 @@ def get_hostname(url):
     pack = urlparse.urlsplit(url)
     return pack.hostname
 
+def get_scheme(url):
+    pack = urlparse.urlsplit(url)
+    return pack.scheme
+
 def rewrite_urls(origin_url, urls):
     origin_pack = urlparse.urlsplit(origin_url)
     for u in urls:</diff>
      <filename>urlrewrite.py</filename>
    </modified>
  </modified>
  <removed type="array"/>
  <parents type="array">
    <parent>
      <id>5fcb044cf5b4c43bde858e59d3acc6de87b8092d</id>
    </parent>
  </parents>
  <author>
    <name>Martin Matusiak</name>
    <email>numerodix@gmail.com</email>
  </author>
  <url>http://github.com/numerodix/spiderfetch/commit/3556b280d07ca9b019b5c41ca6690998c6dd5a07</url>
  <id>3556b280d07ca9b019b5c41ca6690998c6dd5a07</id>
  <committed-date>2008-06-27T11:56:29-07:00</committed-date>
  <authored-date>2008-06-27T11:56:29-07:00</authored-date>
  <message>add pattern for matching ftp dir listings</message>
  <tree>111577e764718537c1801ef4fa09f8afd1827bb2</tree>
  <committer>
    <name>Martin Matusiak</name>
    <email>numerodix@gmail.com</email>
  </committer>
</commit>
