<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array">
    <added>
      <filename>lib/imw/chunk_store/cached_uri.rb</filename>
    </added>
    <added>
      <filename>lib/imw/chunk_store/scrape.rb</filename>
    </added>
    <added>
      <filename>lib/imw/chunk_store/scrape_request.rb</filename>
    </added>
    <added>
      <filename>lib/imw/chunk_store/tracker.rb</filename>
    </added>
    <added>
      <filename>lib/imw/chunk_store/uri_file_store_junk.rb</filename>
    </added>
    <added>
      <filename>lib/imw/extract/flat_file_parser.rb</filename>
    </added>
    <added>
      <filename>lib/imw/extract/hpricot.rb</filename>
    </added>
    <added>
      <filename>lib/imw/extract/html_parser/match_tree.rb</filename>
    </added>
    <added>
      <filename>lib/imw/extract/html_parser/matcher.rb</filename>
    </added>
    <added>
      <filename>lib/imw/extract/loaddump.rb</filename>
    </added>
    <added>
      <filename>lib/imw/model/source.rb</filename>
    </added>
    <added>
      <filename>lib/imw/transform.rb</filename>
    </added>
    <added>
      <filename>meta/conventions</filename>
    </added>
  </added>
  <modified type="array">
    <modified>
      <diff>@@ -38,7 +38,14 @@ module IMW
     :imw_lib   =&gt; [:imw_root, 'lib'],
 
     # Data
-    :data_root  =&gt; [:super_root, 'data']
+    :data_root  =&gt; [:super_root, 'data'],
+    :ripd_root  =&gt; [:data_root, 'ripd'],
+    :peeld_root =&gt; [:data_root, 'peeld'],
+    :mungd_root =&gt; [:data_root, 'mungd'],
+    :temp_root  =&gt; [:data_root, 'temp'],
+    :fixd_root  =&gt; [:data_root, 'fixd'],
+    :pkgd_root  =&gt; [:data_root, 'pkgd'],
+    :log_root   =&gt; [:data_root, 'log'],
   }
 
   # Default time format.</diff>
      <filename>etc/imwrc.rb</filename>
    </modified>
    <modified>
      <diff>@@ -1,8 +1,6 @@
 require 'imw/dataset/datamapper'
 require 'imw/dataset/link/linkish'
-#
-# A file to process
-#
+
 class LinkAsset
   UUID_INFOCHIMPS_ASSETS_NAMESPACE = UUID.sha1_create(UUID_URL_NAMESPACE, 'http://infochimps.org/assets') unless defined?(UUID_INFOCHIMPS_ASSETS_NAMESPACE)
   include Linkish</diff>
      <filename>lib/imw/chunk_store/chunk.rb</filename>
    </modified>
    <modified>
      <diff>@@ -1,305 +1,5 @@
 # -*- coding: utf-8 -*-
-module Linkish
-  def self.included base
-    base.class_eval do
-      include DataMapper::Resource
-      include Infochimps::Resource
-      property      :id,              Integer,        :serial      =&gt; true
-      property      :full_url,        String,         :length      =&gt; 255,    :nullable =&gt; false,                     :unique_index =&gt; true
-      has_handle
-      alias_method  :handle_generator, :full_url
-      has_time_and_user_stamps
-      #
-      property      :name,            String,         :length      =&gt; 255,    :nullable =&gt; false, :default =&gt; ''
-      #
-      property      :file_path,       String,    :length =&gt; 1024
-      property      :file_time,       DateTime
-      property      :file_size,       Integer
-      property      :file_sha1,       String,    :length =&gt; 40
-      property      :tried_fetch,     DataMapper::Resource::Boolean
-      property      :fetched,         DataMapper::Resource::Boolean
-      #
-      before :create, :make_uuid_and_handle
-      before :create, :update_from_file!
-    end
-    base.extend ClassMethods
-  end
-
-  # ===========================================================================
-  #
-  # Delegate methods to uri
-  #
-  def uri
-    @uri ||= Addressable::URI.parse(self.full_url)
-  end
-  # Dispatch anything else to the aggregated uri object
-  def method_missing method, *args
-    if self.uri.respond_to?(method)
-      self.uri.send(method, *args)
-    else
-      super method, *args
-    end
-  end
-
-  def to_s
-    &quot;&lt;a href='#{self.uri.to_s}'&gt;#{self.name}&lt;/a&gt;&quot; # &lt;-- !! not escaped !!
-  end
-
-  # ===========================================================================
-  #
-  # ID, naming, etc
-  #
-  def normalize_url!
-    u = Addressable::URI.parse(self.full_url).normalize
-    self.full_url = u.to_s
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in FileStore module
-  #
-  #
-  # Refresh cached properties from our copy of the asset.
-  #
-  def update_from_file!
-    self.make_uuid_and_handle # make sure this happened
-    # Set the file path
-    self.file_path = self.to_file_path if self.file_path.blank?
-    # FIXME -- kludge to ripd_root
-    if ! File.exist?(actual_path)
-      self.fetched   = false
-    else
-      self.fetched   = self.tried_fetch = true
-      self.file_size = File.size( actual_path)
-      self.file_time = File.mtime(actual_path)
-    end
-    self.fetched
-  end
-  def actual_path
-    path_to(:ripd_root, self.file_path)
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in own module
-  #
-
-  IMW_WGET_OPTIONS = {
-    :root       =&gt; :ripd_root,
-    :wait       =&gt; 2,
-    :noretry    =&gt; true,
-    :log_level  =&gt; Logger::DEBUG,
-    :clobber    =&gt; false,
-  }
-  #
-  # Fetch from the web
-  #
-  def wget options={}
-    options.reverse_merge! IMW_WGET_OPTIONS
-    cd path_to(options[:root]) do
-      if (not options[:clobber]) &amp;&amp; File.file?(file_path) then
-        IMW.log.add options[:log_level], &quot;Skipping #{file_path}&quot;; return
-      end
-      # Do the fetch
-      mkdir_p File.dirname(actual_path)
-      # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
-      cmd = %Q{wget -nv &quot;#{full_url}&quot; -O&quot;#{actual_path}&quot; --connect-timeout=5 --read-timeout=10 --tries=1 &amp;}
-      IMW.log.add(options[:log_level], cmd)
-      IMW.log.add(options[:log_level], `#{cmd}`)
-      self.tried_fetch = true
-      sleep options[:wait] # please hammer don't hurt em
-      update_from_file!
-      self.save
-      return self.fetched
-    end
-  end
-
-  #
-  #
-  #
-  def contents options={}
-    wget options
-    if fetched
-      File.open actual_path
-    end
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in FileStore
-  #
-
-  protected
-  #
-  # The standard file path for this url's ripped cache
-  #
-  # * leading directory from reverse.dotted.host_scheme:port:user@password
-  # * normalized path/file?query#fragment
-  # * uuid formed from the
-  #
-  def to_file_path
-    file_path_str = &quot;&quot;
-    file_path_str &lt;&lt; to_file_path_root_part
-    file_path_str &lt;&lt; to_file_path_path_part
-    file_path_str &lt;&lt; to_file_path_file_part
-    file_path_str = self.class.path_str_encode(file_path_str)
-    self.class.validate_roundtrip(file_path_str)
-    file_path_str
-  end
-  def file_timestamp
-    file_time.strftime(&quot;%Y%m%d-%H%M%S&quot;)
-  end
-  def to_file_path_with_timestamp
-    to_file_path + file_timestamp
-  end
-  #
-  # revhost_scheme:port:user@password -- omitting _scheme if it's http, and
-  # omitting :port:user@password if all three are blank.
-  #
-  def to_file_path_root_part
-    root_part_str = &quot;&quot;
-    tld_host_frag = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
-    root_part_str &lt;&lt; revhost
-    root_part_str &lt;&lt; &quot;_#{uri.scheme}&quot;                           unless uri.scheme == 'http'
-    root_part_str &lt;&lt; &quot;:#{uri.port}:#{uri.user}@#{uri.password}&quot; unless uri.simple?
-    root_part_str
-  end
-  def to_file_path_path_part
-    uri.path.to_s
-  end
-  def to_file_path_file_part
-    file_path_str = &quot;&quot;
-    file_path_str &lt;&lt; &quot;?#{uri.query}&quot;        unless uri.query.nil?
-    file_path_str &lt;&lt; &quot;##{uri.fragment}&quot;     unless uri.fragment.nil?
-    file_path_str &lt;&lt; &quot;-#{self.uuid}&quot;
-  end
-  public
-
-
-  module ClassMethods
-    #
-    # find_or_creates from url
-    #
-    # url is heuristic_parse'd and normalized by Addressable before lookup:
-    #   &quot;Converts an input to a URI. The input does not have to be a valid URI &#8212;
-    #   the method will use heuristics to guess what URI was intended. This is not
-    #   standards compliant, merely user-friendly.
-    #
-    def find_or_create_from_url url_str
-      link = self.find_or_new_from_url url_str
-      link.save
-      link
-    end
-    def find_or_new_from_url url_str # :nodoc:
-      url_str = Addressable::URI.heuristic_parse(url_str).normalize.to_s
-      link = self.first( :full_url =&gt; url_str ) || self.new( :full_url =&gt; url_str )
-      link.make_uuid_and_handle
-      link.update_from_file!
-      link
-    end
-    def find_or_create_from_file_path ripd_file
-      url_str = Link.url_from_file_path(ripd_file)
-      link = self.first( :full_url =&gt; url_str.to_s ) || self.new( :full_url =&gt; url_str.to_s )
-      link.file_path = ripd_file
-      link.make_uuid_and_handle
-      link.update_from_file!
-      link.save
-      link
-    end
-    #
-    # Decode url from its file_path
-    #
-    def url_from_file_path fp
-      fp = path_str_decode(fp)
-      m = (%r{\A
-            (#{Addressable::URI::HOST_TLD})  # tld tier
-           /(..?)                            # revhost tier
-           /([^/\:_]+)                       # revhost
-        (?:_([^/\:]+))?                      # _scheme
-        (?::(\d*):([^/]*)@([^@/]*?))?        # :port:user@password
-           /(?:(.*?)/)?                      # /dirs/
-            ([^/]*)                          #  file
-           -([a-f0-9]{32})                   # -uuid
-                                \z}x.match(fp))
-      raise &quot;Can't extract url from file path #{fp}&quot; if !m
-      fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, fp_uuid = m.captures
-      fp_host     = fp_host.split('.').reverse.join('.')
-      fp_scheme ||= 'http'
-      fp_pass     = &quot;:#{fp_pass}&quot;             unless fp_pass.blank?
-      fp_userpass = &quot;#{fp_user}#{fp_user}@&quot;   unless fp_user.blank?
-      fp_port     = &quot;:#{fp_port}&quot;             unless fp_port.blank?
-      fp_path     = File.join(*[fp_path, fp_file].compact)
-      &quot;#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}&quot;
-    end
-    #
-    # to control files-per-directory madness, take a path segment like &quot;foobar&quot; in
-    #   blah.com/top/foobar/directory
-    # and transform into
-    #   blah.com/top/fo/foobar/directory
-    #
-    # Ex.
-    #   self.class.tier_path_segment('a_username')
-    #   # =&gt; 'a_/a_username'
-    #   self.class.tier_path_segment('1')
-    #   # =&gt; '1/1'
-    #   self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
-    #   # =&gt; 'com/tw/com.twitter'
-    #
-    def self.tier_path_segment(path_seg, re=/(..?)/)
-      frag_seg = re.match(path_seg).captures
-      raise &quot;Can't tier path_seg #{path_seg} using #{re}&quot; if frag_seg.blank?
-      File.join(* [frag_seg, path_seg].flatten )
-    end
-    #
-    #
-    # It's really bad if you can't roundtrip --
-    # since saving is the rare case (only done once!) we insist on checking.
-    #
-    def self.validate_roundtrip file_path_str
-      # uu = self.class.url_from_file_path(file_path_str)
-      # puts &quot;*&quot;*75, uri.to_hash.inspect, ['path str', file_path_str, 'uri', uri.to_s, 'rt', uu.to_s].inspect
-      return_trip_url = Addressable::URI.parse(self.class.url_from_file_path(file_path_str))
-      raise &quot;crapsticks: uri doesn't roundtrip #{file_path_str} to #{uri.to_s}: #{return_trip_url}&quot; if return_trip_url != uri
-    end
-    #
-    # Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
-    # and without linebreaking or anything. The intent is to reversibly and
-    # recognizably store URLs to disk with names that (apart from path) do not
-    # need to be further escaped in filesystem, URL, database or HTML.
-    #
-    # The only characters in a path_encoded string are alpha-numeric /_-.+
-    #
-    # Rules:
-    # * Any character that is not alphanumeric, and is not /_-.  is encoded as an
-    #   plus sign + followed by its upper-case hex encoding.
-    #
-    # * Furthermore, in any sequence of repeated '.' characters, all after the
-    #   first are hex encoded; same with '/'.
-    #
-    # Ex.
-    #   path_encode(&quot;www.measuringworth.com/datasets/consumer/result.php?use[]=VCB&amp;use[]=CU&amp;use[]=SZ&amp;year_source=1900&amp;year_result=2007&quot;
-    #   # =&gt; www.measuringworth.com/datasets/consumer/result.php+3Fuse+5B+5D+3DVCB+26use+5B+5D+3DCU+26use+5B+5D+3DSZ+26year_source+3D1900+26year_result+3D2007
-    #
-    # Code inspired by &quot;Glenn Parker's response to ruby quiz #23&quot;http://www.rubyquiz.com/quiz23.html
-    #
-    def path_str_encode(str)
-      str.gsub(%r{\.(\.+)}){|chars| '.'+path_encode_chars(chars) }
-      str.gsub(%r{\/(\/+)}){|chars| '/'+path_encode_chars(chars) }
-      str.gsub(%r{[^A-Za-z0-9/_\-\.]+}){|chars| path_encode_chars(chars) }
-    end
-    #
-    # See the notes in path_encode
-    #
-    def path_str_decode(str)
-      str.gsub(/\+([\dA-F]{2})/){ $1.hex.chr }
-    end
-    protected
-    def path_encode_chars(chars) # :nodoc:
-      # send each character to a plus sign followed by its uppercase hex encoding
-      encoded = &quot;&quot;;
-      chars.each_byte{|c| encoded &lt;&lt; &quot;+%02X&quot; % c }
-      encoded
-    end
-    public
+module IMW
+  module UriFileStore
   end
 end</diff>
      <filename>lib/imw/chunk_store/uri_file_store.rb</filename>
    </modified>
    <modified>
      <diff>@@ -31,7 +31,7 @@ module DataMapper
   def self.setup_local_connection options
     options = { :handle =&gt; :default }.merge options
     params = options.values_at(:protocol, :dbpath, :dbname)
-    DataMapper.setup(options[:handle], &quot;%s://%s/%s&quot; % options)
+    DataMapper.setup(options[:handle], &quot;%s://%s/%s&quot; % params)
   end
 
   # KLUDGE
@@ -57,4 +57,10 @@ module DataMapper
     end
     
   end
+
+  # watch SQL log -- must be BEFORE call to db setup
+  def self.logging=(verbosity)
+    verbosity = :debug if (verbosity == true)
+    DataMapper::Logger.new(STDERR, verbosity) if verbosity
+  end
 end</diff>
      <filename>lib/imw/dataset/datamapper.rb</filename>
    </modified>
    <modified>
      <diff>@@ -27,6 +27,7 @@ require 'imw/dataset/task'
 include FileUtils
 
 module IMW
+  include FileUtils
 
   ################################################################
   ## FLIP'S CODE
@@ -126,6 +127,6 @@ module IMW
       @last_description = &quot;Get rid of all traces of this dataset.&quot;
       define_task(IMW::Task, :destroy =&gt; [:delete_data])
     end
-    
+
   end
 end</diff>
      <filename>lib/imw/dataset/scaffold.rb</filename>
    </modified>
    <modified>
      <diff>@@ -10,7 +10,7 @@
 # Copyright:: Copyright (c) 2008 infochimps.org
 # License::   GPL 3.0
 # Website::   http://infinitemonkeywrench.org/
-# 
+#
 
 require 'imw/dataset/scaffold'
 require 'imw/dataset/task'
@@ -33,21 +33,48 @@ module IMW
       create_workflow_tasks
     end
 
+    # Sets the default tasks in this workflow.
+    #
+    # The default tasks constitute a set of consecutive actions that
+    # must be taken in order: &lt;tt&gt;:rip&lt;/tt&gt;, &lt;tt&gt;parse&lt;/tt&gt;,
+    # &lt;tt&gt;munge&lt;/tt&gt;, &lt;tt&gt;fix&lt;/tt&gt;, and &lt;tt&gt;package&lt;/tt&gt;.  Each task
+    # is a &lt;tt&gt;Rake::Task&lt;/tt&gt; which depends on the one before it.
+    #
+    # Each task does nothing by default other than create directories
+    # to hold files for this dataset as it undergoes the workflow.
+    def set_default_tasks
+      define_task(Rake::Task, {:rip =&gt; []})
+      define_task(Rake::Task, {:parse =&gt; :rip})
+      define_task(Rake::Task, {:munge =&gt; :parse})
+      define_task(Rake::Task, {:fix =&gt; :munge})
+      define_task(Rake::Task, {:package =&gt; :fix})
+      comment_default_tasks
+    end
+
+    # Set the initial comments for each of the default tasks.
+    def comment_default_tasks
+      self[:rip].comment = &quot;Rip dataset from an origin&quot;
+      self[:parse].comment = &quot;Parse dataset into intermediate form&quot;
+      self[:munge].comment = &quot;Munge dataset's structure into desired form&quot;
+      self[:fix].comment = &quot;Fix and format dataset&quot;
+      self[:package].comment = &quot;Package dataset into a final format&quot;
+    end
+
     # Creates the task dependency chain &lt;tt&gt;:package =&gt; :fix =&gt; :munge
     # =&gt; :peel =&gt; :rip =&gt; :initialize&lt;/tt&gt;.
     def create_workflow_tasks
       @last_description = &quot;Obtain data from some source.&quot;
       define_task(IMW::Task, :rip     =&gt; [:initialize])
-      @last_description = &quot;Extract datafiles from ripped data.&quot;      
+      @last_description = &quot;Extract datafiles from ripped data.&quot;
       define_task(IMW::Task, :peel    =&gt; [:rip])
-      @last_description = &quot;Transform records in a dataset.&quot;      
+      @last_description = &quot;Transform records in a dataset.&quot;
       define_task(IMW::Task, :munge   =&gt; [:peel])
-      @last_description = &quot;Reconcile records.&quot;      
+      @last_description = &quot;Reconcile records.&quot;
       define_task(IMW::Task, :fix     =&gt; [:munge])
-      @last_description = &quot;Package dataset in final form.&quot;      
+      @last_description = &quot;Package dataset in final form.&quot;
       define_task(IMW::Task, :package =&gt; [:fix])
     end
-      
+
   end
 end
 </diff>
      <filename>lib/imw/dataset/workflow.rb</filename>
    </modified>
    <modified>
      <diff>@@ -17,5 +17,8 @@ require 'imw/utils/paths'
 require 'imw/utils/misc'
 require 'imw/utils/components'
 require 'imw/utils/extensions/core'
+require 'fileutils'
+require 'pathname'
+
 
 # puts &quot;#{File.basename(__FILE__)}: Early economists thought they would measure the utility of an action in units of `utils'.  Really.&quot; # at bottom</diff>
      <filename>lib/imw/utils.rb</filename>
    </modified>
    <modified>
      <diff>@@ -32,5 +32,11 @@ Struct.class_eval do
     self
   end
   alias_method :update, :merge!
+  def indifferent_merge  *args, &amp;block
+    self.dup.indifferent_merge! *args
+  end
+  def indifferent_merge! hashlike, &amp;block
+    merge! hashlike.reject{|k,v| ! self.members.include?(k.to_s) }
+  end
 
 end</diff>
      <filename>lib/imw/utils/extensions/struct.rb</filename>
    </modified>
    <modified>
      <diff>@@ -14,7 +14,6 @@
 # License::   GPL 3.0
 # Website::   http://infinitemonkeywrench.org/
 #
-require 'pathname'
 
 module IMW
 </diff>
      <filename>lib/imw/utils/paths.rb</filename>
    </modified>
    <modified>
      <diff>@@ -1,5 +1,6 @@
-require 'imw/dataset/uri/file_store'
-require 'imw/dataset/uuid'
+require 'imw/utils'
+require 'imw/utils/uuid'
+require 'addressable/uri'
 module Addressable
   #
   # Add the #scrubbed and #revhost calls
@@ -12,30 +13,23 @@ module Addressable
     HOST_HEAD     = '(?:[a-z0-9\-]+\.)+'
     HOST_TLD      = '(?:[a-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
 
-    def path_with_strip
-      path_str = path_without_strip
-      return '/' if path_str.blank?
-      path_str.gsub!(%r{([^/])/+$},'\1')
-      @path = path_str
-    end
-    alias_method_chain :path, :strip unless defined?(path_without_strip)
-
     def host_valid?
       !!(host =~ %r{\A#{HOST_HEAD}#{HOST_TLD}\z}i)
     end
-
     def path_valid?
       !!(path =~ %r{\A[#{PATH_CHARS}%]*\z})
     end
+    def simple_connection_part?
+      ( ['http', nil].include?(scheme) &amp;&amp;
+        [80,     nil].include?(port) &amp;&amp;
+        (self.to_hash.values_at(:password, :user).join.blank?) )
+    end
 
     #
-    # can the uri be reproduced from its scrubbed representation?
+    # Does this look like a
     #
     def simple?
-      host_valid? &amp;&amp;
-        path_valid? &amp;&amp;
-        (scheme == 'http' &amp;&amp; port == 80)  &amp;&amp;
-        self.to_hash.values_at(:password, :user).join.blank?
+      host_valid? &amp;&amp; path_valid? &amp;&amp; simple_connection_part?
     end
 
     #
@@ -59,3 +53,7 @@ module Addressable
   end
 end
 
+class &lt;&lt; Addressable::URI
+  alias_method :encode_segment,   :encode_component    if ! defined?(encode_segment)
+  alias_method :unencode_segment, :unencode_component  if ! defined?(unencode_segment)
+end</diff>
      <filename>lib/imw/utils/uri.rb</filename>
    </modified>
  </modified>
  <removed type="array">
    <removed>
      <filename>lib/imw/chunk_store/scraper.rb</filename>
    </removed>
    <removed>
      <filename>lib/tasks/#pool.rake#</filename>
    </removed>
  </removed>
  <parents type="array">
    <parent>
      <id>d5568e3b0b1b6a4e9202f2a6201357dc3d69ff75</id>
    </parent>
    <parent>
      <id>6e3b7f11fba601b1e924c56b6580ba3d4c9216a7</id>
    </parent>
  </parents>
  <author>
    <name>Dhruv Bansal</name>
    <email>dhruv@ph.utexas.edu</email>
  </author>
  <url>http://github.com/infochimps/imw/commit/11bafb60ea39dfd7d0cbdcde0af8ffeb1ab02efd</url>
  <id>11bafb60ea39dfd7d0cbdcde0af8ffeb1ab02efd</id>
  <committed-date>2009-09-27T13:33:35-07:00</committed-date>
  <authored-date>2009-09-27T13:33:35-07:00</authored-date>
  <message>Merge branch 'master' of git@github.com:infochimps/imw

Conflicts:
	etc/imwrc.rb
	lib/imw.rb
	lib/imw/dataset/workflow.rb</message>
  <tree>3be62b74f32110f6b5ae0e5e32c18b9e6c627e3e</tree>
  <committer>
    <name>Dhruv Bansal</name>
    <email>dhruv@ph.utexas.edu</email>
  </committer>
</commit>
