<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array">
    <added>
      <filename>README-organization.txt</filename>
    </added>
    <added>
      <filename>lib/imw/chunk_store/chunk.rb</filename>
    </added>
    <added>
      <filename>lib/imw/chunk_store/scraper.rb</filename>
    </added>
    <added>
      <filename>lib/imw/chunk_store/uri_file_store.rb</filename>
    </added>
  </added>
  <modified type="array">
    <modified>
      <diff>@@ -1,4 +1,185 @@
 require 'imw/extract/hpricot'
+
+#
+# h4. HTML Extractor
+#
+# * map repeating HTML elements to intermediate ruby data structure
+# * optimize all the common cases for expressive brevity
+# * output structure will come from HTML structure; map to desired output objects in transform stage.
+# * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
+#
+# If this doesn't yield satisfaction you may enjoy
+# * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
+# * http://scrubyt.org/
+# Note of course that these have quite different goals.  For example, we don't
+# have any interest in &quot;interactive&quot; crawling, eg form submission, or at least
+# that goes elsewhere.
+#
+#
+# == Sample HTML (http://twitter.com):
+#
+#   &lt;ul class=&quot;about vcard entry-author&quot;&gt;
+#     &lt;li         &gt;&lt;span class=&quot;label&quot;&gt;Name&lt;/span&gt;     &lt;span class=&quot;fn&quot; &gt;MarsPhoenix       &lt;/span&gt; &lt;/li&gt;
+#     &lt;li         &gt;&lt;span class=&quot;label&quot;&gt;Location&lt;/span&gt; &lt;span class=&quot;adr&quot;&gt;Mars, Solar System&lt;/span&gt; &lt;/li&gt;
+#     &lt;li id=&quot;bio&quot;&gt;&lt;span class=&quot;label&quot;&gt;Bio&lt;/span&gt;      &lt;span class=&quot;bio&quot;&gt;I dig Mars!       &lt;/span&gt; &lt;/li&gt;
+#     &lt;li         &gt;&lt;span class=&quot;label&quot;&gt;Web&lt;/span&gt;
+#        &lt;a href=&quot;http://tinyurl.com/5wwaru&quot; class=&quot;url&quot; rel=&quot;me nofollow&quot;&gt;http://tinyurl.co...&lt;/a&gt;&lt;/li&gt;
+#   &lt;/ul&gt;
+#
+# == Parser Spec:
+#   :hcard        =&gt; m_one('//ul.vcard.about',
+#     {
+#       :name     =&gt; 'li/span.fn',
+#       :location =&gt; 'li/span.adr',
+#       :url      =&gt; m_attr('li/a.url[@href]', 'href'),
+#       :bio      =&gt; 'li#bio/span.bio',
+#     }
+#   )
+#
+# == Example return:
+#   { :hcard =&gt; { :name =&gt; 'MarsPhoenix', :location =&gt; 'Mars, Solar System', :bio =&gt; 'I dig Mars!', :url =&gt; 'http://tinyurl.com/5wwaru' } }
+#
+# == Sample HTML (http://delicious.com):
+#   &lt;ul id=&quot;bookmarklist&quot; class=&quot;bookmarks NOTHUMB&quot;&gt;
+#     &lt;li class=&quot;post&quot; id=&quot;item-...&quot;&gt;
+#       &lt;div class=&quot;bookmark NOTHUMB&quot;&gt;
+#         &lt;div class=&quot;dateGroup&quot;&gt;         &lt;span title=&quot;23 APR 08&quot;&gt;23 APR 08&lt;/span&gt;     &lt;/div&gt;
+#         &lt;div class=&quot;data&quot;&gt;
+#           &lt;h4&gt;                          &lt;a rel=&quot;nofollow&quot; class=&quot;taggedlink&quot; href=&quot;http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm&quot;&gt;Blog Authorship Corpus (Blogger.com 1994)&lt;/a&gt;
+#                                         &lt;a class=&quot;inlinesave&quot; href=&quot;...&quot;&gt;SAVE&lt;/a&gt; &lt;/h4&gt;
+#           &lt;h5 class=&quot;savers-label&quot;&gt;     PEOPLE&lt;/h5&gt;
+#           &lt;div class=&quot;savers savers2&quot;&gt;  &lt;a class=&quot;delNav&quot; href=&quot;/url/7df6661946fca61863312644eb071953&quot;&gt;&lt;span class=&quot;delNavCount&quot;&gt;26&lt;/span&gt;&lt;/a&gt;  &lt;/div&gt;
+#           &lt;div class=&quot;description&quot;&gt;     The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. &lt;/div&gt;
+#         &lt;/div&gt;
+#         &lt;div class=&quot;meta&quot;&gt;&lt;/div&gt;
+#         &lt;h5 class=&quot;tag-chain-label&quot;&gt;TAGS&lt;/h5&gt;
+#         &lt;div class=&quot;tagdisplay&quot;&gt;
+#           &lt;ul class=&quot;tag-chain&quot;&gt;
+#             &lt;li class=&quot;tag-chain-item off first&quot;&gt;&lt;a class=&quot;tag-chain-item-link&quot; rel=&quot;tag&quot; href=&quot;/infochimps/blog&quot;     &gt;&lt;span class=&quot;tag-chain-item-span&quot;&gt;blog&lt;/span&gt;    &lt;/a&gt;&lt;/li&gt;
+#             &lt;li class=&quot;tag-chain-item off&quot;&gt;      &lt;a class=&quot;tag-chain-item-link&quot; rel=&quot;tag&quot; href=&quot;/infochimps/corpus&quot;   &gt;&lt;span class=&quot;tag-chain-item-span&quot;&gt;corpus&lt;/span&gt;  &lt;/a&gt;&lt;/li&gt;
+#             &lt;li class=&quot;tag-chain-item off&quot;&gt;      &lt;a class=&quot;tag-chain-item-link&quot; rel=&quot;tag&quot; href=&quot;/infochimps/analysis&quot; &gt;&lt;span class=&quot;tag-chain-item-span&quot;&gt;analysis&lt;/span&gt;&lt;/a&gt;&lt;/li&gt;
+#             &lt;li class=&quot;tag-chain-item off&quot;&gt;      &lt;a class=&quot;tag-chain-item-link&quot; rel=&quot;tag&quot; href=&quot;/infochimps/nlp&quot;      &gt;&lt;span class=&quot;tag-chain-item-span&quot;&gt;nlp&lt;/span&gt;     &lt;/a&gt;&lt;/li&gt;
+#             &lt;li class=&quot;tag-chain-item on  last&quot;&gt; &lt;a class=&quot;tag-chain-item-link&quot; rel=&quot;tag&quot; href=&quot;/infochimps/dataset&quot;  &gt;&lt;span class=&quot;tag-chain-item-span&quot;&gt;dataset&lt;/span&gt; &lt;/a&gt;&lt;/li&gt;
+#           &lt;/ul&gt;
+#         &lt;/div&gt;
+#         &lt;div class=&quot;clr&quot;&gt;&lt;/div&gt;
+#       &lt;/div&gt;
+#     &lt;/li&gt;
+#   &lt;/ul&gt;
+#
+# == Parser Specification:
+#   :bookmarks            =&gt; [ 'ul#bookmarklist/li.post/.bookmark',
+#     {
+#       :date                     =&gt; hash(    '.dateGroup/span',
+#          [:day, :month, :year]  =&gt; regexp(  '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
+#          ),
+#       :title                    =&gt;          '.data/h4/a.taggedlink',
+#       :url                      =&gt; attr(    '.data/h4/a.taggedlink', 'href'),
+#       :del_link_url             =&gt; href(    '.data/.savers/a.delNav'),
+#       :num_savers               =&gt; to_i(    '.data/.savers//span.delNavCount'),
+#       :description              =&gt;          '.data/.description',
+#       :tags                     =&gt;         ['.tagdisplay//span.tag-chain-item-span']
+#     }
+#   ]
+#
+# == Example output:
+#   { :bookmarks =&gt; [
+#     { :date             =&gt; { :year =&gt; '08', :month =&gt; 'APR', :day =&gt; '23' },
+#       :title            =&gt; 'Blog Authorship Corpus (Blogger.com 1994)',
+#       :url              =&gt; 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
+#       :del_link_url     =&gt; '/url/7df6661946fca61863312644eb071953',
+#       :num_savers       =&gt; 26,
+#       :description      =&gt; 'The Blog ... ',
+#       :tags             =&gt; ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
+#      }
+#    ]}
+#
+# == Implementation:
+#
+# Internally, we take the spec and turn it into a recursive structure of Matcher
+# objects.  These consume Hpricot Elements and return the appropriately extracted
+# object.
+#
+# Note that the /default/ is for a bare selector to match ONE element, and to not
+# complain if there are many.
+#
+# Missing elements are silently ignored -- for example if
+#   :foo =&gt; 'li.missing'
+# there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
+# set to nil -- hsh.include?(:foo) will be false)
+#
+#
+# == List of Matchers:
+#     { :field =&gt; /spec/, ... }           # hash          hash, each field taken from spec.
+#     [ &quot;hpricot_path&quot; ]                  # 1-el array    array: for each element matching
+#                                                         hpricot_path, the inner_html
+#     [ &quot;hpricot_path&quot;, /spec/ ]          # 2-el array    array: for each element matching
+#                                                         hpricot_path, pass to spec
+#     &quot;hpricot_path&quot;                      # string        same as one(&quot;hpricot_path&quot;)
+#     one(&quot;hpricot_path&quot;)                 # one           first match to hpricot_path
+#     one(&quot;hpricot_path&quot;, /spec/)         # one           applies spec to first match to hpricot_path
+#     (these all match on one path:)
+#     regexp(&quot;hpricot_path&quot;, /RE/)        # regexp        capture groups from matching RE against
+#                                                         inner_html of first match to hpricot_path
+#     attr(&quot;hpricot_path&quot;, 'attr_name')   # attr
+#     href(&quot;hpricot_path&quot;)                # href          shorthand for attr(foo, 'href')
+#     no_html                             #               strip tags from contents
+#     html_encoded                        #               html encode contents
+#     to_i, to_f, etc                     # convert
+#     lambda{|doc| ... }                  # proc          calls proc on current doc
+#
+# == Complicated HCard example:
+#     :hcards                     =&gt;      [ '//ul.users/li.vcard',
+#       {
+#         :name                   =&gt;      '.fn',
+#         :address                =&gt;      one('.adr',
+#           :street               =&gt;      '.street',
+#           :city                 =&gt;      '.city',
+#           :zip                  =&gt;      '.postal'
+#         ),
+#         :tel                    =&gt;      [ 'span.tel',
+#           {
+#             :type               =&gt;      'span.type',
+#             [:cc, :area, :num]  =&gt;      hp.regexp('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
+#           }
+#         ],
+#         :tags                   =&gt;      [ '.tag' ],
+#       }
+#     ]
+#
+# == Resulting Parser
+#     MatchHash({:hcards  =&gt;      MatchArray('//ul.users/li.vcard',
+#       MatchHash({
+#         :name                   =&gt;      MatchFirst('.fn'),
+#         :address                =&gt;      MatchFirst('.adr',
+#           MatchHash({
+#             :street             =&gt;      MatchFirst('.street'),
+#             :city               =&gt;      MatchFirst('.locality'),
+#             :state              =&gt;      MatchFirst('.region'),
+#             :zip                =&gt;      MatchFirst('.postal'),
+#           })),
+#         :tel                    =&gt;      MatchArray('span.tel',
+#           MatchHash({
+#             :type               =&gt;      MatchFirst('span.type'),
+#             [:cc, :area, :num]  =&gt;      RegexpMatcher('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
+#           })
+#         ),
+#         :tags                   =&gt;      MatchArray('.tag'),
+#       })
+#     )
+#
+# == Example output
+#     [
+#       {:tel     =&gt; [ {:type =&gt; 'home', :cc =&gt; '49', :area =&gt; '305', :num =&gt; '555-1212'},
+#                      {:type =&gt; 'work', :cc =&gt; '49', :area =&gt; '305', :num =&gt; '555-6969'}, ],
+#        :name    =&gt; &quot;Bob Dobbs, Jr.&quot;,
+#        :tags    =&gt; [&quot;church&quot;] },
+#       {:tel     =&gt; [ {:type =&gt; 'fax',  :cc =&gt; '49', :area =&gt; '305', :num =&gt; '867-5309'}, ],
+#        :name    =&gt; &quot;Jenny&quot;,
+#        :address =&gt; { :street =&gt; &quot;53 Evergreen Terr.&quot;, :city =&gt; &quot;Springfield&quot; },
+#        :tags    =&gt; [&quot;bathroom&quot;, &quot;wall&quot;] },
+#     ]
+
 class HTMLParser
   attr_accessor :mapping
 </diff>
      <filename>lib/imw/extract/html_parser.rb</filename>
    </modified>
  </modified>
  <removed type="array">
    <removed>
      <filename>lib/imw/asset_store/asset.rb</filename>
    </removed>
    <removed>
      <filename>lib/imw/asset_store/cached_asset.rb</filename>
    </removed>
    <removed>
      <filename>lib/imw/asset_store/file_store.rb</filename>
    </removed>
    <removed>
      <filename>lib/imw/asset_store/tiered_path_linkish.rb</filename>
    </removed>
  </removed>
  <parents type="array">
    <parent>
      <id>c172acd008f04c310cea5a7fff5b9498d02054d1</id>
    </parent>
  </parents>
  <author>
    <name>Philip (flip) Kromer</name>
    <email>flip@infochimps.org</email>
  </author>
  <url>http://github.com/infochimps/imw/commit/99c72cfa4235acd659fb0c8e56fea74b177badda</url>
  <id>99c72cfa4235acd659fb0c8e56fea74b177badda</id>
  <committed-date>2008-11-25T21:15:19-08:00</committed-date>
  <authored-date>2008-11-25T21:15:19-08:00</authored-date>
  <message>Kinda know how to organize this... let's find out</message>
  <tree>62d8c9e4bf89876a94cee725a8bbfe2755b65ecc</tree>
  <committer>
    <name>Philip (flip) Kromer</name>
    <email>flip@infochimps.org</email>
  </committer>
</commit>
