<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array"/>
  <modified type="array">
    <modified>
      <diff>@@ -14,7 +14,7 @@ logging.info(&quot;Starting application in DEBUG mode: %s&quot;, DEBUG)
 DEFAULT_BLOG = {
     &quot;bloog_version&quot;: &quot;0.8&quot;,
     &quot;html_type&quot;: &quot;text/html&quot;,
-    &quot;charset&quot;: &quot;iso-8859-1&quot;,
+    &quot;charset&quot;: &quot;utf-8&quot;,
     &quot;title&quot;: &quot;Bloog&quot;,
     &quot;author&quot;: &quot;Bill Katz&quot;,
     # This must be the email address of a registered administrator for the </diff>
      <filename>config.py</filename>
    </modified>
    <modified>
      <diff>@@ -90,16 +90,23 @@ Options:
 -l, --url        = the url (web location) of the Bloog app
 -a, --articles   = only upload this many articles (for testing)
 &quot;&quot;&quot;
+DB_ENCODING = 'latin-1'
 
 # List the ASCII chars that are OK for our pages
+NEWLINE_CHARS = [ord(x) for x in ['\n', '\t', '\r']]
 OK_CHARS = range(32,126) + [ord(x) for x in ['\n', '\t', '\r']]
 OK_TITLE = range(32,126)
 
 def clean_multiline(raw_string):
     return ''.join([x for x in raw_string if ord(x) in OK_CHARS])
 
-def clean_singleline(raw_string):
-    return ''.join([x for x in raw_string if ord(x) in OK_TITLE])
+def force_singleline(raw_string):
+    return ''.join([x for x in raw_string if ord(x) not in NEWLINE_CHARS])
+
+def fix_string(str_from_db):
+    # Add encoding change here if needed.
+    # For Bloog, will just output latin-1 and let it convert to utf-8
+    return str_from_db
 
 def fix_thread_string(tstr):
     &quot;&quot;&quot;
@@ -227,21 +234,25 @@ class DrupalConverter(object):
 
     def get_html(self, raw_body, markup_type):
         &quot;&quot;&quot; Convert various Drupal formats to html &quot;&quot;&quot;
+        
+        utf8_body = fix_string(raw_body)
+
         def repl(tmatch):
-            if tmatch:
-                return textile.textile(tmatch.group(1))
+            if tmatch:   # Assume latin-1.  Will be converted by Bloog.
+                return textile.textile(tmatch.group(1), 
+                                       encoding='latin-1', output='latin-1')
 
         # Because Drupal textile formatting allows use of [textile][/textile] 
         # delimeters, remove them.
         if markup_type == 'textile':
             pattern = re.compile('\[textile\](.*)\[/textile\]', 
                                  re.MULTILINE | re.IGNORECASE | re.DOTALL)
-            body = re.sub(pattern, repl, raw_body)
+            body = re.sub(pattern, repl, utf8_body)
         elif markup_type == 'filtered html':
-            body = re.sub('\n', '&lt;br /&gt;', raw_body)
+            body = re.sub('\n', '&lt;br /&gt;', utf8_body)
         else:
             body = raw_body
-        return clean_multiline(body)
+        return body
 
     def go(self, num_articles=None):
         # Get all the term (tag) data and the hierarchy pattern
@@ -266,7 +277,7 @@ class DrupalConverter(object):
             ntype = row[1]
             if ntype in ['page', 'blog']:
                 article['legacy_id'] = row[0]
-                article['title'] = clean_singleline(row[2])
+                article['title'] = force_singleline(row[2])
                 article['format'] = None
                 if row[14] &gt;= 0 and row[14] &lt;= 4:
                     cur_format = self.drupal_format_description[row[14]]
@@ -333,13 +344,13 @@ class DrupalConverter(object):
                 # Store comment associated with article by POST to 
                 # article entry url
                 comment = {
-                    'title': clean_singleline(row[0]),
-                    'body': clean_multiline(row[1]),
+                    'title': force_singleline(row[0]),
+                    'body': fix_string(row[1]),
                     'published': str(datetime.datetime.fromtimestamp(row[2])),
-                    'thread': fix_thread_string(clean_singleline(row[3])),
-                    'name': clean_singleline(row[4]),
-                    'email': clean_singleline(row[5]),
-                    'homepage': clean_singleline(row[6])
+                    'thread': fix_thread_string(force_singleline(row[3])),
+                    'name': force_singleline(row[4]),
+                    'email': force_singleline(row[5]),
+                    'homepage': force_singleline(row[6])
                 }
                 print &quot;Posting comment '&quot; + row[0] + &quot;' to&quot;, \
                       comment_posting_url</diff>
      <filename>dev/scripts/drupal_uploader.py</filename>
    </modified>
    <modified>
      <diff>@@ -52,6 +52,7 @@ from google.appengine.ext.webapp import template
 
 from handlers import restful
 from utils import authorized
+from utils import sanitizer
 import models
 import view
 import config
@@ -89,12 +90,26 @@ def get_format(format_string):
     return format_string
 
 def get_tag_key(tag_name):
-    obj = models.blog.Tag.get_or_insert(tag_name)
+    obj = models.blog.Tag.get_or_insert(tag_name.lower())
     return obj.key()
 
+def process_tag(tag_name, tags):
+    # Check tag_name against all 'name' values in tags and coerce
+    tag_name = tag_name.strip()
+    lowercase_name = tag_name.lower()
+    for tag in tags:
+        if lowercase_name == tag['name'].lower():
+            return tag['name']
+    return tag_name
+
 def get_tags(tags_string):
+    logging.debug(&quot;get_tags: tag_string = %s&quot;, tags_string)
     if tags_string:
-        return [s.strip() for s in tags_string.split(&quot;,&quot;) if s != '']
+        from models.blog import Tag
+        tags = Tag.list()
+        logging.debug(&quot;  tags = %s&quot;, tags)
+        return [process_tag(s, tags) 
+                for s in tags_string.split(&quot;,&quot;) if s != '']
     return None
     
 def get_friendly_url(title):
@@ -105,23 +120,23 @@ def get_friendly_url(title):
 def get_html(body, markup_type):
     if markup_type == 'textile':
         from external.libs import textile
-        return textile.textile(str(body))
+        return textile.textile(body)
     return body
 
 def get_captcha(key):
     return (&quot;%X&quot; % abs(hash(str(key) + config.BLOG['title'])))[:6]
 
-def sanitize_html(html):
-    from utils import sanitizer
-    try:
-        clean_html = sanitizer.sanitize_html(html, 
-                                             allow_attributes=['href', 'src'],
-                                             blacklist_tags=['img'])
-        return clean_html
-    except sanitizer.DangerousHTMLError, e:
-        logging.error(&quot;Sanitized HTML has dangerous elements: %s&quot;, e.value)
-        return None
-
+def get_sanitizer_func(handler, **kwargs):
+    match_obj = re.match(r'.*;\s*charset=(?P&lt;charset&gt;[\w-]+)',  
+                         handler.request.headers['CONTENT_TYPE'])
+    kwlist = {}
+    kwlist.update(kwargs)
+    if match_obj:
+        kwlist.update({ 'encoding': match_obj.group('charset').lower() })
+    logging.debug(&quot;Content-type: %s&quot;, handler.request.headers['CONTENT_TYPE'])
+    logging.debug(&quot;In sanitizer: %s&quot;, kwlist)
+    return lambda html : sanitizer.sanitize_html(html, **kwlist)
+        
 def process_embedded_code(article):
     # TODO -- Check for embedded code, escape opening triangular brackets
     # within code, and set article embedded_code strings so we can
@@ -138,7 +153,7 @@ def process_article_edit(handler, permalink):
         params[key] = value[0]
     property_hash = restful.get_sent_properties(params.get,
         ['title',
-         'body',
+         ('body', get_sanitizer_func(handler, allow_styling=True)),
          ('format', get_format),
          ('updated', get_datetime),
          ('tags', get_tags),
@@ -149,11 +164,10 @@ def process_article_edit(handler, permalink):
             property_hash['tag_keys'] = [get_tag_key(name) 
                                          for name in property_hash['tags']]
         article = db.Query(models.blog.Article).filter('permalink =', permalink).get()
-        before_tags = set(article.tags)
+        before_tags = set(article.tag_keys)
         for key,value in property_hash.iteritems():
-            logging.debug(&quot;  Setting %s&quot;, key)
             setattr(article, key, value)
-        after_tags = set(article.tags)
+        after_tags = set(article.tag_keys)
         for removed_tag in before_tags - after_tags:
             db.get(removed_tag).counter.decrement()
         for added_tag in after_tags - before_tags:
@@ -168,7 +182,7 @@ def process_article_edit(handler, permalink):
 def process_article_submission(handler, article_type):
     property_hash = restful.get_sent_properties(handler.request.get, 
         ['title',
-         'body',
+         ('body', get_sanitizer_func(handler, allow_styling=True)),
          'legacy_id',
          ('format', get_format),
          ('published', get_datetime),
@@ -197,22 +211,19 @@ def process_article_submission(handler, article_type):
         handler.error(400)
 
 def process_comment_submission(handler, article):
+    sanitize_comment = get_sanitizer_func(handler,
+                                          allow_attributes=['href', 'src'],
+                                          blacklist_tags=['img'])
     property_hash = restful.get_sent_properties(handler.request.get, 
         ['name',
          'email',
          'homepage',
          'title',
-         'body',
+         ('body', sanitize_comment),
          'key',
          'thread',    # If it's given, use it.  Else generate it.
          'captcha',
          ('published', get_datetime)])
-    html = sanitize_html(property_hash['body'])
-    if html is None:
-        handler.error(400)
-        return
-    else:
-        property_hash['body'] = html
 
     # If we aren't administrator, abort if bad captcha
     if not users.is_current_user_admin():
@@ -255,8 +266,14 @@ def process_comment_submission(handler, article):
         article.num_comments += 1
     property_hash['article'] = article.put()
 
-    comment = models.blog.Comment(**property_hash)
-    comment.put()
+    try:
+        comment = models.blog.Comment(**property_hash)
+        comment.put()
+    except:
+        logging.debug(&quot;Bad comment: %s&quot;, property_hash)
+        handler.error(400)
+        return
+
     # Render just this comment and send it to client
     response = template.render(
         &quot;views/%s/bloog/blog/comment.html&quot; % config.BLOG['theme'], 
@@ -326,6 +343,7 @@ class RootHandler(restful.Controller):
         process_article_submission(handler=self, article_type='article')
 
 # Articles are off root url
+# TODO -- Make it DRY by combining Article/MonthHandler
 class ArticleHandler(restful.Controller):
     def get(self, path):
         logging.debug(&quot;ArticleHandler#get on path (%s)&quot;, path)
@@ -362,9 +380,6 @@ class ArticleHandler(restful.Controller):
         By using DELETE on /Article, /Comment, /Tag, you can delete the first 
          entity of the desired kind.
         This is useful for writing utilities like clear_datastore.py.  
-        TODO - Once we write a DELETE for specific entities, it makse sense to 
-         DRY this up and just require a utility to inquire which entities are 
-         available and then call DELETE on each permalink.
         &quot;&quot;&quot;
         # TODO: Add DELETE for articles off root like blog entry DELETE.
         model_class = path.lower()
@@ -396,7 +411,13 @@ class ArticleHandler(restful.Controller):
             query = models.blog.Tag.all()
             delete_entity(query)
         else:
-            self.error(404)
+            article = db.Query(models.blog.Article). \
+                         filter('permalink =', path).get()
+            for key in article.tag_keys:
+                db.get(key).counter.decrement()
+            article.delete()
+            view.invalidate_cache()
+            restful.send_successful_response(self, &quot;/&quot;)
 
 # Blog entries are dated articles
 class BlogEntryHandler(restful.Controller):</diff>
      <filename>handlers/bloog/blog.py</filename>
    </modified>
    <modified>
      <diff>@@ -62,10 +62,10 @@ def get_sent_properties(request_func, propname_list):
     &quot;&quot;&quot;
     prop_hash = {}
     for item in propname_list:
-        if type(item) == str:
+        if isinstance(item, basestring):
             key = item
             value = request_func(item)
-        elif type(item == tuple):
+        elif isinstance(item, tuple):
             key = item[0]
             prop_func = item[1]
             if len(item) &lt;= 2:</diff>
      <filename>handlers/restful.py</filename>
    </modified>
    <modified>
      <diff>@@ -33,6 +33,8 @@ YAHOO.bloog.initAdmin = function() {
                 YAHOO.bloog.http.action = '/';
                 YAHOO.bloog.http.verb = 'POST';
                 YAHOO.bloog.editor.setEditorHTML('&lt;p&gt;Article goes here&lt;/p&gt;');
+                YAHOO.bloog.postDialog.render();
+                YAHOO.bloog.postDialog.show();
                 break;
             case 'newblog':
                 hdr.setContent('Submit Blog Entry');
@@ -42,6 +44,8 @@ YAHOO.bloog.initAdmin = function() {
                 YAHOO.bloog.http.action = &quot;/&quot; + year + &quot;/&quot; + month;
                 YAHOO.bloog.http.verb = 'POST';
                 YAHOO.bloog.editor.setEditorHTML('&lt;p&gt;Blog entry goes here&lt;/p&gt;');
+                YAHOO.bloog.postDialog.render();
+                YAHOO.bloog.postDialog.show();
                 break;
             case 'editbtn':
                 hdr.setContent('Submit Edit');
@@ -55,8 +59,6 @@ YAHOO.bloog.initAdmin = function() {
                 }, null);
                 break;
         }
-        YAHOO.bloog.postDialog.render();
-        YAHOO.bloog.postDialog.show();
     }
 
     YAHOO.bloog.populateDialog = function(o) {
@@ -64,6 +66,8 @@ YAHOO.bloog.initAdmin = function() {
         document.getElementById(&quot;postTitle&quot;).value = article.title;
         document.getElementById(&quot;postTags&quot;).value = article.tags.join(', ');
         YAHOO.bloog.editor.setEditorHTML(article.body);
+        YAHOO.bloog.postDialog.render();
+        YAHOO.bloog.postDialog.show();
     }
 
     var handleSubmit = function() {
@@ -112,7 +116,7 @@ YAHOO.bloog.initAdmin = function() {
         animate: true,
         toolbar: {
             titlebar: '',
-            draggable: false,
+            draggable: true,
             buttonType: 'advanced',
             buttons: [
                 /*** Prefer to have blog articles of one font and use consistent sizing</diff>
      <filename>static/default/js/bloog_admin.js</filename>
    </modified>
    <modified>
      <diff>@@ -50,7 +50,8 @@ def process_html(html):
       The modified html and a list of strings giving the embedded
       code languages.
     &quot;&quot;&quot;
-    code_tag = re.compile('&lt;pre name=&quot;code&quot; class=&quot;([^&quot;]+)&quot;&gt;', re.MULTILINE)
+    code_tag = re.compile('\s*&lt;pre name=&quot;code&quot; class=&quot;([^&quot;]+)&quot;&gt;', 
+                          re.MULTILINE)
     languages = set([])
     soup = BeautifulSoup(html)
     clean_html = ''
@@ -65,5 +66,5 @@ def process_html(html):
             
     # Map the language class names to the spelling for javascript files
     list_language_files = [language_jsfiles[lang] for lang in list(languages)]
-    return sanitizer.clean_multiline(clean_html), list_language_files
+    return clean_html.decode('utf-8'), list_language_files
 </diff>
      <filename>utils/codehighlighter.py</filename>
    </modified>
    <modified>
      <diff>@@ -116,11 +116,11 @@ __history__ = &quot;&quot;&quot;
 &quot;&quot;&quot;
 
 # Set your encoding here.
-ENCODING = 'latin-1'
+ENCODING = 'utf-8'
 
 # Output? Non-ASCII characters will be automatically
 # converted to XML entities if you choose ASCII.
-OUTPUT = 'ascii'
+OUTPUT = 'utf-8'
 
 # PyTextile can optionally validate the generated
 # XHTML code. We can use either mxTidy or uTidyLib.</diff>
      <filename>utils/external/textile.py</filename>
    </modified>
    <modified>
      <diff>@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+import logging
 import string
 import re
 
@@ -59,19 +60,10 @@ class DangerousHTMLError(Exception):
     def __str__(self):
         return ' ~ '.join(self.value)
 
-# List the ASCII chars that are OK for our pages
-OK_CHARS = range(32,126) + [ord(x) for x in ['\n', '\t', '\r']]
-OK_TITLE = range(32,126)
-
-def clean_multiline(raw_string):
-    return ''.join([x for x in raw_string if ord(x) in OK_CHARS])
-
-def clean_singleline(raw_string):
-    return ''.join([x for x in raw_string if ord(x) in OK_TITLE])
-
-def sanitize_html(html='&lt;p&gt;No comment&lt;/p&gt;', 
+def sanitize_html(html='&lt;p&gt;No comment&lt;/p&gt;', encoding=None,
                   allow_tags=[], allow_attributes=[],
-                  blacklist_tags=[], blacklist_attributes=[]):
+                  blacklist_tags=[], blacklist_attributes=[],
+                  allow_styling=False):
     &quot;&quot;&quot;Parses HTML and tries to sanitize it using white list.
     
     This method is a mishmash of code from Django snippets
@@ -83,9 +75,10 @@ def sanitize_html(html='&lt;p&gt;No comment&lt;/p&gt;',
     So sanitized HTML cannot be colored or highlighted using styles.
 
     Args:
-      html: HTML to be sanitized
+      html: HTML to be sanitized.
       allow_tags: limit all tags to just this list
       allow_attributes: limit all tags to just this list
+      allow_styling: should only be TRUE if you trust source
 
     Returns:
       Sanitized version of html
@@ -100,9 +93,17 @@ def sanitize_html(html='&lt;p&gt;No comment&lt;/p&gt;',
     allow_tags = [tag for tag in allow_tags if tag not in blacklist_tags]
     allow_attributes = [tag for tag in allow_attributes 
                         if tag not in blacklist_tags]
-
-    #cleaner_html = limit_xss_vectors(html)
-    soup = BeautifulSoup(html)
+    if allow_styling:
+        allow_attributes.append('style')
+
+    if isinstance(html, unicode) and not encoding:
+        logging.debug(&quot;Sanitizing unicode input.&quot;)
+        soup = BeautifulSoup(html)
+    else:
+        if not encoding:
+            encoding = 'latin-1'
+        logging.debug(&quot;Sanitizing string input, assuming %s&quot;, encoding)
+        soup = BeautifulSoup(html.decode(encoding, 'ignore'))
     for comment in soup.findAll(
                     text = lambda text: isinstance(text, Comment)):
         comment.extract()
@@ -121,7 +122,7 @@ def sanitize_html(html='&lt;p&gt;No comment&lt;/p&gt;',
                         raise DangerousHTMLError(html)
                 ok_attrs += [(attr, val)]
         tag.attrs = ok_attrs
-    return soup.renderContents().decode('utf8')
+    return soup.renderContents().decode('utf-8')
 
 def chop_up(text, chop_size=5):
     &quot;Returns a list of smaller chunks of text&quot;</diff>
      <filename>utils/sanitizer.py</filename>
    </modified>
  </modified>
  <removed type="array"/>
  <parents type="array">
    <parent>
      <id>16a90c4b9f499826146b68ec014ac5ddf37c10ee</id>
    </parent>
  </parents>
  <author>
    <name>Bill Katz</name>
    <email>billkatz@gmail.com</email>
  </author>
  <url>http://github.com/DocSavage/bloog/commit/4194cba50b77f669ac238925dad566ea79226f4a</url>
  <id>4194cba50b77f669ac238925dad566ea79226f4a</id>
  <committed-date>2008-08-21T21:50:01-07:00</committed-date>
  <authored-date>2008-08-21T21:50:01-07:00</authored-date>
  <message>Improved handling of text encoding.  Now primarily utf-8 inside app.  Will check Content-Type header for encoding on POST/PUT.</message>
  <tree>81c5eea0d729eaa959a75ec4ab4a1660d43396a7</tree>
  <committer>
    <name>Bill Katz</name>
    <email>billkatz@gmail.com</email>
  </committer>
</commit>
