Update cnx-archive to use python 3 (but still compatible with python 2)

openstax · Oct 21, 2015 · ad8db6c · ad8db6c
1 parent 7c364f0
commit ad8db6c
Show file tree

Hide file tree

Showing 25 changed files with 988 additions and 885 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,6 +1,7 @@
 language: python
 python:
   - "2.7"
+  - "3.4"
 addons:
   postgresql: "9.3"
 services:
@@ -19,10 +20,10 @@ before_install:
 
   # Install rhaptos.cnxmlutils (dependency of cnx-cnxml-transforms)
   - git clone https://github.com/Connexions/rhaptos.cnxmlutils.git
-  - cd ./rhaptos.cnxmlutils && python setup.py install && cd ..
+  - cd ./rhaptos.cnxmlutils && python setup.py install && sudo /usr/bin/python setup.py install && cd ..
   # Install cnx-cnxml-transforms
   - git clone https://github.com/Connexions/cnx-cnxml-transforms.git
-  - cd ./cnx-cnxml-transforms && python setup.py install && cd ..
+  - cd ./cnx-cnxml-transforms && python setup.py install && sudo /usr/bin/python setup.py install && cd ..
   # Install cnx-query-grammar
   - git clone https://github.com/Connexions/cnx-query-grammar.git
   - cd ./cnx-query-grammar && python setup.py install && cd ..
@@ -34,12 +35,13 @@ before_install:
   - pip install coverage
   - pip install coveralls
 
-  # FIXME patch triggers to include paths to cnx-cnxml-transforms and rhaptos.cnxmlutils
-  - CNXML_TRANSFORMS_PATH=$(python -c 'import os; import cnxmltransforms; print(os.path.abspath("{}/..".format(cnxmltransforms.__path__[0])))')
-  - RHAPTOS_CNXMLUTILS_PATH=$(python -c 'import os; import rhaptos.cnxmlutils; print(os.path.abspath("{}/../..".format(rhaptos.cnxmlutils.__path__[0])))')
-  - LXML_PATH=$(python -c 'import os; import lxml; print(os.path.abspath("{}/..".format(lxml.__path__[0])))')
-  - cd cnx-cnxml-transforms && python setup.py install && cd ..
-  - sed -i "s%from cnxmltransforms%import sys; sys.path.append('$CNXML_TRANSFORMS_PATH'); sys.path.append('$RHAPTOS_CNXMLUTILS_PATH'); sys.path.append('$LXML_PATH'); &%" cnxarchive/sql/schema/*.sql
+# Doesn't work with python 3
+#  # FIXME patch triggers to include paths to cnx-cnxml-transforms and rhaptos.cnxmlutils
+#  - CNXML_TRANSFORMS_PATH=$(python -c 'import os; import cnxmltransforms; print(os.path.abspath("{}/..".format(cnxmltransforms.__path__[0])))')
+#  - RHAPTOS_CNXMLUTILS_PATH=$(python -c 'import os; import rhaptos.cnxmlutils; print(os.path.abspath("{}/../..".format(rhaptos.cnxmlutils.__path__[0])))')
+#  - LXML_PATH=$(python -c 'import os; import lxml; print(os.path.abspath("{}/..".format(lxml.__path__[0])))')
+#  - cd cnx-cnxml-transforms && python setup.py install && cd ..
+#  - sed -i "s%from cnxmltransforms%import sys; sys.path.append('$CNXML_TRANSFORMS_PATH'); sys.path.append('$RHAPTOS_CNXMLUTILS_PATH'); sys.path.append('$LXML_PATH'); &%" cnxarchive/sql/schema/*.sql
 install:
   - python setup.py install
 before_script:

diff --git a/cnxarchive/__init__.py b/cnxarchive/__init__.py
@@ -6,6 +6,9 @@
 # See LICENCE.txt for details.
 # ###
 """Document and collection archive web application."""
+from __future__ import unicode_literals
+import sys
+
 from pyramid.config import Configurator
 
 
@@ -22,6 +25,7 @@
     'cache-control',
     'content-type',
     ]
+IS_PY2 = sys.version_info.major == 2
 
 
 def declare_api_routes(config):
@@ -72,6 +76,9 @@ def main(global_config, **settings):
         if not settings.get(setting, None):
             raise ValueError('Missing {} config setting.'.format(setting))
 
-    config.scan(ignore='.tests')
+    if IS_PY2:
+        config.scan(ignore=b'.tests')
+    else:
+        config.scan(ignore='.tests')
     config.include('cnxarchive.events.main')
     return config.make_wsgi_app()
diff --git a/cnxarchive/cache.py b/cnxarchive/cache.py
@@ -6,14 +6,15 @@
 # See LICENCE.txt for details.
 # ###
 """Memcached utilities"""
-
+from __future__ import unicode_literals
 import base64
 import copy
 
 import memcache
 from pyramid.threadlocal import get_current_registry
 
 from .search import search as database_search
+from .utils import utf8
 
 
 def search(query, query_type, nocache=False):
@@ -36,11 +37,11 @@ def search(query, query_type, nocache=False):
 
     # search_key should look something like:
     # '"sort:pubDate" "text:college physics" "query_type:weakAND"'
-    search_key = ' '.join(['"{}"'.format(':'.join(param))
+    search_key = ' '.join(['"{}"'.format(':'.join(utf8(param)))
                            for param in search_params])
     # since search_key is not a valid memcached key, use base64
     # encoding to make it into a valid key
-    mc_search_key = base64.b64encode(search_key)
+    mc_search_key = base64.b64encode(search_key.encode('utf-8'))
 
     # look for search results in memcache first, unless nocache
     mc = memcache.Client(memcache_servers,
@@ -71,5 +72,6 @@ def search(query, query_type, nocache=False):
         mc.set(mc_search_key, search_results, time=cache_length,
                min_compress_len=1024*1024)  # compress when > 1MB
 
+    mc.disconnect_all()
     # return search results
     return search_results
diff --git a/cnxarchive/config.py b/cnxarchive/config.py
@@ -5,7 +5,7 @@
 # Public License version 3 (AGPLv3).
 # See LICENCE.txt for details.
 # ###
-
+from __future__ import unicode_literals
 import os.path
 
 

diff --git a/cnxarchive/database.py b/cnxarchive/database.py
@@ -6,13 +6,14 @@
 # See LICENCE.txt for details.
 # ###
 """Database models and utilities"""
+from __future__ import unicode_literals
 import datetime
 import os
 import json
 import psycopg2
 import re
 
-from . import config
+from . import config, IS_PY2
 from .utils import split_ident_hash
 
 
@@ -47,7 +48,7 @@ def _read_sql_file(name):
 
 
 def _read_schema_manifest(manifest_filepath):
-    with open(os.path.abspath(manifest_filepath), 'rb') as fp:
+    with open(os.path.abspath(manifest_filepath), 'r') as fp:
         raw_manifest = json.loads(fp.read())
     manifest = []
     relative_dir = os.path.abspath(os.path.dirname(manifest_filepath))
@@ -76,7 +77,7 @@ def _compile_manifest(manifest, content_modifier=None):
         if isinstance(item, list):
             items.extend(_compile_manifest(item, content_modifier))
         else:
-            with open(item, 'rb') as fp:
+            with open(item, 'r') as fp:
                 content = fp.read()
             if content_modifier:
                 content = content_modifier(item, content)
@@ -91,7 +92,7 @@ def get_schema():
 
     # Modify the file so that it contains comments that say its origin.
     def file_wrapper(f, c):
-        return u"-- FILE: {0}\n{1}\n-- \n".format(f, c)
+        return "-- FILE: {0}\n{1}\n-- \n".format(f, c)
 
     return _compile_manifest(schema_manifest, file_wrapper)
 
@@ -140,7 +141,11 @@ def get_tree(ident_hash, cursor):
         tree = cursor.fetchone()[0]
     except TypeError:  # NoneType
         raise ContentNotFound()
-    if type(tree) in (type(''), type(u'')):
+    if IS_PY2:
+        string_types = basestring
+    else:
+        string_types = (str, bytes)
+    if isinstance(tree, string_types):
         import json
         return json.loads(tree)
     else:

diff --git a/cnxarchive/logs.py b/cnxarchive/logs.py
@@ -5,6 +5,7 @@
 # Public License version 3 (AGPLv3).
 # See LICENCE.txt for details.
 # ###
+from __future__ import unicode_literals
 import os.path
 import logging
 import socket

diff --git a/cnxarchive/robots.py b/cnxarchive/robots.py
@@ -1,3 +1,15 @@
+# -*- coding: utf-8 -*-
+# ###
+# Copyright (c) 2015, Rice University
+# This software is subject to the provisions of the GNU Affero General
+# Public License version 3 (AGPLv3).
+# See LICENCE.txt for details.
+# ###
+from __future__ import unicode_literals
+
+from . import IS_PY2
+
+
 class Robots(object):
     def __init__(self, sitemap='http://cnx.org/sitemap.xml', bots=None):
         self.sitemap = sitemap
@@ -6,14 +18,23 @@ def __init__(self, sitemap='http://cnx.org/sitemap.xml', bots=None):
     def add_bot(self, bot_name, delay, pages_to_block):
         self.bots.append(Bot(bot_name, delay, pages_to_block))
 
-    def __str__(self):
+    def to_string(self):
         ret_str = 'Sitemap: ' + self.sitemap + '\n'
         for bot in self.bots:
             ret_str += bot.to_string() + '\n'
         return ret_str
 
-    def to_string(self):
-        return self.__str__()
+    def __str__(self):
+        if IS_PY2:
+            return self.to_string().encode('utf-8')
+        return self.to_string()
+
+    def __unicode__(self):
+        # FIXME remove when we no longer need to support python 2
+        return self.to_string()
+
+    def __bytes__(self):
+        return self.to_string().encode('utf-8')
 
 
 class Bot(object):
@@ -22,13 +43,22 @@ def __init__(self, bot_name, delay, pages_to_block):
         self.delay = delay
         self.blocked = pages_to_block
 
-    def __str__(self):
+    def to_string(self):
         ret_str = 'User-agent: ' + self.name + '\n'
         if self.delay:
             ret_str += 'Crawl-delay: ' + self.delay + '\n'
         for page in self.blocked:
             ret_str += 'Disallow: ' + page + '\n'
         return ret_str
 
-    def to_string(self):
-        return self.__str__()
+    def __str__(self):
+        if IS_PY2:
+            return self.to_string().encode('utf-8')
+        return self.to_string()
+
+    def __unicode__(self):
+        # FIXME remove when we no longer need to support python 2
+        return self.to_string()
+
+    def __bytes__(self):
+        return self.to_string().encode('utf-8')
diff --git a/cnxarchive/scripts/hits_counter.py b/cnxarchive/scripts/hits_counter.py
@@ -11,6 +11,7 @@
 The counts are processed into a time range
 and inserted into the cnx-archive database.
 """
+from __future__ import unicode_literals
 import re
 import argparse
 import gzip

diff --git a/cnxarchive/scripts/initializedb.py b/cnxarchive/scripts/initializedb.py
@@ -6,7 +6,7 @@
 # See LICENCE.txt for details.
 # ###
 """Commandline script used to initialize the SQL database."""
-from __future__ import print_function
+from __future__ import print_function, unicode_literals
 import os
 import sys
 import argparse

diff --git a/cnxarchive/search.py b/cnxarchive/search.py
@@ -6,6 +6,7 @@
 # See LICENCE.txt for details.
 # ###
 """Database search utilities"""
+from __future__ import unicode_literals
 import os
 import json
 import re
@@ -328,7 +329,7 @@ def __hash__(self):
                 authors.add(hashabledict(author))
 
         authors = list(authors)
-        authors.sort(lambda x, y: cmp(y['id'], x['id']))
+        authors.sort(key=lambda a: a['id'], reverse=True)
         setattr(self, attr_name, authors)
         return getattr(self, attr_name)
 
@@ -348,18 +349,18 @@ def _count_field(self, field_name, sorted=True, max_results=None):
 
         if max_results:
             # limit the number of results we return
-            counts = counts.items()
+            counts = list(counts.items())
             # sort counts by the count with highest count first
-            counts.sort(lambda a, b: cmp(a[1], b[1]), reverse=True)
+            counts.sort(key=lambda a: (a[1], a[0].lower()), reverse=True)
             counts = counts[:max_results]
 
         if sorted:
             if isinstance(counts, dict):
-                counts = counts.items()
+                counts = list(counts.items())
             # Sort counts by the name alphabetically
-            counts.sort(lambda a, b: cmp(a[0].lower(), b[0].lower()))
+            counts.sort(key=lambda a: a[0].lower())
         else:
-            counts = counts.iteritems()
+            counts = list(counts.items())
 
         return counts
 
@@ -384,25 +385,21 @@ def _count_authors(self, max_results=None):
                 counts[uid] += 1
                 uid_author.setdefault(uid, author)
         authors = []
-        for uid, count in counts.iteritems():
+        for uid, count in counts.items():
             author = uid_author[uid]
             authors.append(((uid, author,), count))
 
         if max_results:
             # limit the number of results we return
             # sort counts by the count with highest count first
-            authors.sort(lambda a, b: cmp(a[1], b[1]), reverse=True)
+            authors.sort(key=lambda a: a[1], reverse=True)
             authors = authors[:max_results]
 
-        def sort_name(a, b):
+        def sort_name_key(a):
             (uid_a, author_a), count_a = a
-            (uid_b, author_b), count_b = b
-            result = cmp(author_a['surname'], author_b['surname'])
-            if result == 0:
-                result = cmp(author_a['firstname'], author_b['firstname'])
-            return result
+            return (author_a['surname'] or '', author_a['firstname'] or '')
         # Sort authors by surname then first name
-        authors.sort(sort_name)
+        authors.sort(key=sort_name_key)
         authors = [(a[0][0], a[1],) for a in authors]
         return authors
 
@@ -414,12 +411,12 @@ def _count_publication_year(self):
                 continue
             date = datetime(*strptime(date, "%Y-%m-%dT%H:%M:%SZ")[:6],
                             tzinfo=FixedOffsetTimezone())
-            year = unicode(date.astimezone(LOCAL_TZINFO).year)
+            year = str(date.astimezone(LOCAL_TZINFO).year)
             counts.setdefault(year, 0)
             counts[year] += 1
-        counts = counts.items()
+        counts = list(counts.items())
         # Sort pubYear in reverse chronological order
-        counts.sort(lambda a, b: cmp(a[0], b[0]), reverse=True)
+        counts.sort(key=lambda a: a[0], reverse=True)
         return counts
 
 
@@ -536,7 +533,7 @@ def _build_search(structured_query, weights):
     arguments = {}
 
     # Clone the weighted queries for popping.
-    query_weight_order = DEFAULT_SEARCH_WEIGHTS.keys()
+    query_weight_order = list(DEFAULT_SEARCH_WEIGHTS.keys())
 
     # Roll over the weight sequence.
     query_list = []