Skip to content

Commit

Permalink
Merge d33dcd0 into 88d5d7c
Browse files Browse the repository at this point in the history
  • Loading branch information
janetriley committed Jun 24, 2017
2 parents 88d5d7c + d33dcd0 commit fac0bc0
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 61 deletions.
16 changes: 12 additions & 4 deletions baleen/console/commands/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@

from commis import Command
from baleen.console.utils import csv
from baleen.export import MongoExporter, SCHEMES
from baleen.export import MongoExporter, SCHEMES, JSON
from baleen.utils.text import SAFE, SANITIZE_LEVELS

from baleen.utils.timez import Timer


Expand All @@ -47,10 +49,16 @@ class ExportCommand(Command):
},
('-S', '--scheme'): {
'type': str,
'default': 'json',
'default': JSON,
'choices': SCHEMES,
'help': 'specify the output format for the corpus',
},
('-Z', '--sanitize'): {
'type': str,
'default': SAFE,
'choices': SANITIZE_LEVELS,
'help': 'specify what sanitization to apply to html exports',
},
'location': {
'nargs': 1,
'type': str,
Expand All @@ -69,15 +77,15 @@ def handle(self, args):

# Create the exporter object
exporter = MongoExporter(
root, categories=args.categories, scheme=args.scheme
root, categories=args.categories, scheme=args.scheme, sanitize_level=args.sanitize
)

# If list categories is true, list them and exit.
if args.list_categories:
return "\n".join(sorted(exporter.categories))

with Timer() as t:
exporter.export(level=args.scheme)
exporter.export()

return (
"Baleen corpus export complete in {}\n"
Expand Down
132 changes: 90 additions & 42 deletions baleen/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,16 @@
from baleen.utils.text import sanitize_html, SAFE, SANITIZE_LEVELS

# Timestamp format string (strftime syntax).
DTFMT = "%b %d, %Y at %H:%M"

# Supported output formats. Sanitization levels are a separate option and
# are deliberately NOT part of SCHEMES.
JSON = 'json'
HTML = 'html'
SCHEMES = (JSON, HTML)

# Alias kept so that code importing the earlier name keeps working.
EXPORT_FORMATS = SCHEMES

# Life-cycle states of an export run.
State = Enum('State', 'Init, Started, Finished')

# Error message templates: the valid choices are filled in here, the
# offending value is filled in (via the remaining '{}') at raise time.
INVALID_SANITIZE_LEVEL = "Unknown sanitization method: '{{}}' - use one of {LEVELS}.".format(
    LEVELS=(", ".join(SANITIZE_LEVELS)))
INVALID_EXPORT_SCHEME = "Unknown export format scheme '{{}}' - use one of {SCHEMES}.".format(
    SCHEMES=", ".join(SCHEMES))


##########################################################################
## Exporter
Expand All @@ -46,20 +52,24 @@ class MongoExporter(object):
writing posts to disk in either HTML or JSON format.
"""

def __init__(self, root, categories=None, scheme=JSON, sanitize_level=SAFE):
    """
    Configure an exporter; nothing is written to disk here.

    :param root: location on disk to write the corpus to
    :param categories: specific categories to export (None for the default)
    :param scheme: output format of the data, one of SCHEMES (case-insensitive)
    :param sanitize_level: sanitization to apply to content
    :raises ExportError: if the scheme or the sanitization level is unknown
    """
    scheme = scheme.lower()

    # valid_scheme() tolerates an empty value, but the exporter itself
    # needs a concrete format, so reject empty explicitly.
    if not (scheme and self.valid_scheme(scheme)):
        raise ExportError(INVALID_EXPORT_SCHEME.format(scheme))

    if not self.valid_sanitize_level(sanitize_level):
        raise ExportError(INVALID_SANITIZE_LEVEL.format(sanitize_level))

    self.root = root                      # Location on disk to write to
    self.scheme = scheme                  # Output format of the data
    self.sanitize_level = sanitize_level  # Sanitization to apply to content
    self.state = State.Init               # Current state of the export
    self.counts = Counter()               # Counts of posts per category
    self.categories = categories          # Specific categories to export

@property
def categories(self):
if self._categories is None:
Expand Down Expand Up @@ -169,60 +179,98 @@ def feedinfo(self, path):
with open(path, 'w') as f:
f.write(feeds.to_json(indent=2))

def export(self, root=None, categories=None, scheme=None, level=None):
    """
    Export all posts in categories to disk.

    Any argument left as None falls back to the value configured on the
    exporter (root, categories, scheme, sanitize_level).

    :param root: directory to write the corpus into
    :param categories: an iterable of category names to export
    :param scheme: output format, one of SCHEMES
    :param level: sanitization level passed to the HTML rendering
    :raises ExportError: on an unknown scheme or sanitization level, or
        when a target directory cannot be used
    """
    root = root or self.root
    categories = categories or self.categories
    scheme = scheme or self.scheme
    level = level or self.sanitize_level

    # A concrete scheme is required to pick a serializer below.
    if not (scheme and self.valid_scheme(scheme)):
        raise ExportError(INVALID_EXPORT_SCHEME.format(scheme))

    if not self.valid_sanitize_level(level):
        raise ExportError(INVALID_SANITIZE_LEVEL.format(level))

    self.initialize_export_directory(root)
    category_filepaths = self.initialize_category_dirs(
        base_dir=root, categories=categories
    )

    # Reset the counts object and mark export as started.
    self.counts = Counter()
    self.state = State.Started

    # Choose the serializer once instead of rebuilding it for every post.
    serialize = {
        JSON: lambda post: post.to_json(indent=2),
        HTML: lambda post: post.htmlize(sanitize=level),
    }[scheme]

    # Iterate through all posts, writing each one into its category
    # directory in a file named after its object id.
    for post, category in tqdm(self.posts(categories=categories),
                               total=Post.objects.count(),
                               unit="docs"):
        filename = "{}.{}".format(post.id, scheme)
        path = os.path.join(category_filepaths[category], filename)

        with codecs.open(path, 'w', encoding='utf-8') as f:
            f.write(serialize(post))

    # Mark the export as finished and write the README to the corpus.
    self.state = State.Finished
    self.readme(os.path.join(root, "README"))
    self.feedinfo(os.path.join(root, "feeds.json"))

@classmethod
def initialize_category_dirs(cls, base_dir, categories):
    """
    Create the directories for each category on disk and map paths.

    :param base_dir: the absolute filepath to create the category directories in
    :param categories: an iterable of category names
    :return: a dict of categories and their filepath, { 'category1': 'filepath1' }
    :raises ExportError: if a category path is not a directory after the
        creation attempt
    """
    catdir = {}
    for category in categories:
        path = os.path.join(base_dir, category)

        if not os.path.exists(path):
            os.mkdir(path)

        # The path may already exist as something other than a directory.
        if not os.path.isdir(path):
            raise ExportError(
                "Could not create directory '{}'!".format(path)
            )

        catdir[category] = path

    return catdir

with codecs.open(path, 'w', encoding='utf-8') as f:
action = {
'json': lambda: post.to_json(indent=2),
'html': lambda: post.htmlize(sanitize=level)
}[self.scheme]
@classmethod
def initialize_export_directory(cls, directory):
    """
    Make sure the export root exists and is a directory.

    :param directory: path of the directory to export into
    :raises ExportError: if the path is not a directory after the
        creation attempt
    """
    # Make the directory to export if it doesn't exist.
    if not os.path.exists(directory):
        os.mkdir(directory)

    # If the root is not a directory, then we can't write there.
    if not os.path.isdir(directory):
        raise ExportError(
            "'{}' is not a directory!".format(directory)
        )

# Mark the export as finished and write the README to the corpus.
self.state = State.Finished
self.readme(os.path.join(self.root, "README"))
self.feedinfo(os.path.join(self.root, "feeds.json"))
@classmethod
def valid_sanitize_level(cls, level):
    """
    Report whether a sanitization level is acceptable.

    A falsy level (None or an empty string) counts as valid; any other
    value must be a member of SANITIZE_LEVELS.

    :param level: sanitization level
    :return: Boolean
    """
    return (not level) or (level in SANITIZE_LEVELS)

@classmethod
def valid_scheme(cls, scheme):
    """
    Report whether an export scheme is acceptable.

    A falsy scheme (None or an empty string) counts as valid; any other
    value must be a member of SCHEMES.

    :param scheme: export format name
    :return: Boolean
    """
    if not scheme:
        return True
    return scheme in SCHEMES


if __name__ == '__main__':
Expand Down
47 changes: 32 additions & 15 deletions tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ def setUpClass(cls):

assert Feed.objects.count() == 3
assert Post.objects.count() == 3
cls.root_dir = "/tmp"
cls.corpus_dir = "/tmp/corpus"

@classmethod
def tearDownClass(self):
Expand All @@ -120,36 +122,36 @@ def test_scheme_specification(self):
# Make sure good schemes don't error
for scheme in SCHEMES:
try:
exporter = MongoExporter("/tmp/corpus", scheme=scheme)
exporter = MongoExporter(root=self.corpus_dir, scheme=scheme)
except ExportError:
self.fail("Could not use expected scheme, {}".format(scheme))

# Make sure bad schemes do error
for scheme in ('bson', 'xml', 'yaml', 'foo', 'bar'):
with self.assertRaises(ExportError):
exporter = MongoExporter("/tmp/corpus", scheme=scheme)
exporter = MongoExporter(root=self.corpus_dir, scheme=scheme)

def test_categories_default(self):
    """
    Assert that categories are set to default when not provided
    """
    # No categories are passed, so the exporter has to supply its own.
    exporter = MongoExporter(root=self.corpus_dir)
    self.assertCountEqual(CATEGORIES_IN_DB, exporter.categories)

def test_categories_provided(self):
    """
    Assert that provided categories are returned
    """
    # Includes names with a space and with non-ASCII characters.
    categories = ["TestCategory", "Another Category", "Unicode ĆăƮĖƓƠŕƔ"]
    exporter = MongoExporter(root=self.corpus_dir, categories=categories)
    self.assertCountEqual(categories, exporter.categories)

def test_feeds_for_list_of_categories(self):
    """
    Assert that getting feeds for a list of categories works
    """
    exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)
    expected_feeds = [POLITICS_FEED, FOOD_FEED]
    test_categories = ["politics", "food"]
    # Order is not significant, only the set of feeds returned.
    self.assertCountEqual(expected_feeds, exporter.feeds(categories=test_categories))
Expand All @@ -158,21 +160,21 @@ def test_feeds_for_category_string(self):
"""
Assert that getting feeds for a category as a string
"""
exporter = MongoExporter("/tmp/corpus", categories=CATEGORIES_IN_DB)
exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)
self.assertCountEqual([POLITICS_FEED], exporter.feeds(categories="politics"))

def test_feeds_for_all_categories(self):
    """
    Assert that getting feeds without a category returns feeds for all categories
    """
    exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)
    # feeds() is called with no arguments here.
    self.assertCountEqual([POLITICS_FEED, FOOD_FEED, BOOKS_FEED], exporter.feeds())

def test_writing_readme(self):
"""
Assert that a readme file is written correctly
"""
exporter = MongoExporter("/tmp/corpus", categories=CATEGORIES_IN_DB)
exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)
exporter.state = State.Finished
exporter.readme("/tmp/readme")

Expand All @@ -182,15 +184,15 @@ def test_writing_readme_fails(self):
"""
Assert writing readme file fails when in an incorrect state
"""
exporter = MongoExporter("/tmp/corpus", categories=CATEGORIES_IN_DB)
exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)
with self.assertRaises(ExportError):
exporter.readme("/tmp/readme")

def test_generating_posts_fails(self):
"""
Assert generating posts fails when in an incorrect state
"""
exporter = MongoExporter("/tmp/corpus", categories=CATEGORIES_IN_DB)
exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)
exporter.state = "Some crazy thing"
with self.assertRaises(ExportError):
for post, category in exporter.posts():
Expand All @@ -207,7 +209,7 @@ def test_export(self):
"""
Assert that we can export posts
"""
exporter = MongoExporter("/tmp/corpus", categories=CATEGORIES_IN_DB)
exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)

# Mock Mongo calls that aren't supported in MockMongoClient
post_categories = [
Expand All @@ -222,7 +224,7 @@ def test_export_with_root_path_failure(self):
"""
Assert that root path failures are raised
"""
root_path = "/tmp/corpus"
root_path = self.corpus_dir
exporter = MongoExporter(root_path, categories=CATEGORIES_IN_DB)
os.path.exists = lambda path: False if path == root_path else True
os.mkdir = lambda success: True # Mock directory creation
Expand All @@ -235,15 +237,30 @@ def test_export_with_category_path_failure(self):
"""
Assert that category path failures are raised
"""
root_path = "/tmp/corpus"
exporter = MongoExporter(root_path, categories=CATEGORIES_IN_DB)
exporter = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)
for category in CATEGORIES_IN_DB:
category_path = os.path.join(root_path, category)
category_path = os.path.join(self.corpus_dir, category)
os.path.exists = lambda path: False if path == category_path else True
os.mkdir = lambda success: True # Mock directory creation
os.path.isdir = lambda path: False if path == category_path else True

with self.assertRaises(ExportError):
exporter.export()

def test_export_with_invalid_sanitization(self):
    """
    Assert that export raises ExportError for an unknown sanitization level
    """
    subject = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)

    # Callable form of assertRaises: invokes subject.export(level="BOGUS").
    self.assertRaises(ExportError, subject.export, level="BOGUS")

def test_export_with_invalid_scheme(self):
    """
    Assert that export raises ExportError for an unknown export scheme
    """
    subject = MongoExporter(root=self.corpus_dir, categories=CATEGORIES_IN_DB)

    # Callable form of assertRaises: invokes subject.export(scheme="BOGUS").
    self.assertRaises(ExportError, subject.export, scheme="BOGUS")

0 comments on commit fac0bc0

Please sign in to comment.