Skip to content

Commit

Permalink
Fixed the performance regression in LazyFile from #45
Browse files Browse the repository at this point in the history
The performance regression was caused by implementing the next() or
__next__() methods which were being called instead of the built in C
implementation from the object file. The fix is to only implement
__iter__() to return the file since it is its own iterator.
Additionally this is wrapped with a keyword with block and returns
lines with yield to make sure that after the iteration the file is
closed. This also makes it so that calls __iter__ are completely
independent where before they were related.

Lastly, LazyFile was renamed to ReusableFile since the name more clearly
communicates its purpose.
  • Loading branch information
EntilZha committed Nov 3, 2015
1 parent fb8052c commit deffa83
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 33 deletions.
3 changes: 2 additions & 1 deletion functional/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Package which supplies utilities primarily through functional.seq to do pipeline style,
functional transformations and reductions.
"""
from .streams import seq

__author__ = "Pedro Rodriguez"
__copyright__ = "Copyright 2015, Pedro Rodriguez"
__license__ = "MIT"
Expand All @@ -10,4 +12,3 @@
__email__ = "ski.rodriguez@gmail.com"
__status__ = "Development"

from .streams import seq
12 changes: 6 additions & 6 deletions functional/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import csv as csvapi
import json as jsonapi
import six

from .pipeline import Sequence
from .util import is_primitive, LazyFile
from .util import is_primitive, ReusableFile


def seq(*args):
Expand Down Expand Up @@ -76,8 +75,9 @@ def open(path, delimiter=None, mode='r', buffering=-1, encoding=None,
if not re.match('^[rbt]{1,3}$', mode):
raise ValueError('mode argument must be only have r, b, and t')
if delimiter is None:
return seq(LazyFile(path, mode=mode, buffering=buffering, encoding=encoding, errors=errors,
newline=newline))
return seq(
ReusableFile(path, mode=mode, buffering=buffering, encoding=encoding, errors=errors,
newline=newline))
else:
with builtins.open(path, mode=mode, buffering=buffering, encoding=encoding, errors=errors,
newline=newline) as data:
Expand Down Expand Up @@ -114,7 +114,7 @@ def csv(csv_file, dialect='excel', **fmt_params):
:return: Sequence wrapping csv file
"""
if isinstance(csv_file, str):
input_file = LazyFile(csv_file, mode='r')
input_file = ReusableFile(csv_file, mode='r')
elif hasattr(csv_file, 'next') or hasattr(csv_file, '__next__'):
input_file = csv_file
else:
Expand All @@ -137,7 +137,7 @@ def jsonl(jsonl_file):
:return: Sequence wrapping jsonl file
"""
if isinstance(jsonl_file, str):
input_file = LazyFile(jsonl_file)
input_file = ReusableFile(jsonl_file)
else:
input_file = jsonl_file
return seq(input_file).map(jsonapi.loads).cache(delete_lineage=True)
Expand Down
19 changes: 9 additions & 10 deletions functional/test/test_util.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import unittest
from ..util import LazyFile
from ..util import ReusableFile


class TestUtil(unittest.TestCase):
def test_lazy_file(self):
license_file = LazyFile('LICENSE.txt')
self.assertTrue(license_file.file is None)
iter(license_file)
handle_0 = license_file.file
iter(license_file)
handle_1 = license_file.file
self.assertTrue(handle_0.closed)
self.assertFalse(handle_1.closed)
def test_reusable_file(self):
license_file_lf = ReusableFile('LICENSE.txt')
with open('LICENSE.txt') as license_file:
self.assertEqual(list(license_file), list(license_file_lf))
self.assertTrue(license_file_lf.file is None)
iter_1 = iter(license_file_lf)
iter_2 = iter(license_file_lf)
self.assertEqual(list(iter_1), list(iter_2))
25 changes: 9 additions & 16 deletions functional/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def is_iterable(val):
return isinstance(val, collections.Iterable)


class LazyFile(object):
class ReusableFile(object):
# pylint: disable=too-few-public-methods,too-many-instance-attributes
def __init__(self, path, delimiter=None, mode='r', buffering=-1, encoding=None,
errors=None, newline=None):
Expand All @@ -70,18 +70,11 @@ def __init__(self, path, delimiter=None, mode='r', buffering=-1, encoding=None,
self.file = None

def __iter__(self):
if self.file is not None:
self.file.close()
self.file = builtins.open(self.path, mode=self.mode, buffering=self.buffering,
encoding=self.encoding, errors=self.errors, newline=self.newline)
return self

def next(self):
try:
return next(self.file)
except StopIteration:
self.file.close()
raise StopIteration

def __next__(self):
return self.next()
with builtins.open(self.path,
mode=self.mode,
buffering=self.buffering,
encoding=self.encoding,
errors=self.errors,
newline=self.newline) as file_content:
for line in file_content:
yield line

0 comments on commit deffa83

Please sign in to comment.