Fixed the performance regression in LazyFile from #45

The performance regression was caused by implementing the next() or __next__() methods which were being called instead of the built in C implementation from the object file. The fix is to only implement __iter__() to return the file since it is its own iterator. Additionally this is wrapped with a keyword with block and returns lines with yield to make sure that after the iteration the file is closed. This also makes it so that calls __iter__ are completely independent where before they were related. Lastly, LazyFile was renamed to ReusableFile since the name more clearly communicates its purpose.
EntilZha · Nov 3, 2015 · deffa83 · deffa83
1 parent fb8052c
commit deffa83
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 33 deletions.
diff --git a/functional/__init__.py b/functional/__init__.py
@@ -2,6 +2,8 @@
 Package which supplies utilities primarily through functional.seq to do pipeline style,
 functional transformations and reductions.
 """
+from .streams import seq
+
 __author__ = "Pedro Rodriguez"
 __copyright__ = "Copyright 2015, Pedro Rodriguez"
 __license__ = "MIT"
@@ -10,4 +12,3 @@
 __email__ = "ski.rodriguez@gmail.com"
 __status__ = "Development"
 
-from .streams import seq
diff --git a/functional/streams.py b/functional/streams.py
@@ -5,9 +5,8 @@
 import csv as csvapi
 import json as jsonapi
 import six
-
 from .pipeline import Sequence
-from .util import is_primitive, LazyFile
+from .util import is_primitive, ReusableFile
 
 
 def seq(*args):
@@ -76,8 +75,9 @@ def open(path, delimiter=None, mode='r', buffering=-1, encoding=None,
     if not re.match('^[rbt]{1,3}$', mode):
         raise ValueError('mode argument must be only have r, b, and t')
     if delimiter is None:
-        return seq(LazyFile(path, mode=mode, buffering=buffering, encoding=encoding, errors=errors,
-                            newline=newline))
+        return seq(
+            ReusableFile(path, mode=mode, buffering=buffering, encoding=encoding, errors=errors,
+                         newline=newline))
     else:
         with builtins.open(path, mode=mode, buffering=buffering, encoding=encoding, errors=errors,
                            newline=newline) as data:
@@ -114,7 +114,7 @@ def csv(csv_file, dialect='excel', **fmt_params):
     :return: Sequence wrapping csv file
     """
     if isinstance(csv_file, str):
-        input_file = LazyFile(csv_file, mode='r')
+        input_file = ReusableFile(csv_file, mode='r')
     elif hasattr(csv_file, 'next') or hasattr(csv_file, '__next__'):
         input_file = csv_file
     else:
@@ -137,7 +137,7 @@ def jsonl(jsonl_file):
     :return: Sequence wrapping jsonl file
     """
     if isinstance(jsonl_file, str):
-        input_file = LazyFile(jsonl_file)
+        input_file = ReusableFile(jsonl_file)
     else:
         input_file = jsonl_file
     return seq(input_file).map(jsonapi.loads).cache(delete_lineage=True)

diff --git a/functional/test/test_util.py b/functional/test/test_util.py
@@ -1,14 +1,13 @@
 import unittest
-from ..util import LazyFile
+from ..util import ReusableFile
 
 
 class TestUtil(unittest.TestCase):
-    def test_lazy_file(self):
-        license_file = LazyFile('LICENSE.txt')
-        self.assertTrue(license_file.file is None)
-        iter(license_file)
-        handle_0 = license_file.file
-        iter(license_file)
-        handle_1 = license_file.file
-        self.assertTrue(handle_0.closed)
-        self.assertFalse(handle_1.closed)
+    def test_reusable_file(self):
+        license_file_lf = ReusableFile('LICENSE.txt')
+        with open('LICENSE.txt') as license_file:
+            self.assertEqual(list(license_file), list(license_file_lf))
+        self.assertTrue(license_file_lf.file is None)
+        iter_1 = iter(license_file_lf)
+        iter_2 = iter(license_file_lf)
+        self.assertEqual(list(iter_1), list(iter_2))
diff --git a/functional/util.py b/functional/util.py
@@ -55,7 +55,7 @@ def is_iterable(val):
     return isinstance(val, collections.Iterable)
 
 
-class LazyFile(object):
+class ReusableFile(object):
     # pylint: disable=too-few-public-methods,too-many-instance-attributes
     def __init__(self, path, delimiter=None, mode='r', buffering=-1, encoding=None,
                  errors=None, newline=None):
@@ -70,18 +70,11 @@ def __init__(self, path, delimiter=None, mode='r', buffering=-1, encoding=None,
         self.file = None
 
     def __iter__(self):
-        if self.file is not None:
-            self.file.close()
-        self.file = builtins.open(self.path, mode=self.mode, buffering=self.buffering,
-                                  encoding=self.encoding, errors=self.errors, newline=self.newline)
-        return self
-
-    def next(self):
-        try:
-            return next(self.file)
-        except StopIteration:
-            self.file.close()
-            raise StopIteration
-
-    def __next__(self):
-        return self.next()
+        with builtins.open(self.path,
+                           mode=self.mode,
+                           buffering=self.buffering,
+                           encoding=self.encoding,
+                           errors=self.errors,
+                           newline=self.newline) as file_content:
+            for line in file_content:
+                yield line