Iterators and Generators

Manually Consuming an Iterator

In [None]:
# Drive the file iterator by hand: keep calling next() until StopIteration
# signals that there are no more lines.
with open('/etc/passwd') as f:
    while True:
        try:
            line = next(f)
        except StopIteration:
            break
        print(line, end='')

In [None]:
#one can also instruct it to return a terminating value,


# next() with a default returns that default (None here) instead of raising
# StopIteration once the file is exhausted.
with open('/etc/passwd') as f:
    line = next(f, None)
    while line is not None:
        print(line, end='')
        line = next(f, None)

In [3]:
#also 

items = [1,2,4]
it = iter(items)
next(it)

1

In [4]:
next(it)

2

Delegating Iteration

In [6]:
#You have built a custom container object that internally holds a list, tuple, or some other iterable. You would like to make iteration work with your new container.

In [7]:
class Node:
    """Tree node that stores a value and an ordered list of child nodes.

    Iterating over a node yields its direct children, in insertion order.
    """

    def __init__(self, value):
        self._children = []
        self._value = value

    def __repr__(self) -> str:
        return 'Node({})'.format(repr(self._value))

    def add_child(self, node):
        """Append *node* to this node's children."""
        children = self._children
        children.append(node)

    def __iter__(self):
        # Delegate iteration to the underlying list of children.
        return self._children.__iter__()

In [9]:
nn = Node(12)

In [15]:
n1 = Node(13)

In [16]:
nn.add_child(n1)

In [17]:
nn

Node(12)

In [18]:
for i in nn:
    print(i)

12
13
Node(13)


In [61]:
def frange(start, stop, increment=0):
    """Yield start, start + increment, ... while below stop, then stop itself.

    The end point ``stop`` is always emitted as the final value, even when
    the steps do not land on it exactly (``frange(2, 33, 4)`` ends with
    ``30, 33``) and even when ``start >= stop`` (only ``stop`` is yielded).

    Args:
        start: first value of the sequence.
        stop: upper bound; also the last value yielded.
        increment: step added after each yielded value. Must be positive
            whenever ``start < stop``.

    Raises:
        ValueError: on the first ``next()`` call, if ``start < stop`` and
            ``increment`` is zero or negative, because the loop could never
            reach ``stop``.
    """
    # Validate once, up front: a non-positive step would otherwise spin forever.
    if start < stop and increment <= 0:
        raise ValueError("increment must be greater than zero")
    while start < stop:
        yield start
        start += increment
    yield stop

In [48]:
frange(1,10,2)

<generator object frange at 0x7f2ab164aa40>

In [64]:
I = frange(2,33,4)

In [65]:
for i in I:
    print(i)

2
6
10
14
18
22
26
30
33


In [1]:
#Python's iterator protocol requires __iter__() to return a special iterator object that implements a __next__() operation and uses a StopIteration exception to signal completion. However, implementing such objects can often be a messy affair. For example, the following code shows an alternative implementation of the depth_first() method using an associated iterator class:



class Node:
    """Tree node holding a value and an ordered list of child nodes."""

    def __init__(self, value):
        self._value = value
        self._children = []

    def __repr__(self):
        return 'Node({!r})'.format(self._value)

    def add_child(self, other_node):
        """Append other_node to this node's children."""
        self._children.append(other_node)

    def __iter__(self):
        """Iterate over the direct children only."""
        return iter(self._children)

    def depth_first(self):
        """Return an iterator over this node and its descendants, depth first."""
        return DepthFirstIterator(self)


class DepthFirstIterator(object):
    '''Depth first traversal

    Yields the start node first, then the complete depth-first traversal of
    each child in turn.
    '''

    def __init__(self, start_node):
        self._node = start_node
        # Iterator over the start node's direct children; None until the
        # start node itself has been returned.
        self._children_iter = None
        # Depth-first iterator of the child currently being traversed.
        self._child_iter = None

    def __iter__(self):
        return self

    def __next__(self):
        # Return myself if just started; create an iterator for children.
        # This must test _children_iter: _child_iter is also None between
        # children, so testing it would return the start node forever.
        if self._children_iter is None:
            self._children_iter = iter(self._node)
            return self._node

        # If processing a child, return its next item
        elif self._child_iter is not None:
            try:
                return next(self._child_iter)
            except StopIteration:
                # This child's subtree is exhausted; move on to the next one.
                self._child_iter = None
                return next(self)

        # Advance to the next child and start its iteration. When there are
        # no more children, next() raises StopIteration, which ends this
        # iterator as well.
        else:
            self._child_iter = next(self._children_iter).depth_first()
            return next(self)

Iterating in Reverse

In [2]:
# Even numbers from 10 down to 2.
a = list(range(10, 1, -2))

In [3]:
for x in reversed(a):
    print(x)

2
4
6
8
10


In [4]:
class Countdown:
    """Iterable that counts down from ``start``, with reverse iteration support."""

    def __init__(self, start):
        self.start = start

    def __iter__(self):
        """Forward iteration: start, start - 1, ... while the value is above zero."""
        remaining = self.start
        while True:
            if not remaining > 0:
                return
            yield remaining
            remaining -= 1

    def __reversed__(self):
        """Reverse iteration: 1, 2, ... while the value does not exceed start."""
        current = 1
        while True:
            if not current <= self.start:
                return
            yield current
            current += 1

In [5]:
c = Countdown(12)

In [7]:
for i in reversed(c):
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12


Defining Generator Functions with Extra State

In [8]:
#If you want a generator to expose extra state to the user, don't forget that you can easily implement it as a class, putting the generator function code in the __iter__() method.

from collections import deque

class LineHistory:
    """Iterable over ``lines`` that remembers the most recently yielded ones.

    ``history`` is a bounded deque of ``(line number, line)`` pairs holding
    at most ``histlen`` entries; line numbers start at 1.
    """

    def __init__(self, lines, histlen=3) -> None:
        self.history = deque(maxlen=histlen)
        self.lines = lines

    def __iter__(self):
        """Yield each line, recording it in ``history`` just before yielding."""
        for numbered in enumerate(self.lines, 1):
            self.history.append(numbered)
            yield numbered[1]

    def clear(self):
        """Discard every remembered line."""
        self.history.clear()

In [11]:
#To use this class, you would treat it like a normal generator function. However, since it creates an instance, you can access internal attributes or the clear() method. For example:

# Open for reading (the default mode). Mode 'w+' would truncate the file
# before any line could be read, so the loop body would never run.
with open("somefile.txt") as f:
    lines = LineHistory(f)
    for line in lines:
        if 'python' in line:
            # Show the matching line together with the lines just before it.
            for lineno, hline in lines.history:
                print('{}:{}'.format(lineno, hline), end='')

Discussion

In [16]:
#With generators, it is easy to fall into the trap of trying to do everything with functions alone. This can lead to rather complicated code if the generator function needs to interact with other parts of your program in unusual ways (exposing attributes, allowing control via method calls, etc.). If this is the case, just use a class definition, as shown. Defining your generator in the __iter__() method doesn't change anything about how you write your algorithm. The fact that it's part of a class makes it easy for you to provide attributes and methods for users to interact with.


#One potential subtlety with the method shown is that it might require an extra step of calling iter() if you are going to drive iteration using a technique other than a for loop:

f = open('somefile.txt')
lines = LineHistory(f)
next(lines)

TypeError: 'LineHistory' object is not an iterator

In [14]:
f.close()

In [17]:
#call iter() first, then start iterating

it = iter(lines)

next(it)

'hello world\n'

In [18]:
next(it)

'this is a test'

Taking a Slice of an Iterator

In [19]:
#You want to take a slice of data produced by an iterator, but the normal slicing operator doesnt work.

In [20]:
#The itertools.islice() function is perfectly suited for taking slices of iterators and generators. For example:


def count(n):
    """Yield n, n + 1, n + 2, ... without end."""
    current = n
    while True:
        yield current
        current += 1
        

In [21]:
c = count(34)

In [22]:
c[10:20]

TypeError: 'generator' object is not subscriptable

In [None]:
#Now using islice()
import itertools
for x in itertools.islice(c, 10, 20):
    print(x)

Discussion

In [24]:
#With generators, it is easy to fall into the trap of trying to do everything with functions alone. This can lead to rather complicated code if the generator function needs to interact with other parts of your program in unusual ways. If this is the case, just use a class definition, as shown. The fact that it's part of a class makes it easy for you to provide attributes and methods for users to interact with.

Skipping the First Part of an Iterable

In [26]:
#You want to iterate over items in an iterable but the first few items arent of interest and you just want to discard them.

In [None]:
with open('/etc/passwd') as f:
    for line in f:
        print(line, end='')

In [28]:
#if you want to skip all of the initial comment lines, heres one way to do it:

from itertools import dropwhile
with open('/etc/passwd') as f:
    for line in dropwhile(lambda line: line.startswith('#'), f):
        print(line, end='')

root:x:0:0:root:/root:/bin/bash
systemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin
messagebus:x:484:484:User for D-Bus:/run/dbus:/usr/sbin/nologin
polkitd:x:483:483:User for polkitd:/var/lib/polkit:/usr/sbin/nologin
scard:x:482:482:Smart Card Reader:/run/pcscd:/usr/sbin/nologin
tftp:x:481:481:TFTP Account:/srv/tftpboot:/usr/sbin/nologin
srvGeoClue:x:480:480:User for GeoClue D-Bus service:/var/lib/srvGeoClue:/usr/sbin/nologin
pulse:x:478:478:PulseAudio daemon:/var/lib/pulseaudio:/usr/sbin/nologin
nobody:x:65534:65534:nobody:/var/lib/nobody:/bin/bash
man:x:13:62:Manual pages viewer:/var/lib/empty:/usr/sbin/nologin
mail:x:477:477:Mailer daemon:/var/spool/clientmqueue:/usr/sbin/nologin
lp:x:489:489:Printing daemon:/var/spool/lpd:/usr/sbin/nologin
flatpak:x:476:476:Flatpak system helper:/:/usr/sbin/nologin
daemon:x:2:2:Daemon:/sbin:/usr/sbin/nologin
rtkit:x:473:473:RealtimeKit:/var/lib/empty:/usr/sbin/nologin
rpc:x:472:472:User for rpcbind:/var/lib/empty:/usr/sbin/n

In [29]:
from itertools import islice
items = ['a','b','c',1,4,10,15]
for x in islice(items, 3, None):
    print(x)

1
4
10
15


In [30]:
#In this example, the last None argument to islice() is required to indicate that you want everything beyond the first three items, as opposed to only the first three items.



#In this example, the last None argument to islice() is required to indicate that you want everything beyond the first three items, as opposed to only the first three items.


Discussion


In [34]:
#The dropwhile() and islice() functions are mainly convenience functions that you can use to avoid writing rather messy code such as this:

with open('/etc/passwd') as f:
    #skip over initial comments
    while True:
        line = next(f, '')
        if not line.startswith('#'):
            break
    while line:
    #Replace with useful processing

        print(line, end='')
        line = next(f, None)

root:x:0:0:root:/root:/bin/bash
systemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin
messagebus:x:484:484:User for D-Bus:/run/dbus:/usr/sbin/nologin
polkitd:x:483:483:User for polkitd:/var/lib/polkit:/usr/sbin/nologin
scard:x:482:482:Smart Card Reader:/run/pcscd:/usr/sbin/nologin
tftp:x:481:481:TFTP Account:/srv/tftpboot:/usr/sbin/nologin
srvGeoClue:x:480:480:User for GeoClue D-Bus service:/var/lib/srvGeoClue:/usr/sbin/nologin
pulse:x:478:478:PulseAudio daemon:/var/lib/pulseaudio:/usr/sbin/nologin
nobody:x:65534:65534:nobody:/var/lib/nobody:/bin/bash
man:x:13:62:Manual pages viewer:/var/lib/empty:/usr/sbin/nologin
mail:x:477:477:Mailer daemon:/var/spool/clientmqueue:/usr/sbin/nologin
lp:x:489:489:Printing daemon:/var/spool/lpd:/usr/sbin/nologin
flatpak:x:476:476:Flatpak system helper:/:/usr/sbin/nologin
daemon:x:2:2:Daemon:/sbin:/usr/sbin/nologin
rtkit:x:473:473:RealtimeKit:/var/lib/empty:/usr/sbin/nologin
rpc:x:472:472:User for rpcbind:/var/lib/empty:/usr/sbin/n

In [None]:
with open('/etc/passwd') as f:
    lines = (line for line in f if not line.startswith('#'))
    for line in lines:
        print(line, end='')

In [36]:
#This obviously discards the comment lines at the start, but will also discard all such lines throughout the entire file. On the other hand, the dropwhile() solution only discards items until an item no longer satisfies the supplied test. After that, all subsequent items are returned with no filtering.

#Last but not least, it should be emphasized that this recipe works with all iterables, including those whose size can't be determined in advance. This includes generators, files, and similar kinds of objects.

Iterating Over all possible Combinations or Permutations

In [37]:
#You want to iterate over all of the possible combinations or permutations of a collection of items.

In [38]:
#The itertools module provides three functions for this task. The first of these, itertools.permutations(), takes a collection of items and produces a sequence of tuples that rearranges all of the items into all possible permutations. For example:

items = ['a', 'b', 'c']
from itertools import permutations

for p in permutations(items):
    print(p)

('a', 'b', 'c')
('a', 'c', 'b')
('b', 'a', 'c')
('b', 'c', 'a')
('c', 'a', 'b')
('c', 'b', 'a')


In [39]:
#If you want all permutations of a smaller length you can give an optional length argument. For example:

for p in permutations(items, 2):
    print(p)

('a', 'b')
('a', 'c')
('b', 'a')
('b', 'c')
('c', 'a')
('c', 'b')


In [40]:
#Use itertools.combinations() to produce a sequence of combinations of items taken from the input. For example:
from itertools import combinations
for c in combinations(items, 3):
    print(c)
    

('a', 'b', 'c')


In [41]:
for c in combinations(items, 1):
    print(c)

('a',)
('b',)
('c',)


In [43]:
#For combinations() the actual order of the elements is not considered. That is, the combination ('a','b') is considered to be the same as ('b','a')(which is not produced).

#When producing combinations, chosen items are removed from the collection of possible candidates. The itertools.combinations_with_replacement() function relaxes this, and allows the same item to be chosen more than once:


for c in itertools.combinations_with_replacement(items, 3):
    print(c)

('a', 'a', 'a')
('a', 'a', 'b')
('a', 'a', 'c')
('a', 'b', 'b')
('a', 'b', 'c')
('a', 'c', 'c')
('b', 'b', 'b')
('b', 'b', 'c')
('b', 'c', 'c')
('c', 'c', 'c')


Discussion

In [44]:
#This recipe demonstrates only some of the power found in the itertools module. Although you could certainly write code to produce permutations and combinations yourself, doing so would probably require more than a fair bit of thought. When faced with seemingly complicated iteration problems, it always pays to look at itertools first. If the problem is common, chances are a solution is already available.

Iterating Over the index value pairs of a sequence

In [45]:
#You want to iterate over a sequence but would like to keep track of which element of the sequence is currently being processed.

my_list = ['a', 'b', 'c']

for idx, val in enumerate(my_list):
    print(idx, val)

0 a
1 b
2 c


In [46]:
#For printing output with canonical line numbers you can pass in a start argument:

my_list = ['a', 'b','c']
for idx, val in enumerate(my_list, 1):
    print(idx, val)

1 a
2 b
3 c


In [47]:
def parse_data(filename):
    """Check that the second whitespace-separated field of every line is an integer.

    Each line that fails the check is reported on stdout together with its
    1-based line number; processing then continues with the next line.

    Args:
        filename: path of the text file to read.

    Returns:
        None. Problems are reported by printing, not by raising.
    """
    with open(filename, 'rt') as f:
        for lineno, line in enumerate(f, 1):
            fields = line.split()
            try:
                # Validation only: the converted value is not used further here.
                int(fields[1])
            except (ValueError, IndexError) as e:
                # IndexError covers blank lines and lines with a single field,
                # which would otherwise abort the whole parse.
                print('Line {}: Parse error: {}'.format(lineno, e))

In [50]:
from collections import defaultdict
# Maps each lower-cased word to the list of line indexes (0-based) on which
# it appears; a word occurring twice on a line contributes that index twice.
word_summary = defaultdict(list)

# Open for reading (the default mode). Mode 'w+' would truncate the file
# first, leaving nothing to read.
with open('myfile.txt') as f:
    lines = f.readlines()

for idx, line in enumerate(lines):
    #create a list of words in current line
    words = [w.strip().lower() for w in line.split()]
    for word in words:
        word_summary[word].append(idx)

In [51]:
#The value for each word key will be a list of line numbers that word occurred on. If the word occurred twice on a single line, that line number will be listed twice, making it possible to identify various simple metrics about the text.

In [53]:
f = open('myfile.txt','r')

In [54]:
#enumerate() is a nice shortcut for situations where you might be inclined to keep your own counter variable. You could write code like this:


lineno = 1
for line in f:
    #process line
    ...
    lineno +=1

In [55]:
# Start counting at 1 so lineno matches a hand-maintained counter that is
# initialised to 1 and incremented once per line.
for lineno, line in enumerate(f, 1):
    #process line
    ...


#The value returned by enumerate() is an instance of an enumerate object which is an iterator that returns successive tuples consisting of a counter and the value returned by calling next() on the sequence you have passed in.

#Although a minor point, it's worth mentioning that sometimes it is easy to get tripped up when applying enumerate() to a sequence of tuples that are also being unpacked. To do it, you have to write code like this:


data = [(1, 2), (3, 4), (5, 6), (7, 8)]

#Correct: enumerate() yields (index, item) pairs, so the item tuple needs its own parentheses
for n, (x, y) in enumerate(data):
    ...

#Error: three bare targets cannot unpack a two-element (index, item) pair
#for n, x, y in enumerate(data):
#    ...

1

Iterating Over Multiple Sequences Simultaneously

In [1]:
xpts = [1,5,4,2,10,7]
ypts = [101, 78, 37, 15, 62, 99]

for x,y in zip(xpts, ypts):
    print(x,y)

1 101
5 78
4 37
2 15
10 62
7 99


In [2]:
a = [1,2,3]
b = ['w','x','y','z']

In [3]:

for i in zip(a,b):
    print(i)

(1, 'w')
(2, 'x')
(3, 'y')


In [4]:
#If this behavior is not desired, use itertools.zip_longest() instead

from itertools import zip_longest
for i in zip_longest(a,b):
    print(i)

(1, 'w')
(2, 'x')
(3, 'y')
(None, 'z')


In [5]:
for i in zip_longest(a, b, fillvalue=0):
    print(i)

(1, 'w')
(2, 'x')
(3, 'y')
(0, 'z')


In [1]:
import os

In [3]:
os.walk()

TypeError: walk() missing 1 required positional argument: 'top'

Flattening a Nested Sequence

In [20]:
#You have a nested sequence that you want to flatten into a single list of values.
 #from itertools import 
items = [1, 2, [3, 4, [5, 6], 7], 8]

def flatten(items):
    """Yield the non-list elements of an arbitrarily nested list, left to right.

    Args:
        items: an iterable whose elements are either values or (nested) lists.

    Yields:
        Every element that is not a list, in depth-first order.
    """
    for a in items:
        if isinstance(a, list):
            # Delegate to the nested generator. A plain `return flatten(a)`
            # would end this generator at the first nested list without
            # yielding any of its contents.
            yield from flatten(a)
        else:
            yield a


In [34]:
from collections.abc import Iterable

def flatten(items, ignore_types=(str, bytes)):
    """Yield the leaf values of an arbitrarily nested iterable, left to right.

    Args:
        items: the iterable to flatten.
        ignore_types: type or tuple of types that are iterable but must be
            yielded whole rather than descended into. Strings and bytes are
            excluded by default so they are not split into characters.

    Yields:
        Every element that is not an Iterable, or that is an instance of
        ignore_types, in depth-first order.
    """
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, ignore_types):
            # Pass ignore_types down so a caller's custom setting also
            # applies to nested levels, not just the top one.
            yield from flatten(x, ignore_types)
        else:
            yield x

In [35]:
for x in flatten(items):
    print(x)

1
2
3
4
5
6
7
8


Replacing Infinite while loops with an Iterator