# Python Data Science Toolbox - Part 2

## Iterators

An iterable is an object that can return an iterator, while an iterator is an object that keeps state and produces the next value when you call next() on it.

In [None]:
# a list is an iterable object: iter() can be called on it to obtain an iterator
flash = ["jay garrick", "barry allen", "wally west", "bart allen"]

# looping over the list directly prints every element
for element in flash:
    print(element)

# create an iterator object for flash explicitly: superhero
superhero = iter(flash)

# draw the four values out of the superhero iterator, one next() call per value
for _draw in range(4):
    print(next(superhero))

In [None]:
# range(3) is an iterable object of type range; iter() returns an iterator over it: small_value
small_value_range = range(3)
small_value = iter(range(3))
print(type(small_value))
print(type(small_value_range))

# Draw the three values out of small_value with next()
for _draw in range(3):
    print(next(small_value))

# A for loop over range(3) prints the same values without explicit next() calls
for i in range(3):
    print(i)

# Create an iterator for range(10 ** 100): googol
googol = iter(range(10 ** 100))

# Print the first 5 values from googol
for _draw in range(5):
    print(next(googol))

In [None]:
# values is a range object covering 10 through 20
values = range(10, 21)
print(type(values))
print(values)

# unpack the range object into a list object: values_list
values_list = [*values]
print(type(values_list))
print(values_list)

# sum() accepts a range object as well as a list object
values_sum = sum(values)
values_list_sum = sum(values_list)

print(values_sum, values_list_sum, sep="\n")

In [None]:
# enumerate() returns an enumerate object that yields (index, value) tuples
mutants = [
    "charles xavier",
    "bobby drake",
    "kurt wagner",
    "max eisenhardt",
    "kitty pryde",
]

# materialise the index-value pairs as a list of tuples: mutant_list
mutant_list = list(enumerate(mutants))
print(mutant_list, "\n")

# unpack each tuple into its index and value; counting starts at 0 by default
for index1, value1 in enumerate(mutants):
    print(index1, value1)

print("\n")

# the second argument of enumerate() is the start index, so counting begins at 1 here
for index2, value2 in enumerate(mutants, 1):
    print(index2, value2)

In [None]:
mutants = ['charles xavier', 'bobby drake', 'kurt wagner', 'max eisenhardt', 'kitty pryde']
aliases = ["prof x", "iceman", "nightcrawler", "magneto", "shadowcat"]
powers = ["telepathy", "thermokinesis", "teleportation", "magnetokinesis", "intangibility"]

# create a zip object
mutants_zip = zip(mutants, aliases, powers)
print(mutants_zip, "\n")
print(type(mutants_zip), "\n")

# convert a zip object into a list of tuples
# list() consumes the zip object, so mutants_zip is exhausted after this line
mutants_zip_list = list(mutants_zip)
print(mutants_zip_list, "\n")

# a zip object can only be iterated once; re-create it before looping,
# otherwise the loop below would run zero times and print nothing
mutants_zip = zip(mutants, aliases, powers)

# unpack the zip object and print the tuple values
for value1, value2, value3 in mutants_zip:
    print(value1, value2, value3)               # notice that you are iterating over a zip object

In [None]:
mutants = ("charles xavier", "bobby drake", "kurt wagner", "max eisenhardt", "kitty pryde")
powers = ("telepathy", "thermokinesis", "teleportation", "magnetokinesis", "intangibility")

z1 = zip(mutants, powers)

# print the tuples in z1 by unpacking with *
# unpacking consumes the zip object, so z1 is exhausted after this call (this is a tricky concept)
print(*z1)
print("\n")

# a zip object can only be iterated once: after it has been unpacked with * it is empty
# to use the zip object with its original content you thus have to re-create the zip object
z1 = zip(mutants, powers)

# list() consumes the zip object in the same way
print(list(z1), "\n")         # list of tuples is printed
print(list(z1), "\n")         # prints [] because the previous line exhausted z1

z1 = zip(mutants, powers)

# "unzip" the tuples in z1 by unpacking with * and passing them back to zip()
result1, result2 = zip(*z1)
print(result1, "\n")
print(type(result1), "\n")
print(result2, "\n")
print(type(result2), "\n")

print(list(z1))              # prints [] because zip(*z1) above exhausted z1

# Check if unpacked tuples are equal to the original tuples
print(result1 == mutants)
print(result2 == powers)

In [None]:
import pandas as pd
# running tally of how often each value of the "lang" column occurs: counts_dict
counts_dict = {}

# read the file 10 rows at a time instead of loading it in one call
for chunk in pd.read_csv("./data/tweets.csv", chunksize=10):
    for entry in chunk["lang"]:                 # chunk is a pandas dataframe
        # dict.get returns 0 for a value not seen before, so no membership test is needed
        counts_dict[entry] = counts_dict.get(entry, 0) + 1

print(counts_dict)

In [None]:
def count_entries(csv_file, c_size, colname):
    """Return a dictionary with counts of occurrences as value for each key.

    The CSV file is read with pandas in chunks of c_size rows, and the
    occurrences of every value in column colname are tallied across all chunks.

    Args:
        csv_file: the CSV file to read, passed straight to pd.read_csv.
        c_size: number of rows per chunk, passed to pd.read_csv as chunksize.
        colname: name of the column whose values are counted.

    Returns:
        dict: each distinct value found in the column mapped to the number of
        rows in which it appears.
    """
    counts_dict = {}

    for chunk in pd.read_csv(csv_file, chunksize=c_size):
        for entry in chunk[colname]:
            # dict.get returns 0 for a value not seen before, so no membership test is needed
            counts_dict[entry] = counts_dict.get(entry, 0) + 1

    return counts_dict

# count the values in the "lang" column of the tweets file, reading 10 rows per chunk
result_counts = count_entries("./data/tweets.csv", 10, "lang")
print(result_counts)

### List Comprehensions

List comprehensions collapse `for` loops that build lists into a single line  
The syntax of a list comprehension requires: 1) an iterable, 2) an iterator variable, and 3) an output expression  
You can build a list comprehension over any iterable object - i.e. list comprehensions are NOT limited to lists

In [1]:
# iterable object: range(10)
# iterator variable: n
# output expression: n * n
# the list built by the comprehension is assigned to the list object: squares
squares = [n * n for n in range(10)]
print(squares)
print(type(squares))

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
<class 'list'>


In [3]:
# Create a 5 x 5 matrix using a list of lists: matrix
# the inner comprehension builds one row; the outer comprehension builds five such rows
matrix = [[value for value in range(5)] for _ in range(5)]

print(matrix)

# print the matrix again, one row per line
for row in matrix:
    print(row)

[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


In [6]:
fellowship = ["frodo", "samwise", "merry", "aragorn", "legolas", "boromir", "gimli"]

# conditional on the output expression: names shorter than 7 characters become ""
new_fellowship = ["" if len(name) < 7 else name for name in fellowship]
print(new_fellowship)

['', 'samwise', '', 'aragorn', 'legolas', 'boromir', '']


In [7]:
# the comprehension syntax can also build a dictionary (dict comprehension)
fellowship = ["frodo", "samwise", "merry", "aragorn", "legolas", "boromir", "gimli"]

# dict comprehension: each name is a key and its length is the value
new_fellowship = {name: len(name) for name in fellowship}

print(new_fellowship)

{'frodo': 5, 'samwise': 7, 'merry': 5, 'aragorn': 7, 'legolas': 7, 'boromir': 7, 'gimli': 5}


In [8]:
# create generator object: result
result = (n for n in range(31))

# a generator object is also an iterator object, so next() draws its first 5 values
for _draw in range(5):
    print(next(result))

# the for loop continues from where next() stopped and prints the remaining values
for value in result:
    print(value)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [9]:
lannister = ["cersei", "jaime", "tywin", "tyrion", "joffrey"]

# generator expression producing the length of each name: lengths
lengths = (len(name) for name in lannister)

# iterating over the generator object prints one length per line
for value in lengths:
    print(value)

6
5
5
6
7


In [12]:
lannister = ["cersei", "jaime", "tywin", "tyrion", "joffrey"]


# define generator function get_lengths
def get_lengths(input_list):
    """Yield the length of each string in input_list, one value at a time."""
    for name in input_list:
        yield len(name)


# calling the generator function returns a generator object: y
y = get_lengths(lannister)
print(y)
print(type(y))

# iterate over the elements of a second generator object created from the same list
for value in get_lengths(lannister):
    print(value)

<generator object get_lengths at 0x000001872E8A49C8>
<class 'generator'>
6
5
5
6
7


In [13]:
import pandas as pd
# load the tweets file into a dataframe: df
df = pd.read_csv("./data/tweets.csv")

# column holding the creation timestamp of each tweet
tweet_time = df["created_at"]

# slice characters 11 to 18 out of every timestamp; the output shows these are the clock time
tweet_clock_time = [stamp[11:19] for stamp in tweet_time]

print(tweet_clock_time)

['23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:19', '23:40:18', '23:40:18', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23

In [14]:
import pandas as pd
# load the tweets file into a dataframe: df
df = pd.read_csv("./data/tweets.csv")

# column holding the creation timestamp of each tweet
tweet_time = df["created_at"]

# conditional on the iterable: keep only timestamps whose characters 17 to 18 are "19"
tweet_clock_time = [stamp[11:19] for stamp in tweet_time if stamp[17:19] == "19"]

print(tweet_clock_time)

['23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19']
