# Ch 1 Data Structures and Algorithms
## Unpacking a sequence into separate variables

In [1]:
data = ['ACME', 50, 91.1, (2012,10,1)]

In [2]:
_, shares, price, _=data
print(shares, price)

50 91.1


## Unpacking elements from iterables of arbitrary length
using "star expression"

In [3]:
record = ('Dave', 'Dave@example.com', '334', '443')
name, email, *etc = record
print(name, email, etc)
# etc will always be a list

Dave Dave@example.com ['334', '443']


In [5]:
*trailing, current = [10, 8, 7, 1, 9, 5]
print(trailing, current)

[10, 8, 7, 1, 9] 5


It is worth noting that the star syntax can be especially useful when unpacking a sequence of tuples of varying length. For example:

In [6]:
records = [
    ('foo', 1, 2),
    ('bar', 'Hello'),
    ('foo', 3, 4)
]

def do_foo(x, y):
    """Handle a 'foo' record: print the tag followed by its two fields."""
    fields = ('foo', x, y)
    print(*fields)
def do_bar(s):
    """Handle a 'bar' record: print the tag followed by its single field."""
    fields = ('bar', s)
    print(*fields)
    
# Split each record into its leading tag and a list of the remaining fields.
for tag, *args in records:
    # Re-expand the captured fields as positional arguments for the handler
    # that matches the tag; records with any other tag are ignored.
    if tag == 'foo':
        do_foo(*args)
    elif tag == 'bar':
        do_bar(*args)

foo 1 2
bar Hello
foo 3 4


Sometimes you might want to unpack values and throw them away. You can’t just specify
a bare * when unpacking, but you could use a common throwaway variable name, such
as _ or ign (ignored). For example:

In [7]:
record = ('ACME', 50, 123.45, (12, 18, 2012))
# The first *_ soaks up the middle fields; the nested pattern unpacks the
# trailing date tuple, where a second *_ discards everything but the year.
name, *_, (*_, year) = record
print(name, year)

ACME 2012


## Keeping the last N items

Using deque(maxlen=N) creates a fixed-size queue. When new items are added and
the queue is full, the oldest item is automatically removed. If you don’t give it a maximum size, you get an unbounded queue that lets you append and pop items on either end.

In [9]:
from collections import deque
q = deque(maxlen = 2)
q.append(1)
q.append(2)
q

deque([1, 2])

In [10]:
q.append(4)
q

deque([2, 4])

In [14]:
q = deque()
q.append(3)
q.append(4)
q

deque([3, 4])

In [15]:
q.appendleft(1)
q.pop()
q

deque([1, 3])

When writing code to search for items, it is common to use a generator function
involving `yield`, as shown in this recipe’s solution. This decouples the process of searching
from the code that uses the results.

In [24]:
from collections import deque

def search(lines, pattern, history=5):
    """Yield each line containing ``pattern`` along with the lines before it.

    Args:
        lines: Iterable of strings (for example an open text file).
        pattern: Substring to look for; matched with the ``in`` operator.
        history: Maximum number of preceding lines to remember.

    Yields:
        ``(line, previous_lines)`` tuples, where ``previous_lines`` is a
        deque holding up to ``history`` lines seen before ``line``,
        oldest first.
    """
    previous_lines = deque(maxlen=history)
    for line in lines:
        if pattern in line:
            # Yield a snapshot, not the live buffer: the buffer keeps changing
            # as the scan continues, which would silently rewrite the history
            # of any result the caller has stored (e.g. list(search(...))).
            yield line, previous_lines.copy()
        previous_lines.append(line)
    
# Example use on file
if __name__ == '__main__':
    with open('eg.txt') as f:
        for line, prevlines in search(f, 'python', 2):
            # Lines read from the file keep their own newline, so nothing
            # extra is added between or after them.
            print(*prevlines, sep='', end='')
            print(line, end='')
            print('-'*20)

akdjal
ajkpythonldjf
--------------------
ajkpythonldjf
djfd
skpythonamf
--------------------
skpythonamf
afmkx
mxkafpythonjda
--------------------
mxm
mdkf
123python
--------------------
mdkf
123python
4845python
--------------------
4845python
jsalfd
48python34
--------------------


## Finding the largest or smallest N items

heapq module: heap queue, a module that implements the heap (priority queue) algorithm

In [25]:
import heapq

nums = [1, 8, 2, 23, -7, 4, 18, 42,37]
print(heapq.nlargest(3, nums))
print(heapq.nsmallest(3, nums))

[42, 37, 23]
[-7, 1, 2]


In [27]:
# mapping keys to multiple values in dict
from collections import defaultdict 
# defaultdict automatically initializes the first value so
# you can simply focus on adding items

d = defaultdict(list)
d['a'].append(1)
d['a'].append(2)
d['b'].append(4)
d

defaultdict(list, {'a': [1, 2], 'b': [4]})

In [5]:
# caution: defaultdict will automatically create dictionary entries of keys accessed later on even if they do not exist in the dictionary
# to avoid this, we can use setdefault() method on regular dictionaries
d = {} # a regular dictionary
d.setdefault('a', []).append(1)
d.setdefault('a', []).append(2)
# No default is passed here, so 'b' is stored with the value None;
# use d.setdefault('b', []) to start 'b' off as an empty list instead.
d.setdefault('b', )
print(d)

{'a': [1, 2], 'b': None}


In [35]:
# Calculation with dictionaries
# Maps a ticker symbol to its price.
prices = {
    'acme': 45.23,  # was misspelled 'acem'
    'aapl': 612.78,
    'ibm': 205.55,
    'hpq': 37.20,
    'fb': 10.75,
}
# Common data reductions only process the keys, eg:
# (min/max below therefore compare the ticker strings, not the prices)
print(min(prices), max(prices))

aapl ibm


In [37]:
# We can fix this using the values() method of a dictionary
print(min(prices.values()), max(prices.values()))

10.75 612.78


In [39]:
# zip() can 'invert' the dictionary into a sequence of (value, key) pairs. When performing comparisons on such tuples, the
# value element is compared first, followed by the key
# zip() returns a one-shot iterator, so a fresh one is built for each reduction
min_price = min(zip(prices.values(), prices.keys()))
max_price = max(zip(prices.values(), prices.keys()))
print(min_price, max_price)

(10.75, 'fb') (612.78, 'aapl')


In [42]:
# Commonalities between dictionaries
a ={
    'x':1,
    'y':2,
    'z':3
}

b = {
    'w':10,
    'x':11,
    'y':2
}
# finding keys in common
print(a.keys()&b.keys())

# finding keys in a that are not in b
print(a.keys() - b.keys())

#finding (key, value) pairs in common with b
print(a.items() & b.items())

{'y', 'x'}
{'z'}
{('y', 2)}


In [46]:
# Eliminate duplicates from a list
a = [1, 4, 2, 1, 9, 1, 4, 10]
# a simple way is to make a set
set(a)

{1, 2, 4, 9, 10}

In [54]:
# however, this approach does not maintain the order of elements
# in this case, we can use a generator function to do this
def dedup(items, key=None):
    """Yield the items of ``items`` in their original order, skipping repeats.

    Args:
        items: Iterable to de-duplicate.
        key: Optional callable that maps an item to the hashable value used
            to decide whether it has been seen before (useful for unhashable
            items such as dicts). A mapping is also accepted and is looked up
            with ``key[item]``. When omitted, the items themselves are
            compared and therefore must be hashable.

    Yields:
        The first occurrence of every distinct item.
    """
    uq = set()
    for item in items:
        if key is None:
            val = item
        elif callable(key):
            # A key function must be called; subscripting it raises TypeError.
            val = key(item)
        else:
            # Backward compatibility: a mapping passed as ``key`` is still
            # resolved by lookup.
            val = key[item]
        if val not in uq:
            yield item
            uq.add(val)
a = [1, 4, 2, 1, 9, 1, 4, 10]

In [55]:
b = dedup(a)
next(b)

1

In [57]:
next(b)

4

In [58]:
list(dedup(a))

[1, 4, 2, 9, 10]

In [62]:
# naming the slice
a = slice(10, 50, 2)
s = 'HelloWorld'
# map a slice onto a sequence of a specific size by using its indices(size) method
# indices() clips start and stop to the given length, so for this 10-character
# string both bounds become 10 and the resulting slice selects nothing
a.indices(len(s))

(10, 10, 2)

## Sorting a list of dictionaries by a common key

In [1]:
rows = [
    {'fname': 'Brian', 'Iname':'Jones', 'uid':1003},
    {'fname': 'David', 'Iname':'Beazley', 'uid':1002},
    {'fname': 'John', 'Iname':'Cleese', 'uid':1001},
    {'fname': 'Big', 'Iname':'Jones', 'uid':1004}
]
from operator import itemgetter

rows_by_fname = sorted(rows, key=itemgetter('fname'))
rows_by_uid = sorted(rows,key= itemgetter('uid'))
# the operator.itemgetter function takes as arguments the lookup indices used to extract the desired values from the records in rows.
# we can also use a lambda function as the key, but itemgetter typically runs a bit faster
# eg. rows_by_fname = sorted(rows, key=lambda r: r['fname'])

print(rows_by_fname)
print(rows_by_uid)

[{'fname': 'Big', 'Iname': 'Jones', 'uid': 1004}, {'fname': 'Brian', 'Iname': 'Jones', 'uid': 1003}, {'fname': 'David', 'Iname': 'Beazley', 'uid': 1002}, {'fname': 'John', 'Iname': 'Cleese', 'uid': 1001}]
[{'fname': 'John', 'Iname': 'Cleese', 'uid': 1001}, {'fname': 'David', 'Iname': 'Beazley', 'uid': 1002}, {'fname': 'Brian', 'Iname': 'Jones', 'uid': 1003}, {'fname': 'Big', 'Iname': 'Jones', 'uid': 1004}]


## Grouping records together based on a field

In [3]:
rows = [
    {'address': '5412 N CLARK', 'date':'07/01/2012'},
    {'address': '5148 N CLARK', 'date':'07/04/2012'},
    {'address': '5800 E 58EH', 'date':'07/02/2012'},
    {'address': '2122 N CLARK', 'date':'07/03/2012'},
    {'address': '5645 N RAVENSWOOD', 'date':'07/02/2012'},
    {'address': '1060 W ADDISON', 'date':'07/02/2012'},
    {'address': '4801 N BROADWAY', 'date':'07/01/2012'},
    {'address': '1039 W GRANVILLE', 'date':'07/04/2012'},
]
# first sort by the desired field (in this case, date), then use itertools.groupby()
from operator import itemgetter
from itertools import groupby

rows.sort(key=itemgetter('date'))
for date, items in groupby(rows, key=itemgetter('date')):
    print(date)
    for i in items:
        print('   ', i)

# note: groupby() works by scanning a sequence and finding sequential “runs”
# of identical values (or values returned by the given key function). 

07/01/2012
    {'address': '5412 N CLARK', 'date': '07/01/2012'}
    {'address': '4801 N BROADWAY', 'date': '07/01/2012'}
07/02/2012
    {'address': '5800 E 58EH', 'date': '07/02/2012'}
    {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'}
    {'address': '1060 W ADDISON', 'date': '07/02/2012'}
07/03/2012
    {'address': '2122 N CLARK', 'date': '07/03/2012'}
07/04/2012
    {'address': '5148 N CLARK', 'date': '07/04/2012'}
    {'address': '1039 W GRANVILLE', 'date': '07/04/2012'}


In [9]:
# if dealing with a large data structure, it's faster to use defaultdict() to build
# a multidict, eg:
from collections import defaultdict
rows_by_date = defaultdict(list)
for row in rows:
    rows_by_date[row['date']].append(row)

for r in rows_by_date['07/01/2012']:
    print(r)

{'address': '5412 N CLARK', 'date': '07/01/2012'}
{'address': '4801 N BROADWAY', 'date': '07/01/2012'}


## Filtering sequence elements

In [10]:
# easiest way is to use list comprehension
mylist = [1, 4, -5, 10, -7, 2, 3, -1]

[n for n in mylist if n > 0]

[1, 4, 10, 2, 3]

In [11]:
# if the data is large, you may want to use a generator
pos = (n for n in mylist if n > 0)
pos

<generator object <genexpr> at 0x000002AE513B8DB0>

In [12]:
for x in pos:
    print(x)

1
4
10
2
3


In [14]:
# If the above is not enough, you can use the built-in filter() function

values = ['1', '2', '-3', '-', '4', 'N/A', '5']

def is_int(val):
    """Report whether ``val`` can be converted with ``int()``.

    Returns True when ``int(val)`` succeeds and False when it raises
    ValueError. Any other exception (for example TypeError for None)
    propagates to the caller.
    """
    try:
        int(val)
    except ValueError:
        return False
    return True

isvals = list(filter(is_int, values))
print(isvals)

['1', '2', '-3', '4', '5']


In [17]:
# Another notable filtering tool is itertools.compress()
# in this case, you need to create a Boolean sequence to select the desired elements
addresses = [
    '5412 N CLARK',
    '5148 N CLARK', 
    '5800 E 58EH', 
    '2122 N CLARK', 
    '5645 N RAVENSWOOD',
    '1060 W ADDISON', 
    '4801 N BROADWAY', 
    '1039 W GRANVILLE'
]
counts = [0,3,10,4,1,7,6,1]

from itertools import compress
more5 = [n > 4 for n in counts]
print(more5)
print(list(compress(addresses, more5)))

# like filter(), compress() also returns an iterator.

[False, False, True, False, False, True, True, False]
['5800 E 58EH', '1060 W ADDISON', '4801 N BROADWAY']


## Extracting a subset of a dictionary

In [19]:
# This is easily accomplished using a dictionary comprehension
prices = {
    'acme':45.23,
    'aapl':612.78,
    'IBM':205.55,
    'HPQ':37.20,
    'FB':10.75
}

p1 = {key:value for key, value in prices.items() if value > 40}
print(p1)

{'acme': 45.23, 'aapl': 612.78, 'IBM': 205.55}


In [20]:
p2 = {}
for key, value in prices.items():
    if value > 40:
        p2[key] = value

## Mapping names to sequence elements

In [32]:
# access elements by name, using collections.namedtuple()
from collections import namedtuple
Subscriber = namedtuple('Subs', ['addr', 'joined'])
sub = Subscriber('jonesy@example.com', '2012-10-19')
print(sub, sub.addr, sub.joined, sep='\n')

Subs(addr='jonesy@example.com', joined='2012-10-19')
jonesy@example.com
2012-10-19


In [33]:
# one possible use of a namedtuple is as a replacement for a dictionary, which requires more space to store
# but remember, namedtuple is immutable, eg:
sub.joined = '2013-10-19'

AttributeError: can't set attribute

In [34]:
# Solution: use _replace method of a namedtuple instance
sub=sub._replace(joined='2013-10-19')
print(sub)

Subs(addr='jonesy@example.com', joined='2013-10-19')


# Strings and Text
## splitting strings on any of multiple delimiters

In [35]:
line = 'asdf fjdk; afed, sadkj,jk,   foo'
import re
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'sadkj', 'jk', 'foo']

In [38]:
# if a capturing group (parentheses) is used in the regular expression,
# then the matched text is also included in the result
re.split(r'(;|,|\s)\s*', line)

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'sadkj', ',', 'jk', ',', 'foo']

## matching text at the start or end of a string

In [1]:
# a simple way: str.startswith or str.endswith methods
filename = 'spam.txt'
filename.endswith('.txt')

True

In [3]:
import os
filenames = os.listdir('.')
filenames

['.ipynb_checkpoints',
 'Learn Python.ipynb',
 'Learning how to use Jupyter.ipynb',
 'loan prediction problem',
 'PivotTable',
 'printName.py',
 'pydata-code',
 'Python Challenge.ipynb',
 'Python Cookbook.ipynb',
 'Python for data analysis.ipynb']

In [5]:
[name for name in filenames if name.endswith(('.py', '.ipynb'))]

['Learn Python.ipynb',
 'Learning how to use Jupyter.ipynb',
 'printName.py',
 'Python Challenge.ipynb',
 'Python Cookbook.ipynb',
 'Python for data analysis.ipynb']

In [7]:
# note: here, the first argument in endswith is a tuple
choices = ['http', 'ftp']
url = 'http://www.python.org'
url.startswith(choices)

TypeError: startswith first arg must be str or a tuple of str, not list

In [9]:
url.startswith(tuple(choices))

True

## matching strings using shell wildcard patterns