In [1]:
# 自定义排序
# 使用 operator.itemgetter 函数

In [2]:
from operator import itemgetter

In [3]:
# Sample records: one dict per person with fname / lname / uid keys.
rows = [
    dict(fname=fname, lname=lname, uid=uid)
    for fname, lname, uid in (
        ('Brian', 'Jones', 1003),
        ('David', 'Beazley', 1002),
        ('John', 'Cleese', 1001),
        ('Big', 'Jones', 1004),
    )
]

In [4]:
# Sort the records by first name; itemgetter('fname') builds the key function.
fname_key = itemgetter('fname')
rows_by_fname = sorted(rows, key=fname_key)
rows_by_fname

[{'fname': 'Big', 'lname': 'Jones', 'uid': 1004},
 {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003},
 {'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
 {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}]

In [5]:
# Sort the records by their numeric uid.
uid_key = itemgetter('uid')
rows_by_ids = sorted(rows, key=uid_key)
rows_by_ids

[{'fname': 'John', 'lname': 'Cleese', 'uid': 1001},
 {'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
 {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003},
 {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}]

In [7]:
# Sort on several keys at once: itemgetter with multiple names returns a tuple,
# so rows are compared by fname first and lname breaks ties.
name_key = itemgetter('fname', 'lname')
rows_by = sorted(rows, key=name_key)
rows_by

[{'fname': 'Big', 'lname': 'Jones', 'uid': 1004},
 {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003},
 {'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
 {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}]

In [10]:
# A lambda can also serve as the sort key, but it is not as fast as itemgetter.
sorted(rows, key=lambda row: row['fname'])

[{'fname': 'Big', 'lname': 'Jones', 'uid': 1004},
 {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003},
 {'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
 {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}]

In [11]:
# Lambda form of a multi-key sort: last name first, then first name.
sorted(rows, key=lambda row: (row['lname'], row['fname']))

[{'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
 {'fname': 'John', 'lname': 'Cleese', 'uid': 1001},
 {'fname': 'Big', 'lname': 'Jones', 'uid': 1004},
 {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003}]

#### max、min、sort 等支持 key 参数的函数也可以使用 itemgetter

#### 类排序

In [15]:
class User:
    """Minimal record type holding a single ``user_id`` attribute."""

    def __init__(self, user_id):
        """Store ``user_id`` on the instance."""
        self.user_id = user_id

    def __repr__(self):
        """Return the ``User(<user_id>)`` display form."""
        return 'User({})'.format(self.user_id)

In [16]:
# Build a few User objects in deliberately unsorted id order.
users = [User(uid) for uid in (23, 3, 100, 99)]

In [17]:
# Display the list in its original (unsorted) order.
users

[User(23), User(3), User(100), User(99)]

In [18]:
# Sort the objects by attribute, using a lambda as the key function.
sorted(users, key=lambda user: user.user_id)

[User(3), User(23), User(99), User(100)]

##### 使用operator.attrgetter()来代替lambda

In [19]:
from operator import attrgetter

# attrgetter('user_id') builds a key function that reads the user_id attribute.
by_user_id = attrgetter('user_id')
sorted(users, key=by_user_id)

[User(3), User(23), User(99), User(100)]

In [20]:
# attrgetter also works as the key for max(): pick the user with the largest user_id.
max(users,key=attrgetter('user_id'))

User(100)

In [21]:
# Address records, each carrying a date string; the grouping example uses these.
rows = [
    {'address': address, 'date': date}
    for address, date in (
        ('5412 NCLARK', '07/01/2012'),
        ('5148 NCLARK', '07/04/2012'),
        ('5800 E58TH', '07/02/2012'),
        ('2122 NCLARK', '07/03/2012'),
        ('5645 NRAVENSWOOD', '07/02/2012'),
        ('1060 WADDISON', '07/02/2012'),
        ('4801 NBROADWAY', '07/01/2012'),
        ('1039 WGRANVILLE', '07/04/2012'),
    )
]

In [22]:
from operator import itemgetter
from itertools import groupby

# Order the records by their date string in place, so equal dates end up adjacent.
date_key = itemgetter('date')
rows.sort(key=date_key)

groupby() 函数扫描整个序列，并查找连续相同的值（或者根据指定 key 函数返回值相同）的元素序列。
每次迭代时，它会返回一个值和一个迭代器对象，这个迭代器对象可以生成元素值全部等于该值的组中的所有对象。
注意：groupby() 只检查连续的元素，因此必须先按分组字段排序（如上面的 rows.sort），否则相同的键会被拆成多个分组。

In [23]:
# Walk the rows one date group at a time: print the date, then each of its
# records indented by two spaces.
for date, items in groupby(rows, key=itemgetter('date')):
    print(date)
    for item in items:
        print(f'  {item}')

07/01/2012
  {'address': '5412 NCLARK', 'date': '07/01/2012'}
  {'address': '4801 NBROADWAY', 'date': '07/01/2012'}
07/02/2012
  {'address': '5800 E58TH', 'date': '07/02/2012'}
  {'address': '5645 NRAVENSWOOD', 'date': '07/02/2012'}
  {'address': '1060 WADDISON', 'date': '07/02/2012'}
07/03/2012
  {'address': '2122 NCLARK', 'date': '07/03/2012'}
07/04/2012
  {'address': '5148 NCLARK', 'date': '07/04/2012'}
  {'address': '1039 WGRANVILLE', 'date': '07/04/2012'}


In [27]:
# 实现一键多值，且键不存在时不会报错
from collections import defaultdict


In [42]:
# 普通做法
d={}
z = [('a',1),('b',2),('c',3),('d',4),('a',5),('b',6),('c',7)]

In [38]:
# Group the values by key by hand: a key seen for the first time gets a new list.
for k, v in z:
    if k in d:
        d[k].append(v)
    else:
        d[k] = [v]

In [39]:
# Show the grouped result: each key maps to the list of its values.
d

{'a': [1, 5], 'b': [2, 6], 'c': [3, 7], 'd': [4]}

In [51]:
# Using defaultdict: a missing key is created on first access with an empty list.
d = defaultdict(list)
for k,v in z:
    # No need to check whether the key already exists.
    d[k].append(v)

#### 查找序列中出现次数最多的元素

In [54]:
from collections import Counter

In [58]:
# Word-frequency sample data: two word lists to be counted.
words = """
look into my eyes look into my eyes
the eyes the eyes the eyes not around the
eyes don't look around the eyes look into
my eyes you're under
""".split()
morewords = 'why are you not looking in my eyes'.split()

In [60]:
# Count occurrences of each word in both lists.
wc, wm = Counter(words), Counter(morewords)

In [57]:
# The two most frequent words together with their counts.
wc.most_common(2)

[('eyes', 8), ('the', 5)]

In [61]:
# Adding two Counters sums the counts word by word.
wc+wm

Counter({'look': 4,
         'into': 3,
         'my': 4,
         'eyes': 9,
         'the': 5,
         'not': 2,
         'around': 2,
         "don't": 1,
         "you're": 1,
         'under': 1,
         'why': 1,
         'are': 1,
         'you': 1,
         'looking': 1,
         'in': 1})

In [62]:
# Subtracting Counters subtracts counts word by word and keeps only positive results.
wc-wm

Counter({'look': 4,
         'into': 3,
         'my': 2,
         'eyes': 7,
         'the': 5,
         'around': 2,
         "don't": 1,
         "you're": 1,
         'under': 1})

In [75]:
lst=[1,2,4,5,8,7]