# Collection data structures

Collection data structures are different from sequential data structures. They are containers which aggregate data without relating them. <br>
Collection data structures have several properties <br>
* membership operator (for i in j)
* size method (len(seq))
* iterability (for i in range(100))
<br></br>

***

Two in-built types are ***sets*** and ***dicts***

### Sets
Sets are mutable, iterable and contains no duplicate elements. They have *O(1)* insertion complexity. Sets don't have concept of indexing i.e. you can't directly call an element based upon it's position. They support iterating i.e. you will have to traverse up to the element (*O(n)* complexity)

In [1]:
# add method
st = {1,2,3}
st.add(1)
print(st)

set([1, 2, 3])


In [2]:
# think of set as of sets from maths - you will get intersection,union etc.
# udpate method
another_set = {4,3,5,1}
st.update(another_set)
print(st)

set([1, 2, 3, 4, 5])


In [3]:
# union method
another_set = {1,2,3,4,6,7,8,9}
print(st.union(another_set))

set([1, 2, 3, 4, 5, 6, 7, 8, 9])


In [4]:
# intersection method - contains the elements which are in both first and second set
set_1 = {1,2,3}
set_2 = {3,4,5}
print(set_1.intersection(set_2))

set([3])


In [5]:
# difference method - contains the elements which are in first set but not in second set
print(set_1.difference(set_2))

set([1, 2])


In [6]:
# clear method - removes all the elements in set
set_1.clear()
print(set_1)

set([])


In [7]:
# discard, remove and pop method - differences
# discard - removes an item from set and if not found will not return an error
set_2.discard(3)
print(set_2)
set_2.discard(34)
print(set_2)

# remove - it removes an item from set and if not found will return KeyError exception
set_2.remove(57)
print(set_2)
set_2.pop()
print(set_2)

set([4, 5])
set([4, 5])


KeyError: 57

In [8]:
# pop deletes random item from set and it doesn't take any argument like lists
set_2.pop()
print(set_2)

set([5])


### Dictionaries
Dictionaries in python are implemented using hash tables. It has collection mapping type. It has several properties like sets.
* iterable (for key,value in dict.items())
* membership (for i in dict)
* size function (len(dict))
***
Acessing items in dictionary happen in *O(1)* time. They are mutable but do not have any concept of indexing so they can't be sliced or striped. They are unordered by default.

In [22]:
# applying set operations and properties on dictionaries
from collections import OrderedDict

def set_operations_with_dict():
    pairs = [('b',2),('a',1),('c',3)]
    d1 = OrderedDict(pairs)
    print(d1)
    
    d2 = dict({'a':1,'c':2,'d':3,'e':4})
    print(d2)
    
    union_keys = d1.keys() + d2.keys()
    print(union_keys)
    
    intersection_keys = d1.keys() and d2.keys()
    print(intersection_keys)

    

if __name__=='__main__':
    set_operations_with_dict()
    
    


OrderedDict([('b', 2), ('a', 1), ('c', 3)])
{'a': 1, 'c': 2, 'e': 4, 'd': 3}
['b', 'a', 'c', 'a', 'c', 'e', 'd']
['a', 'c', 'e', 'd']


In [49]:
# Methods for dictionaries
# setdefault() method
# won't throw error if key is not found in dictionary - will set a default value and insert the key
import timeit
dictionary = (('key1','value1'),
                 ('key2','value2'),
                 ('key3','value3'),
                  ('key4','value4'))
def usual_dictionary():
    new_data = {}
    for k,v in dictionary:
        if k in new_data:
            new_data[k].append(v)
        else:
            new_data[k]=v
    return new_data

def setdefault_dictionary():
    new_data = {}
    for k,v in dictionary:
        new_data.setdefault(k,[]).append(v)
    return new_data

def test_dictionary():
    
    t1 = timeit.Timer('usual_dictionary()', 'from __main__ import usual_dictionary')
    print('usual dictionary time: ',t1.timeit(number=1000))
    t2 = timeit.Timer('setdefault_dictionary()','from __main__ import setdefault_dictionary')
    print('setdefault dictionary time: ',t2.timeit(number=1000))
    

if __name__=='__main__':
    test_dictionary()

('usual dictionary time: ', 0.00084686279296875)
('setdefault dictionary time: ', 0.001889944076538086)


In [71]:
# update() method for including key value pair and it overwrites the existing key
d1 = {'a':1,'b':2}
d1.update({'a':3})
print(d1)

{'a': 3, 'b': 2}


In [72]:
# get() method - returns the value for the key
print(d1.get('a'))
print(d1.get('c'))    # returns None if not found

3
None


In [73]:
# items(), values() and keys() methods in dictionary
print(d1.items())
print(d1.keys())
print(d1.values())

[('a', 3), ('b', 2)]
['a', 'b']
[3, 2]


In [74]:
# pop() and popitem() methods in dictionary
d1.pop('a')    # removes specified item from dictionary
print(d1)    
d1.popitem()    # removes random item from dictionary
print(d1)


{'b': 2}
{}


In [81]:
# clear() function in dictionary - clears all items in dictionary
d2 = dict([('a',1),('b',2)])
print(d2)
d2.clear()
print(d2)

{'a': 1, 'b': 2}
{}


In [96]:
# Runtime analysis for dictionaries
# dictionaries are faster to form

import timeit
import random
print('i\tlist_time\tdict_time')
for i in range(10000,200000,20000):
    t = timeit.Timer('random.randrange(%d) in x'%i,'from __main__ import random,x')
    x = list(range(i))
    list_time = t.timeit(number=1000)
    x = {j:None for j in range(i)}
    dict_time = t.timeit(number=1000)
    print('%d\t%5.4f\t\t%5.4f'%(i,list_time,dict_time))
    





i	list_time	dict_time
10000	0.0580		0.0007
30000	0.1586		0.0007
50000	0.2708		0.0008
70000	0.3770		0.0007
90000	0.4932		0.0008
110000	0.6200		0.0009
130000	0.7170		0.0009
150000	0.8412		0.0009
170000	0.9176		0.0008
190000	1.1050		0.0008


Time complexity for dictionaries for membership operation is almost constant ~ *O(1)*
***
| Operation | Big-O efficiency |
| --- | --- |
| copy item | O(n) |
| get item | O(1) |
| set item | O(1) |
| delete item | O(1) |
| contains item (in) | O(1) |
| iteration over items | O(1) |

In [8]:
# Iterating over dictionaries
dictionary = {'a':1,'b':2,'c':3,'d':4,'e':5}
for value in sorted(dictionary.values()):
    print(list(dictionary.keys())[list(dictionary.values()).index(value)],':',value)

('a', ':', 1)
('b', ':', 2)
('c', ':', 3)
('d', ':', 4)
('e', ':', 5)


In [18]:
# using next() iterator to get value one by one
G = ([key,value] for key,value in dictionary.items())
print(next(G))
print(next(G))

['a', 1]
['c', 3]


In [23]:
# using yield preserves the benefit of iterables and generators
def yield_dictionary():
    for key,value in dictionary.items():
        yield key,value
for i in yield_dictionary():
    print(i)

('a', 1)
('c', 3)
('b', 2)
('e', 5)
('d', 4)


In [80]:
# Python's collection data types - high performance; built-in
# defaultdict can help to have default keys as None directly with having to value for key to be intialized
from collections import defaultdict
import time

def defaultdict_example():
    pair = {('a',1),('b',2),('c',3),('d',4)}
    
    start_time = time.time()
    d1 = {}
    for key,value in pair:
        if key not in d1:
            d1[key]=[]
        d1[key].append(value)
    end_time = time.time()
    print('usual dictionary time-period: ',end_time-start_time)
    
    start_time = time.time()
    d2 = defaultdict(list)
    for key,value in pair:
        d2[key].append(value)
    end_time = time.time()
    print('defaultdict dictionary time-period: ',end_time-start_time)

    
if __name__=='__main__':
    defaultdict_example()


('usual dictionary time-period: ', 5.0067901611328125e-06)
('defaultdict dictionary time-period: ', 5.9604644775390625e-06)


In [86]:
# Ordered dictionaries - they store items in insertion order
from collections import OrderedDict
dct = OrderedDict()
dct['abhi'] = 'Hyderabad'
dct['pulkit'] = 'Mainpuri'
dct['arpit']= 'delhi'
dct


OrderedDict([('abhi', 'Hyderabad'),
             ('pulkit', 'Mainpuri'),
             ('arpit', 'delhi')])

In [87]:
dct.popitem('abhi')    # returns the last item in dictionary

('arpit', 'delhi')

In [9]:
# Counter dictionaries
# subclass for counting hashable objects
# dictionary that maps the item to the number of occurences
from collections import Counter

def Counter_example():
    'some relevant examples for counter'
    seq1 = [1, 2, 3, 5, 1, 2, 5, 5, 2, 5, 1, 4]
    # can be really helpful to form a word frequency type of dictionary in NLP as well
    seq_counts = Counter(seq1)
    print(seq_counts)
    
    seq2 = [1,2,3]
    seq_counts.update(seq2)
    print(seq_counts)
    
    seq3 = [4,5,6]
    for key in seq3:
        seq_counts[key]+=1
    print(seq_counts)

    # set operations can also be used a+b and a-b
    seq_counts_2 = Counter(seq3)
    print(seq_counts+seq_counts_2)
    print(seq_counts-seq_counts_2)
    
if __name__=='__main__':
    Counter_example()

Counter({5: 4, 1: 3, 2: 3, 3: 1, 4: 1})
Counter({1: 4, 2: 4, 5: 4, 3: 2, 4: 1})
Counter({5: 5, 1: 4, 2: 4, 3: 2, 4: 2, 6: 1})
Counter({5: 6, 1: 4, 2: 4, 4: 3, 3: 2, 6: 2})
Counter({1: 4, 2: 4, 5: 4, 3: 2, 4: 1})


In [15]:
# Traditional examples - counting frequency of items
# using most_common function of Counter
from collections import Counter

def top_N_recurring_words(seq,N):
    counter_obj = Counter()
    
    for word in seq.split():
        counter_obj[word]+=1
    return counter_obj.most_common(N)

def test_top_N_recurring_words():
    seq = 'buffy angel monster xander a willow gg buffy the monster super buffy angel'
    N = 3
    assert(top_N_recurring_words(seq,N)==[('buffy', 3),('monster', 2), ('angel', 2)])
    print('Test passed')
    
    
if __name__=='__main__':
    test_top_N_recurring_words()

Test passed


In [92]:
# Finding Anagrams using 
import string

def anagrams(str1,str2):
    anagram_table = {key:0 for key in string.ascii_lowercase}
    
    for i in str1:
        anagram_table[i]+=1
        
    for i in str2:
        anagram_table[i]-=1

    if len(set(anagram_table.values())) < 2:
        print('Anagrams')
    else:
        print('Not Anagrams')
        
def counter_anagram(str1,str2):
    counter_str1 = Counter(list(str1))
    counter_str2 = Counter(list(str2))
    if counter_str1==counter_str2:
        print('Anagrams')
    else:
        print('Not Anagrams')

if __name__=='__main__':
    anagrams('abhishek','kehsihba')
    counter_anagram('abhishek','pulkit')


Anagrams
Not Anagrams


In [101]:
# finding anagrams using hashing function - ord returns an integer representing Unicode point object
def hash_function(astring,tablesize):
    sum = 0
    for position in range(len(astring)):
        sum += ord(astring[position])
    return sum%tablesize

if __name__=='__main__':

    assert hash_function('abhishek',11)==hash_function('kehsihba',11), 'Not Anagrams'
    print('Anagrams')
        

Anagrams


In [104]:
# Sum of paths - find combination of dices such that results sums to a fixed number
from collections import Counter, defaultdict

def find_dice_prob(S,n_faces=6):
    if S>2*n_faces or S < 2:
        return None
    cdict = Counter()
    ddict = defaultdict(list)
    for dice1 in range(1,n_faces+1):
        for dice2 in range(1,n_faces+2):
            t = [dice1,dice2]
            cdict[dice1+dice2]+=1
            ddict[dice1+dice2].append(t)
    return [cdict[S],ddict[S]]

if __name__=='__main__':
    print(find_dice_prob(5,6))

[4, [[1, 4], [2, 3], [3, 2], [4, 1]]]


In [105]:
# Finding duplicates in a string and delete all of them
import string
def delete_unique_word(str1):
    table_c = {key:0 for key in string.ascii_lowercase}
    
    for i in str1:
        table_c[i]+=1
    
    for key,value in table_c.items():
        if value>1:
            str1 = str1.replace(key,"")
    return str1

if __name__=='__main__':
    print(delete_unique_word('abhishek'))


abisek


$$ The\ End $$