## Data structures and sequence

## Tuples

In [4]:
#another method to declare tuple
ls = [1, 2, 3]
tup =  4, 5, 6
string = "string"
tuple(ls), tup, tuple(string)

((1, 2, 3), (4, 5, 6), ('s', 't', 'r', 'i', 'n', 'g'))

In [5]:
# tuples are non mutable
# however mutable object inside tuple can be modified
tup = tuple (['foo', [1, 2], True])
tup[1].append(3) 

tup

('foo', [1, 2, 3], True)

In [9]:
# objects are not copied but refrenced
('foo', "ba") * 4

('foo', 'ba', 'foo', 'ba', 'foo', 'ba', 'foo', 'ba')

In [12]:
# unpacking
a,b,c = tup
print(a)

tup = 4, 5, (6, 7)
a, b, (c, d) = tup
d

foo


7

In [13]:
# swaping in python
a, b = b, a

In [16]:
seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

for a, b ,c in seq:
    print(a, b, c)

1 2 3
4 5 6
7 8 9


In [18]:
# Plucking values
values = 1, 2, 3, 4, 5
a, b, *rest = values
rest

[3, 4, 5]

## list


In [None]:
everything = []
for chunk in list_of_list:
    everything.extend(chunk) # Cheaper in compute
#   everything = everything + chunk

## Dictionaries

In [7]:
key_list = [1,2,3]
value_list = [4,5,6]
mapping = {}
for key, value in zip(key_list, value_list):
    mapping[key] = value

mapping

{1: 4, 2: 5, 3: 6}

In [13]:
tuples = zip(range(5), reversed(range(5)))
tuples

<zip at 0x168622c89c0>

In [14]:
mapping = dict(tuples)
mapping

{0: 4, 1: 3, 2: 2, 3: 1, 4: 0}

In [None]:
# extract value if key is found else return default
if key in mapping:
    value = mapping[key]
else:
    value = default_value

In [None]:
# Same Functionality as above cell
value = some_dict.get(key,default_value)

.get method returns None by default 
whereas .pop method would raise exception

Collecting items in dictionary

In [16]:
words = ['apple', 'bat', 'bar', 'atom', 'cat']

by_letter = {}

for word in words:
    letter = word[0]
    if letter not in by_letter:
        by_letter[letter] = [word]
    else:
        by_letter[letter].append(word)

by_letter

{'a': ['apple', 'atom'], 'b': ['bat', 'bar'], 'c': ['cat']}

.setdefault(key, default_value)
if key contains value does nothing
else default_value would be assigned to the key

In [17]:
by_letter = {}

for word in words:
    letter = word[0]
    by_letter.setdefault(letter, []).append(word)

by_letter

{'a': ['apple', 'atom'], 'b': ['bat', 'bar'], 'c': ['cat']}

### Validity/hashability of dictionary keys
Dictionary keys have to be immutable

In [26]:
%xmode
hash('string')

hash((1,2))

# List objects are unhashable as they are immutable
hash([1,2])

Exception reporting mode: Plain


TypeError: unhashable type: 'list'

## Set
Unordered collection of unique items created by {}

In [21]:
set([1,2,3.4])

{1, 2, 3.4}

set contains .union(|) .intersect(&) .difference and .symmetric methods

In [23]:
a = {1, 2, 3, 4, 5}
b = {3, 4, 5, 6, 7, 8}

a.union(b)
a|b

{1, 2, 3, 4, 5, 6, 7, 8}

Sets are also immutable and hence they are hashable

In [25]:
my_data = [1, 2, 3, 4]
myset = {tuple(my_data)}
myset

{(1, 2, 3, 4)}

## List, Set and Dictionary Comprehension

In [26]:
# List Comprehension
strings = ['a', 'as', "bat", "car", "dove", "python"]
[x.upper() for x in strings if len(x) > 2]

['BAT', 'CAR', 'DOVE', 'PYTHON']

In [28]:
# Set comprehension
dict_comp = {len(x) for x in strings}
dict_comp

{1, 2, 3, 4, 6}

In [29]:
# Above can be also used with
set(map(len, strings))

{1, 2, 3, 4, 6}

In [31]:
# Dictionary comphrehension
loc_mapping = {value: index for index, value in enumerate(strings)}
loc_mapping

{'a': 0, 'as': 1, 'bat': 2, 'car': 3, 'dove': 4, 'python': 5}

### Nested list comprehension

In [34]:
all_data = [["John", "Emily", "Michael", "Mary", "Steven"], 
            ["Maria", "Juan", "Javier", "Natalia", "Pilar"]]

In [37]:
names_of_interest = []
for names in all_data:
    enough_as = [name for name in names if name.count("a") >= 2]
    names_of_interest.extend(enough_as)

names_of_interest

['Maria', 'Natalia']

In [44]:
# Wrapping the above operation into nested list comprehension
results = [name for names in all_data for name in names if name.count('a') >= 2]
results

['Maria', 'Natalia']

In [46]:
# another eg for nested list comprehension
some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
flattened = [x for tup in some_tuples for x in tup]
flattened

[1, 2, 3, 4, 5, 6, 7, 8, 9]

# Functions

In [None]:
# args are mandatory but keyword arguments are optional 
def my_function2(x, y, z=1.5):
    if z > 1:
        return z * (x + y)
    else:
        return z / (x + y)

In [47]:
# use of non-local and global
a = None

def bind_a_variable():
    global a
    a = []
bind_a_variable()

print(a)

[]


In [48]:
# returning multiple values
def f():
    a = 5
    b = 6
    c = 7
    return a, b, c

a, b, c = f()

## Data cleaning using re

In [55]:
states = ["   Alabama ", "Georgia!", "Georgia", "georgia", "FlOrIda", "south   carolina##", "West virginia?"]

In [50]:
import re
def clean_strings(strings):
    result = []
    for value in strings:
        value = value.strip()
        value = re.sub('[!#?]', "", value)
        value = value.title()
        result.append(value)
    return result


In [51]:
clean_strings(states)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South   Carolina',
 'West Virginia']

### Another approach
Passing functions as argument

In [56]:
def remove_punctuation(value):
    return re.sub("[!#?]", "", value)
clean_ops = [str.strip, remove_punctuation, str.title]

def clean_strings(strings, ops):
    results = []
    for value in strings:
        for func in ops:
            value = func(value)
        results.append(value)
    return results

clean_strings(states, clean_ops)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South   Carolina',
 'West Virginia']

## Anonymous function (lambda function)

In [None]:
def short_func(x): return x*2
equiv_anon = lambda x: x*2

In [57]:
strings.sort(key=lambda x: len(set(x)))
strings

['a', 'as', 'bat', 'car', 'dove', 'python']

# Using generator
Generator is a convinient way, similar to writinng a normal funciton
It constantly returns a value from a function for each loop iteration
`yield` is used instead of `return`

In [59]:
def squares(n=10):
    print(f'Generating squares from 1 to {n**2}')
    for i in range(1, n+1):
        yield i **2
squares()

<generator object squares at 0x00000168638FE110>

In [61]:
# Code is only executed when return objects are called
for i in squares():
    print(i)

Generating squares from 1 to 100
1
4
9
16
25
36
49
64
81
100


Generator are memory efficient since they only return single object at a time.

### Generator Expressions (Modified list/set/dictionary comprehension)


In [63]:
gen = (x**2 for x in range(100))
gen

<generator object <genexpr> at 0x00000168638FEA40>

In [65]:
print(sum(x ** 2 for x in range(100)))
dict((i, i**2) for i in range(5))

328350


{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}

# Itertools
Itertools modules contains generator which are commonly used

In [1]:
import itertools
def first_letter(x):
    return x[0]

names = ['Alan', 'Adam', 'Agastya', 'Wes', 'Will', 'Albert', 'Steven']

In [14]:
# Group consecutive names by first letter
for letter, names in itertools.groupby(names, first_letter):
    print(letter, list(names))

In [None]:
Catch exception in %run mode in Jp | Returns full stack traceback 
%xmode : Control amount of Exception context being printed | There are different print modes available 
%debug / %pdp : Debug

In [None]:
This is used to run external script and it also returns the exception
%run examples/ipython_bug.py

# Files and OS

In [13]:
path = "../examples/segismundo.txt"
f = open(path, encoding = 'utf-8')

In [14]:
for line in f:
    print(line)

Sueña el rico en su riqueza,

que más cuidados le ofrece;



sueña el pobre que padece

su miseria y su pobreza;



sueña el que a medrar empieza,

sueña el que afana y pretende,

sueña el que agravia y ofende,



y en el mundo, en conclusión,

todos sueñan lo que son,

aunque ninguno lo entiende.





In [15]:
lines = [x.rstrip() for x in open(path, encoding='utf-8')]
lines

['Sueña el rico en su riqueza,',
 'que más cuidados le ofrece;',
 '',
 'sueña el pobre que padece',
 'su miseria y su pobreza;',
 '',
 'sueña el que a medrar empieza,',
 'sueña el que afana y pretende,',
 'sueña el que agravia y ofende,',
 '',
 'y en el mundo, en conclusión,',
 'todos sueñan lo que son,',
 'aunque ninguno lo entiende.',
 '']

In [16]:
f.close()

In [17]:
# another way of opening files
with open(path, encoding='utf-8') as file:
    lines = [x.rstrip() for x in file]

In [18]:
# Open file in normal mode
f1 = open(path)
f1.read(10)

'SueÃ±a el '

In [19]:
f2 = open(path, mode='rb') # Binary mode
f2.read(10)

b'Sue\xc3\xb1a el '

In [20]:
#get position of the cursor in the file
f1.tell(), f2.tell()

(10, 10)

In [21]:
import sys
sys.getdefaultencoding()

'utf-8'

In [29]:
# seek movest the cursor in the file to the position by bytes
f1.seek(3)

3

In [30]:
f1.read(1), f1.tell()

('Ã', 4)

In [31]:
f1.close()
f2.close()

In [33]:
with open('tmp.txt', 'w') as handle:
    handle.writelines(x for x in open(path, 'r') if len(x)> 1)

with open('tmp.txt', 'r') as f:
    lines = f.readlines()
lines

['SueÃ±a el rico en su riqueza,\n',
 'que mÃ¡s cuidados le ofrece;\n',
 'sueÃ±a el pobre que padece\n',
 'su miseria y su pobreza;\n',
 'sueÃ±a el que a medrar empieza,\n',
 'sueÃ±a el que afana y pretende,\n',
 'sueÃ±a el que agravia y ofende,\n',
 'y en el mundo, en conclusiÃ³n,\n',
 'todos sueÃ±an lo que son,\n',
 'aunque ninguno lo entiende.\n']

## Encoding - Bytes and Unicode

In [34]:
# decoded/normal file mode
with open(path) as f:
    chars = f.read(10)
chars, len(chars)

('SueÃ±a el ', 10)

In [37]:
# encoded file mode
with open(path, 'rb') as f:
    chars = f.read(10)
chars, chars.decode('utf-8'), len(chars)

(b'Sue\xc3\xb1a el ', 'Sueña el ', 10)

In [38]:
# Returns an error as byte is not completely read when it is splitted
chars[:4].decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 3: unexpected end of data

In [39]:
# Encoding options
sink_path = 'sink.txt'
with open(path) as source:
    with open(sink_path, 'x', encoding='iso-8859-1') as sink:
        sink.write(source.read())

with open(sink_path, encoding='iso-8859-1') as f:
    print(f.read(10))

SueÃ±a el 
