# Data Types and Structures

## Basic Data Types

### Integers

In [1]:
a = 10
type(a)

int

In [2]:
a.bit_length()

4

In [3]:
a = 100000
a.bit_length()

17

In [4]:
googol = 10 ** 100
googol

10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000L

In [5]:
googol.bit_length()

333

In [6]:
1 + 4

5

In [7]:
1 / 4

0

In [8]:
type(1 / 4)

int

### Floats

In [9]:
1. / 4

0.25

In [10]:
type (1. / 4)

float

In [11]:
b = 0.35
type(b)

float

In [12]:
b + 0.1

0.44999999999999996

In [13]:
c = 0.5
c.as_integer_ratio()

(1, 2)

In [14]:
b.as_integer_ratio()

(3152519739159347L, 9007199254740992L)

In [15]:
import decimal
from decimal import Decimal

In [16]:
decimal.getcontext()

Context(prec=28, rounding=ROUND_HALF_EVEN, Emin=-999999999, Emax=999999999, capitals=1, flags=[], traps=[Overflow, DivisionByZero, InvalidOperation])

In [17]:
# 1/11 evaluated at the precision of the current decimal context
d = Decimal(1) / Decimal(11)
d

Decimal('0.09090909090909090909090909091')

In [18]:
decimal.getcontext().prec = 4  # lower precision than default

In [19]:
# same quotient again; the result follows whatever precision the context has now
e = Decimal(1) / Decimal(11)
e

Decimal('0.09091')

In [20]:
decimal.getcontext().prec = 50  # higher precision than default

In [21]:
# same quotient once more; the result follows whatever precision the context has now
f = Decimal(1) / Decimal(11)
f

Decimal('0.090909090909090909090909090909090909090909090909091')

In [22]:
g = d + e + f
g

Decimal('0.27272818181818181818181818181909090909090909090909')

### Strings

In [23]:
t = 'this is a string object'

In [24]:
t.capitalize()

'This is a string object'

In [25]:
t.split()

['this', 'is', 'a', 'string', 'object']

In [26]:
t.find('string')

10

In [27]:
t.find('Python')

-1

In [28]:
t.replace(' ', '|')

'this|is|a|string|object'

In [29]:
'http://www.python.org'.strip('htp:/')

'www.python.org'

In [30]:
import re

In [31]:
series = """
'01/18/2014 13:00:00', 100, '1st';
'01/18/2014 13:30:00', 110, '2nd';
'01/18/2014 14:00:00', 120, '3rd'
"""

In [32]:
# Raw string: the backslash in \s is handed to the regex engine as written
# instead of depending on Python leaving an unrecognised string escape alone.
# Matches a single-quoted run of digits, '/', ':' and whitespace.
dt = re.compile(r"'[0-9/:\s]+'")  # datetime

In [33]:
result = dt.findall(series)
result

["'01/18/2014 13:00:00'", "'01/18/2014 13:30:00'", "'01/18/2014 14:00:00'"]

In [34]:
from datetime import datetime
pydt = datetime.strptime(result[0].replace("'", ""),
                         '%m/%d/%Y %H:%M:%S')
pydt

datetime.datetime(2014, 1, 18, 13, 0)

In [35]:
print pydt

2014-01-18 13:00:00


In [36]:
print type(pydt)

<type 'datetime.datetime'>


## Basic Data Structures

### Tuples

In [37]:
t = (1, 2.5, 'data')
type(t)

tuple

In [38]:
t = 1, 2.5, 'data'
type(t)

tuple

In [39]:
t[2]

'data'

In [40]:
type(t[2])

str

In [41]:
t.count('data')

1

In [42]:
t.index(1)

0

### Lists

In [43]:
l = [1, 2.5, 'data']
l[2]

'data'

In [44]:
l = list(t)
l

[1, 2.5, 'data']

In [45]:
type(l)

list

In [46]:
l.append([4, 3])  # append list at the end
l

[1, 2.5, 'data', [4, 3]]

In [47]:
l.extend([1.0, 1.5, 2.0])  # append elements of list
l

[1, 2.5, 'data', [4, 3], 1.0, 1.5, 2.0]

In [48]:
l.insert(1, 'insert')  # insert object before index position
l

[1, 'insert', 2.5, 'data', [4, 3], 1.0, 1.5, 2.0]

In [49]:
l.remove('data')  # remove first occurrence of object
l

[1, 'insert', 2.5, [4, 3], 1.0, 1.5, 2.0]

In [50]:
p = l.pop(3)  # removes and returns object at index
print l, p

[1, 'insert', 2.5, 1.0, 1.5, 2.0] [4, 3]


In [51]:
l[2:5]  # 3rd to 5th element

[2.5, 1.0, 1.5]

### Excursion: Control Structures

In [52]:
for element in l[2:5]:
    print element ** 2

6.25
1.0
2.25


In [53]:
r = range(0, 8, 1)  # start, end, step width
r

[0, 1, 2, 3, 4, 5, 6, 7]

In [54]:
type(r)

list

In [55]:
for i in range(2, 5):
    print l[i] ** 2

6.25
1.0
2.25


In [56]:
for i in range(1, 10):
    if i % 2 == 0:  # % is for modulo
        print "%d is even" % i
    elif i % 3 == 0:
        print "%d is multiple of 3" % i
    else:
        print "%d is odd" % i

1 is odd
2 is even
3 is multiple of 3
4 is even
5 is odd
6 is even
7 is odd
8 is even
9 is multiple of 3


In [57]:
total = 0
while total < 100:
    total += 1
print total

100


In [58]:
m = [i ** 2 for i in range(5)]
m

[0, 1, 4, 9, 16]

### Excursion: Functional Programming

In [59]:
def f(x):
    """Return ``x`` raised to the power of two."""
    squared = x ** 2
    return squared
f(2)

4

In [60]:
def even(x):
    """Return True if ``x`` leaves no remainder when divided by 2, else False."""
    remainder = x % 2
    return remainder == 0
even(3)

False

In [61]:
map(even, range(10))

[True, False, True, False, True, False, True, False, True, False]

In [62]:
map(lambda x: x ** 2, range(10))

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [63]:
filter(even, range(15)) 

[0, 2, 4, 6, 8, 10, 12, 14]

In [64]:
reduce(lambda x, y: x + y, range(10))

45

In [65]:
def cumsum(l):
    """Add up the elements of the iterable ``l`` and return the total.

    Despite the name, only the final sum is returned, not the sequence of
    running partial sums. An empty iterable yields 0.
    """
    running = 0
    for item in l:
        running += item
    return running
cumsum(range(10))

45

### Dicts

In [66]:
d = {
     'Name' : 'Angela Merkel',
     'Country' : 'Germany',
     'Profession' : 'Chancelor',
     'Age' : 60
     }
type(d)

dict

In [67]:
print d['Name'], d['Age']

Angela Merkel 60


In [68]:
d.keys()

['Country', 'Age', 'Profession', 'Name']

In [69]:
d.values()

['Germany', 60, 'Chancelor', 'Angela Merkel']

In [70]:
d.items()

[('Country', 'Germany'),
 ('Age', 60),
 ('Profession', 'Chancelor'),
 ('Name', 'Angela Merkel')]

In [71]:
birthday = True
if birthday is True:
    d['Age'] += 1
print d['Age']

61


In [72]:
for item in d.iteritems():
    print item

('Country', 'Germany')
('Age', 61)
('Profession', 'Chancelor')
('Name', 'Angela Merkel')


In [73]:
for value in d.itervalues():
    print type(value)

<type 'str'>
<type 'int'>
<type 'str'>
<type 'str'>


### Sets

In [74]:
s = set(['u', 'd', 'ud', 'du', 'd', 'du'])
s

{'d', 'du', 'u', 'ud'}

In [75]:
t = set(['d', 'dd', 'uu', 'u'])

In [76]:
s.union(t)  # all of s and t

{'d', 'dd', 'du', 'u', 'ud', 'uu'}

In [77]:
s.intersection(t)  # both in s and t

{'d', 'u'}

In [78]:
s.difference(t)  # in s but not t

{'du', 'ud'}

In [79]:
t.difference(s)  # in t but not s

{'dd', 'uu'}

In [80]:
s.symmetric_difference(t)  # in either one but not both

{'dd', 'du', 'ud', 'uu'}

In [81]:
from random import randint
l = [randint(0, 10) for i in range(1000)]
    # 1,000 random integers between 0 and 10
len(l)  # number of elements in l

1000

In [82]:
l[:20]

[3, 7, 5, 10, 3, 1, 2, 7, 6, 1, 6, 4, 0, 8, 9, 8, 6, 9, 6, 4]

In [83]:
s = set(l)
s

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

## NumPy Data Structures

### Arrays with Python Lists

In [84]:
v = [0.5, 0.75, 1.0, 1.5, 2.0]  # vector of numbers

In [85]:
m = [v, v, v]  # matrix of numbers
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [86]:
m[1]

[0.5, 0.75, 1.0, 1.5, 2.0]

In [87]:
m[1][0]

0.5

In [88]:
v1 = [0.5, 1.5]
v2 = [1, 2]
m = [v1, v2]
c = [m, m]  # cube of numbers
c

[[[0.5, 1.5], [1, 2]], [[0.5, 1.5], [1, 2]]]

In [89]:
c[1][1][0]

1

In [90]:
v = [0.5, 0.75, 1.0, 1.5, 2.0]
m = [v, v, v]
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [91]:
v[0] = 'Python'
m

[['Python', 0.75, 1.0, 1.5, 2.0],
 ['Python', 0.75, 1.0, 1.5, 2.0],
 ['Python', 0.75, 1.0, 1.5, 2.0]]

In [92]:
from copy import deepcopy
v = [0.5, 0.75, 1.0, 1.5, 2.0]
# One deep copy per row. Multiplying a one-element list (3 * [deepcopy(v)])
# would repeat a reference to a single copy, so the three rows would still
# alias each other even though they are independent of v.
m = [deepcopy(v) for i in range(3)]
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [93]:
v[0] = 'Python'
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

### Regular NumPy Arrays

In [94]:
import numpy as np

In [95]:
a = np.array([0, 0.5, 1.0, 1.5, 2.0])
type(a)

numpy.ndarray

In [96]:
a[:2]  # indexing as with list objects in 1 dimension

array([ 0. ,  0.5])

In [97]:
a.sum()  # sum of all elements

5.0

In [98]:
a.std()  # standard deviation

0.70710678118654757

In [99]:
a.cumsum()  # running cumulative sum

array([ 0. ,  0.5,  1.5,  3. ,  5. ])

In [100]:
a * 2

array([ 0.,  1.,  2.,  3.,  4.])

In [101]:
a ** 2

array([ 0.  ,  0.25,  1.  ,  2.25,  4.  ])

In [102]:
np.sqrt(a)

array([ 0.        ,  0.70710678,  1.        ,  1.22474487,  1.41421356])

In [103]:
b = np.array([a, a * 2])
b

array([[ 0. ,  0.5,  1. ,  1.5,  2. ],
       [ 0. ,  1. ,  2. ,  3. ,  4. ]])

In [104]:
b[0]  # first row

array([ 0. ,  0.5,  1. ,  1.5,  2. ])

In [105]:
b[0, 2]  # third element of first row

1.0

In [106]:
b.sum()

15.0

In [107]:
b.sum(axis=0)
  # sum along axis 0, i.e. column-wise sum

array([ 0. ,  1.5,  3. ,  4.5,  6. ])

In [108]:
b.sum(axis=1)
  # sum along axis 1, i.e. row-wise sum

array([  5.,  10.])

In [109]:
c = np.zeros((2, 3, 4), dtype='i', order='C')  # also: np.ones()
c

array([[[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]])

In [110]:
d = np.ones_like(c, dtype='float64', order='C')  # also: np.zeros_like()
d

array([[[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]],

       [[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]]])

In [111]:
import random
# I is the side length of the I x I matrices generated in the cells below.
# I = 5000  (larger variant, disabled; 500 is used instead)
I = 500 

In [112]:
%time mat = [[random.gauss(0, 1) for j in range(I)] for i in range(I)]
  # a nested list comprehension

Wall time: 234 ms


In [113]:
%time reduce(lambda x, y: x + y,      \
     [reduce(lambda x, y: x + y, row) \
             for row in mat])

Wall time: 28 ms


-206.8253734393859

In [114]:
%time mat = np.random.standard_normal((I, I))

Wall time: 25 ms


In [115]:
%time mat.sum()

Wall time: 1e+03 µs


-335.48736424426653

### Structured Arrays

In [116]:
dt = np.dtype([('Name', 'S10'), ('Age', 'i4'),
               ('Height', 'f'), ('Children/Pets', 'i4', 2)])
s = np.array([('Smith', 45, 1.83, (0, 1)),
              ('Jones', 53, 1.72, (2, 2))], dtype=dt)
s

array([('Smith', 45, 1.8300000429153442, [0, 1]),
       ('Jones', 53, 1.7200000286102295, [2, 2])], 
      dtype=[('Name', 'S10'), ('Age', '<i4'), ('Height', '<f4'), ('Children/Pets', '<i4', (2,))])

In [117]:
s['Name']

array(['Smith', 'Jones'], 
      dtype='|S10')

In [118]:
s['Height'].mean()

1.7750001

In [119]:
s[1]['Age']

53

## Vectorization of Code

### Basic Vectorization

In [120]:
r = np.random.standard_normal((4, 3))
s = np.random.standard_normal((4, 3))

In [121]:
r + s

array([[-2.18255964,  0.72553393, -0.86333612],
       [ 0.13449945,  1.08867781,  0.23384467],
       [ 2.14466423,  1.86872132,  0.15411218],
       [ 0.89176108, -2.74417199, -0.87109851]])

In [122]:
2 * r + 3

array([[ 3.38259714,  5.47320875,  1.85954444],
       [ 3.55583406,  6.5283087 ,  3.93023976],
       [ 6.40364137,  0.76854295,  2.44127281],
       [ 5.40039776, -0.76336976,  2.54338328]])

In [123]:
s = np.random.standard_normal(3)
r + s

array([[ 1.8212887 ,  0.5868764 ,  0.85209482],
       [ 1.90790716,  1.11442638,  1.88744248],
       [ 3.33181081, -1.7654565 ,  1.14295901],
       [ 2.83018901, -2.53141285,  1.19401424]])

In [124]:
# causes intentional error
# s = np.random.standard_normal(4)
# r + s

In [125]:
# r.transpose() + s

In [126]:
np.shape(r.T)

(3L, 4L)

In [127]:
def f(x):
    """Evaluate the linear function ``3 * x + 5`` at ``x``.

    Only ``*`` and ``+`` are applied, so ``x`` may be a plain number or a
    NumPy array (evaluated element-wise).
    """
    scaled = 3 * x
    return scaled + 5

In [128]:
f(0.5)  # float object

6.5

In [129]:
f(r)  # NumPy array

array([[  5.57389571,   8.70981313,   3.28931667],
       [  5.83375109,  10.29246305,   6.39535963],
       [ 10.10546205,   1.65281443,   4.16190922],
       [  8.60059664,  -0.64505465,   4.31507491]])

In [130]:
# causes intentional error
# import math
# math.sin(r)

In [131]:
np.sin(r)  # array as input

array([[ 0.19013394,  0.94467566, -0.5398238 ],
       [ 0.2743532 ,  0.98136451,  0.44852995],
       [ 0.99142858, -0.89823123, -0.27574397],
       [ 0.93211113, -0.95206213, -0.22633011]])

In [132]:
np.sin(np.pi)  # float as input

1.2246467991473532e-16

### Memory Layout

In [133]:
# Smaller sample than the disabled (5, 10000000) variant of the same draw.
x = np.random.standard_normal((5, 10000))
y = 2 * x + 3  # linear equation y = a * x + b
# Stack x and y into one 3-d array twice: identical values, different
# memory layout (C = row-major, F = column-major / Fortran order).
C = np.array((x, y), order='C')
F = np.array((x, y), order='F')
# memory clean-up: rebind both names so they no longer reference the arrays
x = y = 0.0

In [134]:
C[:2].round(2)

array([[[-1.13, -0.14,  0.4 , ...,  0.15,  1.1 ,  1.19],
        [ 0.8 ,  0.15,  0.43, ...,  0.37,  0.54,  0.36],
        [ 1.87,  0.19,  1.51, ...,  1.06,  0.9 , -0.1 ],
        [ 0.01,  0.  , -0.68, ...,  1.84,  0.58,  0.71],
        [ 0.84,  2.04, -0.79, ...,  1.94,  1.  ,  0.12]],

       [[ 0.73,  2.72,  3.81, ...,  3.3 ,  5.19,  5.38],
        [ 4.6 ,  3.3 ,  3.87, ...,  3.74,  4.07,  3.72],
        [ 6.75,  3.37,  6.01, ...,  5.13,  4.81,  2.8 ],
        [ 3.02,  3.  ,  1.64, ...,  6.67,  4.16,  4.42],
        [ 4.68,  7.08,  1.41, ...,  6.87,  5.01,  3.25]]])

In [135]:
%timeit C.sum()

The slowest run took 4.52 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 55.1 µs per loop


In [136]:
%timeit F.sum()

10000 loops, best of 3: 55.3 µs per loop


In [137]:
%timeit C[0].sum(axis=0)

10000 loops, best of 3: 29 µs per loop


In [138]:
%timeit C[0].sum(axis=1)

The slowest run took 4.98 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 30 µs per loop


In [139]:
%timeit F.sum(axis=0)

1000 loops, best of 3: 608 µs per loop


In [140]:
%timeit F.sum(axis=1)

1000 loops, best of 3: 1.97 ms per loop


In [141]:
F = 0.0; C = 0.0  # memory clean-up

## Conclusions

## Further Reading