# Pandas and Numpy Intro

In [87]:
import numpy as np
import pandas as pd

def print_ops(string):
    '''Splits the string on spaces to get operations
       Tries to run each operation and print the operation and its result
       If error: print operation and the type of error encoutered
       String example: 's s[1::3] s[10] s[-1]' '''
    for op in string.split():
        try:
            print(op + ':', eval(op), sep='\n', end='\n\n')
        except Exception as e:
            print(op + ': ' + str(type(e)), end='\n\n')

### List vs. array execution speed example

In [2]:
n = int(1e+6)
lst = list(range(n))
array = np.arange(n)

"Magic commands":
    - % to time a single line
    - %% to time an entire cell

In [3]:
t_list = %timeit -o sum_list = sum(lst)

10 loops, best of 3: 28.7 ms per loop


In [4]:
t_array = %timeit -o sum_array = np.sum(array)

1000 loops, best of 3: 432 Âµs per loop


In [5]:
print('Sum on the array is ~{} times faster than on the list'.format(int(t_list.best / t_array.best)))

Sum on the array is ~66 times faster than on the list


In [6]:
%lsmagic

Available line magics:
%alias  %alias_magic  %autocall  %automagic  %autosave  %bookmark  %cd  %clear  %cls  %colors  %config  %connect_info  %copy  %ddir  %debug  %dhist  %dirs  %doctest_mode  %echo  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %macro  %magic  %matplotlib  %mkdir  %more  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %popd  %pprint  %precision  %profile  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %ren  %rep  %rerun  %reset  %reset_selective  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%cmd  %%debug  %%file  %%html  %%javascript  %%js  %%latex  %%perl  %%prun  %%pypy  %%python  %%python2  %%python3  %%rub

In [7]:
s = pd.Series([1, 3, 3, 77])
print('Series s:', s, sep='\n')
print('\ns.value_counts():', s.value_counts(), sep='\n')
# see help:
# s.value_counts?
# see source code:
# s.value_counts??
s_index = s.index
s_values = s.values
print('\ntype(s_index): ', type(s_index))
print('\ntype(s_values): ', type(s_values))
print('s_index.values: ', s_index.values)
print('s.index.values: ', s.index.values)
print('type(s.index.values): ', type(s.index.values))
print('s_values: ', s_values)

Series s:
0     1
1     3
2     3
3    77
dtype: int64

s.value_counts():
3     2
77    1
1     1
dtype: int64

type(s_index):  <class 'pandas.indexes.range.RangeIndex'>

type(s_values):  <class 'numpy.ndarray'>
s_index.values:  [0 1 2 3]
s.index.values:  [0 1 2 3]
type(s.index.values):  <class 'numpy.ndarray'>
s_values:  [ 1  3  3 77]


### Problem 1
<span style="color:green">What type of object is returned from the values of the index of a Series?</span>

In [8]:
print('type(s.index.values): ', type(s.index.values))

type(s.index.values):  <class 'numpy.ndarray'>


### Series with a given index

In [9]:
s = pd.Series(data = [1, 3, 3, 77], index = ['a', 'b', 'c', 'd'])
print(s, '\n')
s2 = pd.Series(data = [1, 3, 3, 77], index = [{'a', 'b'}, ('b', 'c'), range(10), 'd'])
print(s2)
print(s2.index)

a     1
b     3
c     3
d    77
dtype: int64 

{a, b}                             1
(b, c)                             3
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)     3
d                                 77
dtype: int64
Index([{'a', 'b'}, ('b', 'c'), (0, 1, 2, 3, 4, 5, 6, 7, 8, 9), 'd'], dtype='object')


### Simple mathematical operations on Series
Thanks to vectorization, no need to loop through the values in a Series or Array object.

In [10]:
print(s, '\n')
print(s / 5, '\n')
print(s**2 - 12 + 40)

a     1
b     3
c     3
d    77
dtype: int64 

a     0.2
b     0.6
c     0.6
d    15.4
dtype: float64 

a      29
b      37
c      37
d    5957
dtype: int64


### Aggregations

In [11]:
print('Mean: {}\nMedian: {}\nMode: {}\nStandard deviation: {}\nMin: {}\nMax: {}'.format(
    s.mean(), s.median(), s.mode().values[0], s.std(), s.min(), s.max()))

Mean: 21.0
Median: 3.0
Mode: 3
Standard deviation: 37.345236197762446
Min: 1
Max: 77


### Operations on the entire data set

In [12]:
print(s.cumsum(), s.cumprod(), sep='\n\n')

a     1
b     4
c     7
d    84
dtype: int64

a      1
b      3
c      9
d    693
dtype: int64


### Head and tail

In [13]:
s = pd.Series(np.arange(500))
print(s.head(2), s.head(), s.tail(), s.tail(2), sep='\n\n')

0    0
1    1
dtype: int32

0    0
1    1
2    2
3    3
4    4
dtype: int32

495    495
496    496
497    497
498    498
499    499
dtype: int32

498    498
499    499
dtype: int32


### Access individual elements

In [22]:
s = pd.Series(data=[1,3,3,77, 100, -9], index=['a', 'b', 'c', 'd', 'e', 'f'])
for op in 's s[0] s[-1] s[[0,3,5]] s[3:] s[2:6:2]'.split(' '):
    print(op + ':', eval(op), sep='\n', end='\n\n')

# print(s, s[0], s[-1], s[[0, 3, 5]], s[3:], s[2:6:2], sep='\n\n')

s:
a      1
b      3
c      3
d     77
e    100
f     -9
dtype: int64

s[0]:
1

s[-1]:
-9

s[[0,3,5]]:
a     1
d    77
f    -9
dtype: int64

s[3:]:
d     77
e    100
f     -9
dtype: int64

s[2:6:2]:
c      3
e    100
dtype: int64



### WARNING: indexes that don't start with 0 can be confusing

In [44]:
s = pd.Series(data=[1,3,3,77, 100, -9], index=range(2, 8))
for op in 's s[0] s[2] s[7] s[-1] s[2:8:2]'.split():
    try:
        print(op + ':', eval(op), sep='\n', end='\n\n')
    except Exception as e:
        print(op + ': ' + str(type(e)), end='\n\n')

s:
2      1
3      3
4      3
5     77
6    100
7     -9
dtype: int64

s[0]: <class 'KeyError'>

s[2]:
1

s[7]:
-9

s[-1]: <class 'KeyError'>

s[2:8:2]:
4      3
6    100
dtype: int64



In [71]:
s1 = pd.Series(data=range(10), index=range(10))
s2 = pd.Series(data=range(10), index=range(1, 11))
for s in [s1, s2]:
    for op in 's s[1::3] s[10] s[-1]'.split():
        try:
            print(op + ':', eval(op), sep='\n', end='\n\n')
        except Exception as e:
            print(op + ': ' + str(type(e)), end='\n\n')

s:
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

s[1::3]:
1    1
4    4
7    7
dtype: int32

s[10]: <class 'KeyError'>

s[-1]: <class 'KeyError'>

s:
1     0
2     1
3     2
4     3
5     4
6     5
7     6
8     7
9     8
10    9
dtype: int32

s[1::3]:
2    1
5    4
8    7
dtype: int32

s[10]:
9

s[-1]: <class 'KeyError'>



# Integer Location vs Label Location
### Integer location

In [65]:
s = pd.Series(data=[1,3,3,77, 100, -9], index=['a', 'b', 'c', 'd', 'e', 'f'])
for op in 's s.iloc[0] s.iloc[4] s.iloc[[1,3,5]] s.iloc[9]'.split(' '):
    try:
        print(op + ':', eval(op), sep='\n', end='\n\n')
    except Exception as e:
        print(op + ': ' + str(type(e)) + '\n')

s:
a      1
b      3
c      3
d     77
e    100
f     -9
dtype: int64

s.iloc[0]:
1

s.iloc[4]:
100

s.iloc[[1,3,5]]:
b     3
d    77
f    -9
dtype: int64

s.iloc[9]: <class 'IndexError'>



### Label location

In [88]:
print_ops("s s.loc[0] s.loc['a'] s.loc[[1,3,5]] s.loc[['d','f']] s.loc['b'::2]")

# for op in "s s.loc[0] s.loc['a'] s.loc[[1,3,5]] s.loc[['d','f']] s.loc['b'::2]".split(' '):
#     try:
#         print(op + ':', eval(op), sep='\n', end='\n\n')
#     except Exception as e:
#         print(op + ': ' + str(type(e)) + '\n')

s:
1     0
2     1
3     2
4     3
5     4
6     5
7     6
8     7
9     8
10    9
dtype: int32

s.loc[0]: <class 'KeyError'>

s.loc['a']: <class 'KeyError'>

s.loc[[1,3,5]]:
1    0
3    2
5    4
dtype: int32

s.loc[['d','f']]: <class 'KeyError'>

s.loc['b'::2]:
Series([], dtype: int32)



### Adding series with different indexes
The indexes align first and then the values are summed

In [86]:
s1 = pd.Series(data=range(10), index=range(1, 11))
s2 = pd.Series(data=range(10), index=range(10))
print('s1 + s2:', s1 + s2, sep='\n', end='\n\n')
print('s1.iloc[:] + s2.iloc[:]', s1.loc[:] + s2.loc[:], sep='\n', end='\n\n')
df = s1.to_frame(name='s1').join(s2.to_frame(name='s2'), how='outer')
df['sum'] = df['s1'] + df['s2']
print('Visualize s1 + s2 in a DataFrame:', df, sep='\n', end='\n\n')

print('pd.Series(s1.values + s2.values, index=s1.index)', 
       pd.Series(s1.values + s2.values, index=s1.index), sep='\n', end='\n\n')
print('pd.Series(s1.values + s2.values, index=s2.index)', 
       pd.Series(s1.values + s2.values, index=s2.index), sep='\n', end='\n\n')
print('pd.Series(s1.values + s2.values)', 
       pd.Series(s1.values + s2.values), sep='\n', end='\n\n')

s1 + s2:
0      NaN
1      1.0
2      3.0
3      5.0
4      7.0
5      9.0
6     11.0
7     13.0
8     15.0
9     17.0
10     NaN
dtype: float64

s1.iloc[:] + s2.iloc[:]
0      NaN
1      1.0
2      3.0
3      5.0
4      7.0
5      9.0
6     11.0
7     13.0
8     15.0
9     17.0
10     NaN
dtype: float64

Visualize s1 + s2 in a DataFrame:
     s1   s2   sum
0   NaN  0.0   NaN
1   0.0  1.0   1.0
2   1.0  2.0   3.0
3   2.0  3.0   5.0
4   3.0  4.0   7.0
5   4.0  5.0   9.0
6   5.0  6.0  11.0
7   6.0  7.0  13.0
8   7.0  8.0  15.0
9   8.0  9.0  17.0
10  9.0  NaN   NaN

pd.Series(s1.values + s2.values, index=s1.index)
1      0
2      2
3      4
4      6
5      8
6     10
7     12
8     14
9     16
10    18
dtype: int32

pd.Series(s1.values + s2.values, index=s2.index)
0     0
1     2
2     4
3     6
4     8
5    10
6    12
7    14
8    16
9    18
dtype: int32

pd.Series(s1.values + s2.values)
0     0
1     2
2     4
3     6
4     8
5    10
6    12
7    14
8    16
9    18
dtype: int32

