# Pandas and Numpy Intro

In [1]:
import numpy as np
import pandas as pd

### List vs. array execution speed example

In [2]:
# Problem size shared by both containers: one million elements.
n = 1_000_000
# The same integers 0..n-1, once as a plain Python list and once as a NumPy array.
lst, array = [*range(n)], np.arange(0, n)

"Magic commands":
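    - a single `%` prefix marks a line magic, which applies to one line (e.g. `%timeit` times a single statement)
    - a double `%%` prefix marks a cell magic, which applies to an entire cell (e.g. `%%timeit` times the whole cell)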

In [3]:
t_list = %timeit -o sum_list = sum(lst)

10 loops, best of 3: 30.6 ms per loop


In [4]:
t_array = %timeit -o sum_array = np.sum(array)

1000 loops, best of 3: 451 µs per loop


In [5]:
# Ratio of the best list timing to the best array timing, truncated to a whole number.
speedup = int(t_list.best / t_array.best)
print('Sum on the array is ~{} times faster than on the list'.format(speedup))

Sum on the array is ~67 times faster than on the list


In [6]:
# List the line magics and cell magics available in this IPython session.
%lsmagic

Available line magics:
%alias  %alias_magic  %autocall  %automagic  %autosave  %bookmark  %cd  %clear  %cls  %colors  %config  %connect_info  %copy  %ddir  %debug  %dhist  %dirs  %doctest_mode  %echo  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %macro  %magic  %matplotlib  %mkdir  %more  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %popd  %pprint  %precision  %profile  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %ren  %rep  %rerun  %reset  %reset_selective  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%cmd  %%debug  %%file  %%html  %%javascript  %%js  %%latex  %%perl  %%prun  %%pypy  %%python  %%python2  %%python3  %%rub

In [7]:
# A Series built from a plain list gets a default integer index.
s = pd.Series([1, 3, 3, 77])
print('Series s:', s, sep='\n')

# Count how many times each distinct value occurs.
# Tip: in IPython, `s.value_counts?` shows the help text and
# `s.value_counts??` shows the source code.
counts = s.value_counts()
print('\ns.value_counts():', counts, sep='\n')

# Pull the Series apart into its labels (index) and its data (values).
s_index, s_values = s.index, s.values

# Report the type and contents of each component, one labelled line at a time.
for caption, shown in (
    ('\ntype(s_index): ', type(s_index)),
    ('\ntype(s_values): ', type(s_values)),
    ('s_index.values: ', s_index.values),
    ('s.index.values: ', s.index.values),
    ('type(s.index.values): ', type(s.index.values)),
    ('s_values: ', s_values),
):
    print(caption, shown)

Series s:
0     1
1     3
2     3
3    77
dtype: int64

s.value_counts():
3     2
77    1
1     1
dtype: int64

type(s_index):  <class 'pandas.indexes.range.RangeIndex'>

type(s_values):  <class 'numpy.ndarray'>
s_index.values:  [0 1 2 3]
s.index.values:  [0 1 2 3]
type(s.index.values):  <class 'numpy.ndarray'>
s_values:  [ 1  3  3 77]


### Problem 1
<span style="color:green">What type of object is returned from the values of the index of a Series?</span>

In [8]:
# Answer to Problem 1: take the index's underlying values and show their type.
index_values = s.index.values
print('type(s.index.values): ', type(index_values))

type(s.index.values):  <class 'numpy.ndarray'>


### Series with a given index

In [9]:
# Same data as before, now labelled with explicit string keys.
s = pd.Series([1, 3, 3, 77], index=list('abcd'))
print(s, '\n')

# Labels do not have to be strings: this index mixes a set, a tuple,
# a range and a plain string in one object-dtype index.
mixed_labels = [{'a', 'b'}, ('b', 'c'), range(10), 'd']
s2 = pd.Series([1, 3, 3, 77], index=mixed_labels)
print(s2)
print(s2.index)

a     1
b     3
c     3
d    77
dtype: int64 

{b, a}                             1
(b, c)                             3
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)     3
d                                 77
dtype: int64
Index([{'b', 'a'}, ('b', 'c'), (0, 1, 2, 3, 4, 5, 6, 7, 8, 9), 'd'], dtype='object')


### Simple mathematical operations on Series
Thanks to vectorization, there is no need to loop through the values of a Series or array: arithmetic operators are applied to every element at once.

In [10]:
# Arithmetic operators act element-wise on the whole Series at once.
print(s, '\n')

divided = s / 5  # true division, so the result holds floats
print(divided, '\n')

transformed = s**2 - 12 + 40  # square every element, then shift by the constants
print(transformed)

a     1
b     3
c     3
d    77
dtype: int64 

a     0.2
b     0.6
c     0.6
d    15.4
dtype: float64 

a      29
b      37
c      37
d    5957
dtype: int64


### Aggregations

In [20]:
# Summary statistics of s, gathered in the order the report lists them.
# mode() yields the most frequent value(s) as a Series; its first entry is reported.
stats = (s.mean(), s.median(), s.mode().values[0], s.std(), s.min(), s.max())
print('Mean: {}\nMedian: {}\nMode: {}\nStandard deviation: {}\nMin: {}\nMax: {}'.format(*stats))

Mean: 21.0
Median: 3.0
Mode: 3
Standard deviation: 37.345236197762446
Min: 1
Max: 77
