# Boolean Indexing

In [1]:
import numpy as np
import pandas as pd

def print_ops(string):
    '''Evaluate and print each whitespace-separated expression in `string`.

    The string is split on whitespace, so a single expression must not
    contain spaces (write 's[s>0]', not 's[s > 0]'). Each expression is
    evaluated against this module's global namespace and printed on its own
    line, followed by its result and a blank line. If evaluating or printing
    an expression raises, the expression and the type of the error are
    printed instead, so one failing expression does not stop the others.

    Parameters
    ----------
    string : str
        Expressions separated by whitespace, e.g. 's s[1::3] s[10] s[-1]'.

    Returns
    -------
    None
        All results are written to standard output.
    '''
    # Use the global namespace for both globals and locals so this function's
    # own local names (`string`, `op`) can never shadow a user variable that
    # happens to have the same name.
    namespace = globals()
    for op in string.split():
        try:
            # NOTE: eval runs arbitrary code -- only pass trusted, hand-written strings.
            print(op + ':', eval(op, namespace), sep='\n', end='\n\n')
        except Exception as e:
            print(op + ': ' + str(type(e)), end='\n\n')

It is recommended to use `.loc` or `.iloc` to access elements of a Series, but boolean indexing is an exception: a sequence of True/False values can be placed directly inside `[]`.

In [2]:
# Seven random draws labelled 'a'..'g'.
# NOTE(review): no random seed is set in this cell, so the values (and the
# rows selected by the value-based masks) change on every run.
s = pd.Series(np.random.randn(7), index=list('abcdefg'))
# Hand-written boolean mask: one flag per element of `s`.
keep = [True, False, True, False, False, True, False]
# Mask derived from the values of `s`.
criteria_1 = s > 0
# Mask derived from the index labels (comparison against the string 'c').
criteria_2 = s.index > 'c'

# The expressions contain no spaces because print_ops splits its argument on whitespace.
ops = 's keep s[keep] s>0 s[s>0] s[criteria_1] s[s.index>"c"] s[criteria_2]'
print_ops(ops)

s:
a   -1.181167
b   -0.141630
c   -0.775126
d    0.053219
e    0.553559
f    0.398240
g    0.178585
dtype: float64

keep:
[True, False, True, False, False, True, False]

s[keep]:
a   -1.181167
c   -0.775126
f    0.398240
dtype: float64

s>0:
a    False
b    False
c    False
d     True
e     True
f     True
g     True
dtype: bool

s[s>0]:
d    0.053219
e    0.553559
f    0.398240
g    0.178585
dtype: float64

s[criteria_1]:
d    0.053219
e    0.553559
f    0.398240
g    0.178585
dtype: float64

s[s.index>"c"]:
d    0.053219
e    0.553559
f    0.398240
g    0.178585
dtype: float64

s[criteria_2]:
d    0.053219
e    0.553559
f    0.398240
g    0.178585
dtype: float64



In [3]:
# Two ways to select the rows labelled 'a' and 'c': a boolean mask built with
# Index.isin, and label-based selection with .loc.
ops = '''s[s.index.isin(['a','c'])] s.loc[['a','c']]'''
print_ops(ops)

s[s.index.isin(['a','c'])]:
a   -1.181167
c   -0.775126
dtype: float64

s.loc[['a','c']]:
a   -1.181167
c   -0.775126
dtype: float64



### More complex boolean indexing and reverse criteria

In [4]:
s = pd.Series(np.arange(500))
# `&` binds more tightly than `|`, so the combined mask reads as
# (even AND multiple of 13) OR (multiple of 100); the grouping is spelled out here.
is_even_multiple_of_13 = s.mod(2).eq(0) & s.mod(13).eq(0)
is_multiple_of_100 = s.mod(100).eq(0)
criteria = is_even_multiple_of_13 | is_multiple_of_100
# `~criteria` negates the mask element-wise and selects the complementary rows.
print_ops('criteria.head(5) s[criteria].head(7) s[~criteria].head(7)')

criteria.head(5):
0     True
1    False
2    False
3    False
4    False
dtype: bool

s[criteria].head(7):
0        0
26      26
52      52
78      78
100    100
104    104
130    130
dtype: int32

s[~criteria].head(7):
1    1
2    2
3    3
4    4
5    5
6    6
7    7
dtype: int32



# Case Study: Calculating Normality of Stock Market Returns
We will examine the returns of different stocks to determine whether their daily returns follow a normal distribution. Before we begin, you will have to install the third-party library **`pandas-datareader`**. This library does not come pre-packaged with Anaconda, so you will have to install it by opening a terminal and running the following command:

`pip install pandas-datareader`

### Pandas Datareader
[Pandas Datareader](http://pandas-datareader.readthedocs.io/en/latest/) is a nice package that retrieves financial data from different online data sources. In this example we will import the **`data`** module and use the **`DataReader`** function to retrieve Amazon stock price data from Google Finance from 2010 to the present.

In [8]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top so dependencies are visible up front.
from pandas_datareader import data
# Fetch Amazon ('amzn') price data from the 'google' source. No start/end dates
# are passed, so the date range is whatever DataReader uses by default.
# NOTE(review): the 'google' source may no longer be available in current
# pandas-datareader releases -- confirm before relying on this cell.
amzn = data.DataReader('amzn', 'google')
# Keep only the 'Close' column for the analysis that follows.
close = amzn['Close']

# Show the first rows of the full frame and of the close prices, plus the type of `close`.
ops = 'amzn.head(10) close.head(10) type(close)'
print_ops(ops)

amzn.head(10):
              Open    High     Low   Close    Volume
Date                                                
2010-01-04  136.25  136.61  133.14  133.90   7600543
2010-01-05  133.43  135.48  131.81  134.69   8856456
2010-01-06  134.60  134.73  131.65  132.25   7180977
2010-01-07  132.01  132.32  128.80  130.00  11030124
2010-01-08  130.56  133.68  129.03  133.52   9833829
2010-01-11  132.62  132.80  129.21  130.31   8786668
2010-01-12  128.99  129.82  126.55  127.35   9098190
2010-01-13  127.90  129.71  125.75  129.11  10727856
2010-01-14  129.14  130.38  126.40  127.35   9788435
2010-01-15  129.18  129.65  127.06  127.14  15382763

close.head(10):
Date
2010-01-04    133.90
2010-01-05    134.69
2010-01-06    132.25
2010-01-07    130.00
2010-01-08    133.52
2010-01-11    130.31
2010-01-12    127.35
2010-01-13    129.11
2010-01-14    127.35
2010-01-15    127.14
Name: Close, dtype: float64

type(close):
<class 'pandas.core.series.Series'>



### Fractional change (a fraction, not a percentage) between the current and previous element, and handling missing values

In [15]:
# Change of each closing price relative to the previous one, expressed as a
# fraction. The first entry has no predecessor, so it comes out as NaN.
close_change = close.pct_change()
# Remove the missing (NaN) entries so only real day-over-day changes remain.
close_change_dropna = close_change.dropna()
print_ops('close_change.head() close_change_dropna.head()')

close_change.head():
Date
2010-01-04         NaN
2010-01-05    0.005900
2010-01-06   -0.018116
2010-01-07   -0.017013
2010-01-08    0.027077
Name: Close, dtype: float64

close_change_dropna.head():
Date
2010-01-05    0.005900
2010-01-06   -0.018116
2010-01-07   -0.017013
2010-01-08    0.027077
2010-01-11   -0.024041
Name: Close, dtype: float64



In [20]:
# Toy series 0..9 (index 0..9) to make the behaviour of pct_change easy to follow.
s = pd.Series(data=range(10), index=range(10))
# Fractional change versus the previous element: NaN for the first element,
# inf for the 0 -> 1 step (division by zero).
s_pct_change = s.pct_change()
# dropna removes the leading NaN only; the inf entry is not treated as missing.
s_pct_change_drop = s_pct_change.dropna()
# Show the three series one after another, separated by a blank line.
print('\n\n'.join(map(str, (s, s_pct_change, s_pct_change_drop))))

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

0         NaN
1         inf
2    1.000000
3    0.500000
4    0.333333
5    0.250000
6    0.200000
7    0.166667
8    0.142857
9    0.125000
dtype: float64

1         inf
2    1.000000
3    0.500000
4    0.333333
5    0.250000
6    0.200000
7    0.166667
8    0.142857
9    0.125000
dtype: float64
