### Boolean Masking

Boolean masking is basically like a filter that can be placed on an array to select elements which satisfy certain criteria.

First, let's look at a function in NumPy, called `np.less`.

In [1]:
import numpy as np

m = np.array([-1, 1, -2, 2, -3, 3])
m

array([-1,  1, -2,  2, -3,  3])

In [2]:
np.less(m, 0)

array([ True, False,  True, False,  True, False])

In [3]:
m[np.less(m, 0)]

array([-1, -2, -3])

In [4]:
# Keep only the elements of `m` that are both positive and odd.
# Each comparison yields its own boolean mask; `&` combines them element-wise.
is_positive = m > 0
is_odd = m % 2 == 1
m[is_positive & is_odd]

array([1, 3])

In [5]:
m < 0

array([ True, False,  True, False,  True, False])

In [6]:
mask = m > 0
mask

array([False,  True, False,  True, False,  True])

In [7]:
m[mask]

array([1, 2, 3])

In [9]:
m = np.array(
    [
        [-1, 1, -2, 2, -3, 3],
        [-4, 4, -5, 5, -6, 6],
        [-7, 7, -8, 8, -9, 9],
    ]
)

In [10]:
negative_number = m[m < 0]
negative_number

array([-1, -2, -3, -4, -5, -6, -7, -8, -9])

In [11]:
arr = np.arange(-5, 6)
arr

array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5])

In [12]:
arr[(arr > 0) & (arr < 4)]

array([1, 2, 3])

In [15]:
arr

array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5])

In [14]:
# This masked selection is evaluated but not displayed: only the last
# expression of a cell is shown, and `arr` itself is left unchanged.
arr[~(arr < 0)]
arr  # print the original array, not the masked one

array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5])

In [16]:
arr[~(arr < 0)]

array([0, 1, 2, 3, 4, 5])

In [17]:
import csv
from dateutil import parser

In [18]:
# Dump every row of the CSV exactly as the csv module parses it (no
# whitespace stripping yet, so fields keep their leading spaces).
# newline='' hands line-ending handling to csv.reader itself, which is what
# the csv module documentation requires for files passed to the reader.
with open('AAPL.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)

['Symbol', ' Date', ' Close', ' Volume', ' Open', ' High', ' Low']
['AAPL', ' 10/29/2020', ' 115.32', ' 146129200', ' 112.37', ' 116.93', ' 112.2']
['AAPL', ' 10/28/2020', ' 111.2', ' 143937800', ' 115.05', ' 115.43', ' 111.1']
['AAPL', ' 10/27/2020', ' 116.6', ' 92276770', ' 115.49', ' 117.28', ' 114.5399']
['AAPL', ' 10/26/2020', ' 115.05', ' 111850700', ' 114.01', ' 116.55', ' 112.88']
['AAPL', ' 10/23/2020', ' 115.04', ' 82572650', ' 116.39', ' 116.55', ' 114.28']
['AAPL', ' 10/22/2020', ' 115.75', ' 101988000', ' 117.45', ' 118.04', ' 114.59']
['AAPL', ' 10/21/2020', ' 116.87', ' 89945980', ' 116.67', ' 118.705', ' 116.45']
['AAPL', ' 10/20/2020', ' 117.51', ' 124423700', ' 116.2', ' 118.98', ' 115.63']
['AAPL', ' 10/19/2020', ' 115.98', ' 120639300', ' 119.96', ' 120.419', ' 115.66']
['AAPL', ' 10/16/2020', ' 119.02', ' 115393800', ' 121.28', ' 121.548', ' 118.81']
['AAPL', ' 10/15/2020', ' 120.71', ' 112559200', ' 118.72', ' 121.2', ' 118.15']
['AAPL', ' 10/14/2020', ' 121.19', 

In [19]:
# Load the CSV into a header row plus a list of data rows.
# - newline='' lets csv.reader handle line endings itself, as the csv module
#   documentation requires.
# - skipinitialspace=True drops the blank that follows each comma in the file.
with open('AAPL.csv', newline='') as f:
    reader = csv.reader(f, skipinitialspace=True)
    headers = next(reader)  # first row holds the column names
    data = list(reader)     # every remaining row

# Everything is in memory now, so the printing happens after the file is closed.
print(headers)
print(data)

['Symbol', 'Date', 'Close', 'Volume', 'Open', 'High', 'Low']
[['AAPL', '10/29/2020', '115.32', '146129200', '112.37', '116.93', '112.2'], ['AAPL', '10/28/2020', '111.2', '143937800', '115.05', '115.43', '111.1'], ['AAPL', '10/27/2020', '116.6', '92276770', '115.49', '117.28', '114.5399'], ['AAPL', '10/26/2020', '115.05', '111850700', '114.01', '116.55', '112.88'], ['AAPL', '10/23/2020', '115.04', '82572650', '116.39', '116.55', '114.28'], ['AAPL', '10/22/2020', '115.75', '101988000', '117.45', '118.04', '114.59'], ['AAPL', '10/21/2020', '116.87', '89945980', '116.67', '118.705', '116.45'], ['AAPL', '10/20/2020', '117.51', '124423700', '116.2', '118.98', '115.63'], ['AAPL', '10/19/2020', '115.98', '120639300', '119.96', '120.419', '115.66'], ['AAPL', '10/16/2020', '119.02', '115393800', '121.28', '121.548', '118.81'], ['AAPL', '10/15/2020', '120.71', '112559200', '118.72', '121.2', '118.15'], ['AAPL', '10/14/2020', '121.19', '151062300', '121', '123.03', '119.62'], ['AAPL', '10/13/2020'

In [21]:
# Show the header row followed by each data row on its own line.
print("\n".join(str(line) for line in [headers, *data]))

['Symbol', 'Date', 'Close', 'Volume', 'Open', 'High', 'Low']
['AAPL', '10/29/2020', '115.32', '146129200', '112.37', '116.93', '112.2']
['AAPL', '10/28/2020', '111.2', '143937800', '115.05', '115.43', '111.1']
['AAPL', '10/27/2020', '116.6', '92276770', '115.49', '117.28', '114.5399']
['AAPL', '10/26/2020', '115.05', '111850700', '114.01', '116.55', '112.88']
['AAPL', '10/23/2020', '115.04', '82572650', '116.39', '116.55', '114.28']
['AAPL', '10/22/2020', '115.75', '101988000', '117.45', '118.04', '114.59']
['AAPL', '10/21/2020', '116.87', '89945980', '116.67', '118.705', '116.45']
['AAPL', '10/20/2020', '117.51', '124423700', '116.2', '118.98', '115.63']
['AAPL', '10/19/2020', '115.98', '120639300', '119.96', '120.419', '115.66']
['AAPL', '10/16/2020', '119.02', '115393800', '121.28', '121.548', '118.81']
['AAPL', '10/15/2020', '120.71', '112559200', '118.72', '121.2', '118.15']
['AAPL', '10/14/2020', '121.19', '151062300', '121', '123.03', '119.62']
['AAPL', '10/13/2020', '121.1', '2

In [22]:
arr = np.array(data)
arr

array([['AAPL', '10/29/2020', '115.32', '146129200', '112.37', '116.93',
        '112.2'],
       ['AAPL', '10/28/2020', '111.2', '143937800', '115.05', '115.43',
        '111.1'],
       ['AAPL', '10/27/2020', '116.6', '92276770', '115.49', '117.28',
        '114.5399'],
       ['AAPL', '10/26/2020', '115.05', '111850700', '114.01', '116.55',
        '112.88'],
       ['AAPL', '10/23/2020', '115.04', '82572650', '116.39', '116.55',
        '114.28'],
       ['AAPL', '10/22/2020', '115.75', '101988000', '117.45', '118.04',
        '114.59'],
       ['AAPL', '10/21/2020', '116.87', '89945980', '116.67', '118.705',
        '116.45'],
       ['AAPL', '10/20/2020', '117.51', '124423700', '116.2', '118.98',
        '115.63'],
       ['AAPL', '10/19/2020', '115.98', '120639300', '119.96', '120.419',
        '115.66'],
       ['AAPL', '10/16/2020', '119.02', '115393800', '121.28', '121.548',
        '118.81'],
       ['AAPL', '10/15/2020', '120.71', '112559200', '118.72', '121.2',
        '11

In [24]:
# Column 1 of `arr` holds the date strings; parse each one into a datetime.
date_strings = arr[:, 1]
dates = np.array(list(map(parser.parse, date_strings)))
dates

array([datetime.datetime(2020, 10, 29, 0, 0),
       datetime.datetime(2020, 10, 28, 0, 0),
       datetime.datetime(2020, 10, 27, 0, 0),
       datetime.datetime(2020, 10, 26, 0, 0),
       datetime.datetime(2020, 10, 23, 0, 0),
       datetime.datetime(2020, 10, 22, 0, 0),
       datetime.datetime(2020, 10, 21, 0, 0),
       datetime.datetime(2020, 10, 20, 0, 0),
       datetime.datetime(2020, 10, 19, 0, 0),
       datetime.datetime(2020, 10, 16, 0, 0),
       datetime.datetime(2020, 10, 15, 0, 0),
       datetime.datetime(2020, 10, 14, 0, 0),
       datetime.datetime(2020, 10, 13, 0, 0),
       datetime.datetime(2020, 10, 12, 0, 0),
       datetime.datetime(2020, 10, 9, 0, 0),
       datetime.datetime(2020, 10, 8, 0, 0),
       datetime.datetime(2020, 10, 7, 0, 0),
       datetime.datetime(2020, 10, 6, 0, 0),
       datetime.datetime(2020, 10, 5, 0, 0),
       datetime.datetime(2020, 10, 2, 0, 0),
       datetime.datetime(2020, 10, 1, 0, 0),
       datetime.datetime(2020, 9, 30, 0, 

In [26]:
print(headers)

['Symbol', 'Date', 'Close', 'Volume', 'Open', 'High', 'Low']


In [25]:
# Build a float array whose columns are, in order: open, high, low, close.
# The column positions are looked up in `headers` rather than hard-coded, so
# the selection stays correct if the CSV column order changes, and a missing
# column raises ValueError instead of silently picking the wrong data.
ohlc_fields = ['Open', 'High', 'Low', 'Close']
ohlc = arr[:, [headers.index(field) for field in ohlc_fields]].astype(float)
ohlc

array([[112.37  , 116.93  , 112.2   , 115.32  ],
       [115.05  , 115.43  , 111.1   , 111.2   ],
       [115.49  , 117.28  , 114.5399, 116.6   ],
       [114.01  , 116.55  , 112.88  , 115.05  ],
       [116.39  , 116.55  , 114.28  , 115.04  ],
       [117.45  , 118.04  , 114.59  , 115.75  ],
       [116.67  , 118.705 , 116.45  , 116.87  ],
       [116.2   , 118.98  , 115.63  , 117.51  ],
       [119.96  , 120.419 , 115.66  , 115.98  ],
       [121.28  , 121.548 , 118.81  , 119.02  ],
       [118.72  , 121.2   , 118.15  , 120.71  ],
       [121.    , 123.03  , 119.62  , 121.19  ],
       [125.27  , 125.39  , 119.65  , 121.1   ],
       [120.06  , 125.18  , 119.2845, 124.4   ],
       [115.28  , 117.    , 114.92  , 116.97  ],
       [116.25  , 116.4   , 114.5901, 114.97  ],
       [114.62  , 115.55  , 114.13  , 115.08  ],
       [115.7   , 116.12  , 112.25  , 113.16  ],
       [113.91  , 116.65  , 113.55  , 116.5   ],
       [112.89  , 115.37  , 112.22  , 113.02  ],
       [117.64  , 11

In [27]:
ohlc[:, 3] > 116.0  # close (open, high, low, close)

array([False, False,  True, False, False, False,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False, False,
        True, False,  True, False, False])

In [28]:
ohlc[ohlc[:, 3] > 116.0]

array([[115.49  , 117.28  , 114.5399, 116.6   ],
       [116.67  , 118.705 , 116.45  , 116.87  ],
       [116.2   , 118.98  , 115.63  , 117.51  ],
       [121.28  , 121.548 , 118.81  , 119.02  ],
       [118.72  , 121.2   , 118.15  , 120.71  ],
       [121.    , 123.03  , 119.62  , 121.19  ],
       [125.27  , 125.39  , 119.65  , 121.1   ],
       [120.06  , 125.18  , 119.2845, 124.4   ],
       [115.28  , 117.    , 114.92  , 116.97  ],
       [113.91  , 116.65  , 113.55  , 116.5   ],
       [117.64  , 117.72  , 115.83  , 116.79  ]])

In [34]:
ohlc[:, 3]

array([115.32, 111.2 , 116.6 , 115.05, 115.04, 115.75, 116.87, 117.51,
       115.98, 119.02, 120.71, 121.19, 121.1 , 124.4 , 116.97, 114.97,
       115.08, 113.16, 116.5 , 113.02, 116.79, 115.81, 114.09])

In [35]:
ohlc

array([[112.37  , 116.93  , 112.2   , 115.32  ],
       [115.05  , 115.43  , 111.1   , 111.2   ],
       [115.49  , 117.28  , 114.5399, 116.6   ],
       [114.01  , 116.55  , 112.88  , 115.05  ],
       [116.39  , 116.55  , 114.28  , 115.04  ],
       [117.45  , 118.04  , 114.59  , 115.75  ],
       [116.67  , 118.705 , 116.45  , 116.87  ],
       [116.2   , 118.98  , 115.63  , 117.51  ],
       [119.96  , 120.419 , 115.66  , 115.98  ],
       [121.28  , 121.548 , 118.81  , 119.02  ],
       [118.72  , 121.2   , 118.15  , 120.71  ],
       [121.    , 123.03  , 119.62  , 121.19  ],
       [125.27  , 125.39  , 119.65  , 121.1   ],
       [120.06  , 125.18  , 119.2845, 124.4   ],
       [115.28  , 117.    , 114.92  , 116.97  ],
       [116.25  , 116.4   , 114.5901, 114.97  ],
       [114.62  , 115.55  , 114.13  , 115.08  ],
       [115.7   , 116.12  , 112.25  , 113.16  ],
       [113.91  , 116.65  , 113.55  , 116.5   ],
       [112.89  , 115.37  , 112.22  , 113.02  ],
       [117.64  , 11

In [39]:
# aaron - verification

print(ohlc[:10, 3])

[115.32 111.2  116.6  115.05 115.04 115.75 116.87 117.51 115.98 119.02]


In [43]:
# Walk the close column (index 3) and report the position and value of every
# close that is above 116.
for position, close_price in enumerate(ohlc[:, 3]):
    if not close_price > 116:
        continue
    print(position, close_price, sep="\n")

2
116.6
6
116.87
7
117.51
9
119.02
10
120.71
11
121.19
12
121.1
13
124.4
14
116.97
18
116.5
20
116.79


In [44]:
ohlc[ohlc[:, 3] > 116]

array([[115.49  , 117.28  , 114.5399, 116.6   ],
       [116.67  , 118.705 , 116.45  , 116.87  ],
       [116.2   , 118.98  , 115.63  , 117.51  ],
       [121.28  , 121.548 , 118.81  , 119.02  ],
       [118.72  , 121.2   , 118.15  , 120.71  ],
       [121.    , 123.03  , 119.62  , 121.19  ],
       [125.27  , 125.39  , 119.65  , 121.1   ],
       [120.06  , 125.18  , 119.2845, 124.4   ],
       [115.28  , 117.    , 114.92  , 116.97  ],
       [113.91  , 116.65  , 113.55  , 116.5   ],
       [117.64  , 117.72  , 115.83  , 116.79  ]])

In [45]:
115.49 > 116

False

In [46]:
ohlc[:, 3]

array([115.32, 111.2 , 116.6 , 115.05, 115.04, 115.75, 116.87, 117.51,
       115.98, 119.02, 120.71, 121.19, 121.1 , 124.4 , 116.97, 114.97,
       115.08, 113.16, 116.5 , 113.02, 116.79, 115.81, 114.09])

In [48]:
mask = ohlc[:, 3] > 116
mask

array([False, False,  True, False, False, False,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False, False,
        True, False,  True, False, False])

In [49]:
ohlc[mask]

array([[115.49  , 117.28  , 114.5399, 116.6   ],
       [116.67  , 118.705 , 116.45  , 116.87  ],
       [116.2   , 118.98  , 115.63  , 117.51  ],
       [121.28  , 121.548 , 118.81  , 119.02  ],
       [118.72  , 121.2   , 118.15  , 120.71  ],
       [121.    , 123.03  , 119.62  , 121.19  ],
       [125.27  , 125.39  , 119.65  , 121.1   ],
       [120.06  , 125.18  , 119.2845, 124.4   ],
       [115.28  , 117.    , 114.92  , 116.97  ],
       [113.91  , 116.65  , 113.55  , 116.5   ],
       [117.64  , 117.72  , 115.83  , 116.79  ]])

### What's the problem in the above validation?

- The condition we applied is `close` > 116.
- However, the first number shown by `ohlc[mask]` is `115.49`, which is less than 116.
- That's not an error: `ohlc[mask]` prints all four columns of each selected row, while the condition only applies to column 3 (the fourth column, which is `close`).
> - You can use the following to see just the `close` values which are greater than 116:

In [50]:
# Select only the close column (index 3) of the rows whose close is above 116,
# using the row mask and the column index in a single indexing operation.
closed_above = ohlc[:, 3] > 116
ohlc[closed_above, 3]

array([116.6 , 116.87, 117.51, 119.02, 120.71, 121.19, 121.1 , 124.4 ,
       116.97, 116.5 , 116.79])

In [55]:
# Pair each matching date with its OHLC row; the row stays packed as an array.
filtered_data = [list(pair) for pair in zip(dates[mask], ohlc[mask])]

filtered_data

[[datetime.datetime(2020, 10, 27, 0, 0),
  array([115.49  , 117.28  , 114.5399, 116.6   ])],
 [datetime.datetime(2020, 10, 21, 0, 0),
  array([116.67 , 118.705, 116.45 , 116.87 ])],
 [datetime.datetime(2020, 10, 20, 0, 0),
  array([116.2 , 118.98, 115.63, 117.51])],
 [datetime.datetime(2020, 10, 16, 0, 0),
  array([121.28 , 121.548, 118.81 , 119.02 ])],
 [datetime.datetime(2020, 10, 15, 0, 0),
  array([118.72, 121.2 , 118.15, 120.71])],
 [datetime.datetime(2020, 10, 14, 0, 0),
  array([121.  , 123.03, 119.62, 121.19])],
 [datetime.datetime(2020, 10, 13, 0, 0),
  array([125.27, 125.39, 119.65, 121.1 ])],
 [datetime.datetime(2020, 10, 12, 0, 0),
  array([120.06  , 125.18  , 119.2845, 124.4   ])],
 [datetime.datetime(2020, 10, 9, 0, 0),
  array([115.28, 117.  , 114.92, 116.97])],
 [datetime.datetime(2020, 10, 5, 0, 0),
  array([113.91, 116.65, 113.55, 116.5 ])],
 [datetime.datetime(2020, 10, 1, 0, 0),
  array([117.64, 117.72, 115.83, 116.79])]]

In [56]:
for row in filtered_data:
    print(row)

[datetime.datetime(2020, 10, 27, 0, 0), array([115.49  , 117.28  , 114.5399, 116.6   ])]
[datetime.datetime(2020, 10, 21, 0, 0), array([116.67 , 118.705, 116.45 , 116.87 ])]
[datetime.datetime(2020, 10, 20, 0, 0), array([116.2 , 118.98, 115.63, 117.51])]
[datetime.datetime(2020, 10, 16, 0, 0), array([121.28 , 121.548, 118.81 , 119.02 ])]
[datetime.datetime(2020, 10, 15, 0, 0), array([118.72, 121.2 , 118.15, 120.71])]
[datetime.datetime(2020, 10, 14, 0, 0), array([121.  , 123.03, 119.62, 121.19])]
[datetime.datetime(2020, 10, 13, 0, 0), array([125.27, 125.39, 119.65, 121.1 ])]
[datetime.datetime(2020, 10, 12, 0, 0), array([120.06  , 125.18  , 119.2845, 124.4   ])]
[datetime.datetime(2020, 10, 9, 0, 0), array([115.28, 117.  , 114.92, 116.97])]
[datetime.datetime(2020, 10, 5, 0, 0), array([113.91, 116.65, 113.55, 116.5 ])]
[datetime.datetime(2020, 10, 1, 0, 0), array([117.64, 117.72, 115.83, 116.79])]


In [57]:
# A plain-Python stand-in for one (date, OHLC row) pair.
date, numbers = '10/1/2020', [100, 200, 300, 400]

# Placing the list inside another list keeps it nested as a single element.
[date, numbers]

['10/1/2020', [100, 200, 300, 400]]

> We can `unpack` an iterable using `*`:

In [58]:
numbers

[100, 200, 300, 400]

In [59]:
*numbers  # intentional: a starred expression on its own is rejected with SyntaxError

SyntaxError: can't use starred expression here (3428227854.py, line 1)

In [60]:
[*numbers]

[100, 200, 300, 400]

In [62]:
a = (*numbers)  # intentional: wrapping the starred expression in plain parentheses is still a SyntaxError
a

SyntaxError: cannot use starred expression here (3825842964.py, line 1)

In [63]:
[date, *numbers]

['10/1/2020', 100, 200, 300, 400]

In [64]:
# Build flat rows: the date followed by that day's four OHLC values,
# unpacked out of the array with `*`.
filter_data = []
for day, prices in zip(dates[mask], ohlc[mask]):
    filter_data.append([day, *prices])

for entry in filter_data:
    print(entry)

[datetime.datetime(2020, 10, 27, 0, 0), 115.49, 117.28, 114.5399, 116.6]
[datetime.datetime(2020, 10, 21, 0, 0), 116.67, 118.705, 116.45, 116.87]
[datetime.datetime(2020, 10, 20, 0, 0), 116.2, 118.98, 115.63, 117.51]
[datetime.datetime(2020, 10, 16, 0, 0), 121.28, 121.548, 118.81, 119.02]
[datetime.datetime(2020, 10, 15, 0, 0), 118.72, 121.2, 118.15, 120.71]
[datetime.datetime(2020, 10, 14, 0, 0), 121.0, 123.03, 119.62, 121.19]
[datetime.datetime(2020, 10, 13, 0, 0), 125.27, 125.39, 119.65, 121.1]
[datetime.datetime(2020, 10, 12, 0, 0), 120.06, 125.18, 119.2845, 124.4]
[datetime.datetime(2020, 10, 9, 0, 0), 115.28, 117.0, 114.92, 116.97]
[datetime.datetime(2020, 10, 5, 0, 0), 113.91, 116.65, 113.55, 116.5]
[datetime.datetime(2020, 10, 1, 0, 0), 117.64, 117.72, 115.83, 116.79]


> Compare with the original:
> - the original keeps the open, high, low, close values together as one packed array in each row, which is not what we want.
> - therefore we use unpacking (`[..., *something, ...]`) to spread all the values of that array out into individual floats

In [66]:
# The original *packed* form: each row is a date plus one nested OHLC array,
# which might NOT be what we want.
filtered_data = [list(pair) for pair in zip(dates[mask], ohlc[mask])]

for entry in filtered_data:
    print(entry)

[datetime.datetime(2020, 10, 27, 0, 0), array([115.49  , 117.28  , 114.5399, 116.6   ])]
[datetime.datetime(2020, 10, 21, 0, 0), array([116.67 , 118.705, 116.45 , 116.87 ])]
[datetime.datetime(2020, 10, 20, 0, 0), array([116.2 , 118.98, 115.63, 117.51])]
[datetime.datetime(2020, 10, 16, 0, 0), array([121.28 , 121.548, 118.81 , 119.02 ])]
[datetime.datetime(2020, 10, 15, 0, 0), array([118.72, 121.2 , 118.15, 120.71])]
[datetime.datetime(2020, 10, 14, 0, 0), array([121.  , 123.03, 119.62, 121.19])]
[datetime.datetime(2020, 10, 13, 0, 0), array([125.27, 125.39, 119.65, 121.1 ])]
[datetime.datetime(2020, 10, 12, 0, 0), array([120.06  , 125.18  , 119.2845, 124.4   ])]
[datetime.datetime(2020, 10, 9, 0, 0), array([115.28, 117.  , 114.92, 116.97])]
[datetime.datetime(2020, 10, 5, 0, 0), array([113.91, 116.65, 113.55, 116.5 ])]
[datetime.datetime(2020, 10, 1, 0, 0), array([117.64, 117.72, 115.83, 116.79])]


In [1]:
import numpy as np

In [2]:
m = np.array([-1, 1, -2, 2, -3, 3])

We can call the `np.less` function to determine which values in `m` are less than some value:

In [3]:
np.less(m, 0)

array([ True, False,  True, False,  True, False])

As you can see, we get an array of Boolean values indicating whether each value was less than `0` or not.

Although we can use this function, we can also just use the Python comparison operators, which will in turn use these NumPy comparison functions transparently:

In [4]:
m < 0

array([ True, False,  True, False,  True, False])

Now, the result is an array, and we can assign that to a variable. 
We're going to call that variable `mask`:

In [5]:
mask = m > 0

In [6]:
mask

array([False,  True, False,  True, False,  True])

Then, we can "filter" (or **mask**) the elements of `m` using this array of Boolean values:

In [7]:
m[mask]

array([1, 2, 3])

We can simplify this by specifying the indexing and the boolean comparison in a single statement:

In [8]:
m[m > 0]

array([1, 2, 3])

Our original array had a single dimension, and our masked array also has a single dimension, and we probably would have expected that.

However, when we mask a higher-dimensional array with a Boolean array of the same shape, the masked array will still be 1-dimensional.

In [9]:
m = np.array(
    [
        [-1, 1, -2, 2],
        [-3, 3, -4, 4],
        [-5, 5, -6, 6]
    ]
)

In [10]:
negative_numbers = m[m < 0]

In [11]:
negative_numbers

array([-1, -2, -3, -4, -5, -6])

We can combine these boolean masks using **and**, **or** and **not** as well.

However, we have to use `&` for **and**, `|` for **or** and `~` for **not**, not the standard Python operators `and`, `or` and `not`.

In [12]:
arr = np.arange(-5, 6)
arr

array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5])

In [13]:
arr[(arr > 0) & (arr < 4)]

array([1, 2, 3])

Note also that because of operator precedence we **have** to use the parentheses shown above.

For the **not** operator, we use `~`:

In [14]:
arr[~(arr < 0)]

array([0, 1, 2, 3, 4, 5])

Of course, we should probably just use the `>=` operator instead - simpler to understand:

In [15]:
arr[arr >= 0]

array([0, 1, 2, 3, 4, 5])

Let's look at an example where we want to filter certain rows of data based on the values in one of the columns.

We'll use the file `AAPL.csv` that we've seen before, and first load it up into two arrays - one for the dates, and the other with the OHLC numerical data.

In [16]:
import csv
from dateutil import parser

Let's first recall what's in the file:

In [17]:
# Read the header row and all data rows into memory.
# - newline='' lets csv.reader handle line endings itself, as the csv module
#   documentation requires.
# - skipinitialspace=True drops the blank that follows each comma in the file.
with open('AAPL.csv', newline='') as f:
    reader = csv.reader(f, skipinitialspace=True)
    headers = next(reader)  # first row holds the column names
    data = list(reader)     # every remaining row

print(headers)
for row in data:
    print(row)

['Symbol', 'Date', 'Close', 'Volume', 'Open', 'High', 'Low']
['AAPL', '10/29/2020', '115.32', '146129200', '112.37', '116.93', '112.2']
['AAPL', '10/28/2020', '111.2', '143937800', '115.05', '115.43', '111.1']
['AAPL', '10/27/2020', '116.6', '92276770', '115.49', '117.28', '114.5399']
['AAPL', '10/26/2020', '115.05', '111850700', '114.01', '116.55', '112.88']
['AAPL', '10/23/2020', '115.04', '82572650', '116.39', '116.55', '114.28']
['AAPL', '10/22/2020', '115.75', '101988000', '117.45', '118.04', '114.59']
['AAPL', '10/21/2020', '116.87', '89945980', '116.67', '118.705', '116.45']
['AAPL', '10/20/2020', '117.51', '124423700', '116.2', '118.98', '115.63']
['AAPL', '10/19/2020', '115.98', '120639300', '119.96', '120.419', '115.66']
['AAPL', '10/16/2020', '119.02', '115393800', '121.28', '121.548', '118.81']
['AAPL', '10/15/2020', '120.71', '112559200', '118.72', '121.2', '118.15']
['AAPL', '10/14/2020', '121.19', '151062300', '121', '123.03', '119.62']
['AAPL', '10/13/2020', '121.1', '2

We load up everything into a NumPy array:

In [18]:
arr = np.array(data)
arr

array([['AAPL', '10/29/2020', '115.32', '146129200', '112.37', '116.93',
        '112.2'],
       ['AAPL', '10/28/2020', '111.2', '143937800', '115.05', '115.43',
        '111.1'],
       ['AAPL', '10/27/2020', '116.6', '92276770', '115.49', '117.28',
        '114.5399'],
       ['AAPL', '10/26/2020', '115.05', '111850700', '114.01', '116.55',
        '112.88'],
       ['AAPL', '10/23/2020', '115.04', '82572650', '116.39', '116.55',
        '114.28'],
       ['AAPL', '10/22/2020', '115.75', '101988000', '117.45', '118.04',
        '114.59'],
       ['AAPL', '10/21/2020', '116.87', '89945980', '116.67', '118.705',
        '116.45'],
       ['AAPL', '10/20/2020', '117.51', '124423700', '116.2', '118.98',
        '115.63'],
       ['AAPL', '10/19/2020', '115.98', '120639300', '119.96', '120.419',
        '115.66'],
       ['AAPL', '10/16/2020', '119.02', '115393800', '121.28', '121.548',
        '118.81'],
       ['AAPL', '10/15/2020', '120.71', '112559200', '118.72', '121.2',
        '11

Next we make an array of just the dates:

In [19]:
# Column 1 of `arr` holds the date strings; parse each one into a datetime.
date_strings = arr[:, 1]
dates = np.array(list(map(parser.parse, date_strings)))
dates

array([datetime.datetime(2020, 10, 29, 0, 0),
       datetime.datetime(2020, 10, 28, 0, 0),
       datetime.datetime(2020, 10, 27, 0, 0),
       datetime.datetime(2020, 10, 26, 0, 0),
       datetime.datetime(2020, 10, 23, 0, 0),
       datetime.datetime(2020, 10, 22, 0, 0),
       datetime.datetime(2020, 10, 21, 0, 0),
       datetime.datetime(2020, 10, 20, 0, 0),
       datetime.datetime(2020, 10, 19, 0, 0),
       datetime.datetime(2020, 10, 16, 0, 0),
       datetime.datetime(2020, 10, 15, 0, 0),
       datetime.datetime(2020, 10, 14, 0, 0),
       datetime.datetime(2020, 10, 13, 0, 0),
       datetime.datetime(2020, 10, 12, 0, 0),
       datetime.datetime(2020, 10, 9, 0, 0),
       datetime.datetime(2020, 10, 8, 0, 0),
       datetime.datetime(2020, 10, 7, 0, 0),
       datetime.datetime(2020, 10, 6, 0, 0),
       datetime.datetime(2020, 10, 5, 0, 0),
       datetime.datetime(2020, 10, 2, 0, 0),
       datetime.datetime(2020, 10, 1, 0, 0),
       datetime.datetime(2020, 9, 30, 0, 

Then we make a float array of the numerical values:

In [20]:
# Build a float array whose columns are, in order: open, high, low, close.
# The column positions are looked up in `headers` rather than hard-coded, so
# the selection stays correct if the CSV column order changes, and a missing
# column raises ValueError instead of silently picking the wrong data.
ohlc_fields = ['Open', 'High', 'Low', 'Close']
ohlc = arr[:, [headers.index(field) for field in ohlc_fields]].astype(float)
ohlc

array([[112.37  , 116.93  , 112.2   , 115.32  ],
       [115.05  , 115.43  , 111.1   , 111.2   ],
       [115.49  , 117.28  , 114.5399, 116.6   ],
       [114.01  , 116.55  , 112.88  , 115.05  ],
       [116.39  , 116.55  , 114.28  , 115.04  ],
       [117.45  , 118.04  , 114.59  , 115.75  ],
       [116.67  , 118.705 , 116.45  , 116.87  ],
       [116.2   , 118.98  , 115.63  , 117.51  ],
       [119.96  , 120.419 , 115.66  , 115.98  ],
       [121.28  , 121.548 , 118.81  , 119.02  ],
       [118.72  , 121.2   , 118.15  , 120.71  ],
       [121.    , 123.03  , 119.62  , 121.19  ],
       [125.27  , 125.39  , 119.65  , 121.1   ],
       [120.06  , 125.18  , 119.2845, 124.4   ],
       [115.28  , 117.    , 114.92  , 116.97  ],
       [116.25  , 116.4   , 114.5901, 114.97  ],
       [114.62  , 115.55  , 114.13  , 115.08  ],
       [115.7   , 116.12  , 112.25  , 113.16  ],
       [113.91  , 116.65  , 113.55  , 116.5   ],
       [112.89  , 115.37  , 112.22  , 113.02  ],
       [117.64  , 11

What we want is to identify the days on which the stock closed higher than `116.00`.

To do this, we basically want to create a mask on column #3 (index 3 of `ohlc`, which holds the close price):

In [21]:
ohlc[:, 3] > 116.0

array([False, False,  True, False, False, False,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False, False,
        True, False,  True, False, False])

We could use that mask on the `ohlc` array to keep only the records which closed higher than `116.00`:

In [22]:
ohlc[ohlc[:, 3] > 116.0]

array([[115.49  , 117.28  , 114.5399, 116.6   ],
       [116.67  , 118.705 , 116.45  , 116.87  ],
       [116.2   , 118.98  , 115.63  , 117.51  ],
       [121.28  , 121.548 , 118.81  , 119.02  ],
       [118.72  , 121.2   , 118.15  , 120.71  ],
       [121.    , 123.03  , 119.62  , 121.19  ],
       [125.27  , 125.39  , 119.65  , 121.1   ],
       [120.06  , 125.18  , 119.2845, 124.4   ],
       [115.28  , 117.    , 114.92  , 116.97  ],
       [113.91  , 116.65  , 113.55  , 116.5   ],
       [117.64  , 117.72  , 115.83  , 116.79  ]])

But what we're really after are those dates.

Now the `dates` and `ohlc` arrays are in the same order, so we can use the mask we obtain from looking at `ohlc` and apply that mask to the `dates` array:

In [23]:
dates[ohlc[:, 3] > 116.0]

array([datetime.datetime(2020, 10, 27, 0, 0),
       datetime.datetime(2020, 10, 21, 0, 0),
       datetime.datetime(2020, 10, 20, 0, 0),
       datetime.datetime(2020, 10, 16, 0, 0),
       datetime.datetime(2020, 10, 15, 0, 0),
       datetime.datetime(2020, 10, 14, 0, 0),
       datetime.datetime(2020, 10, 13, 0, 0),
       datetime.datetime(2020, 10, 12, 0, 0),
       datetime.datetime(2020, 10, 9, 0, 0),
       datetime.datetime(2020, 10, 5, 0, 0),
       datetime.datetime(2020, 10, 1, 0, 0)], dtype=object)

If we want to, we could re-assemble this data into a Python list so that we can have the dates and numerical values side by side:

In [24]:
mask = ohlc[:, 3] > 116.0

We could try this approach, which almost works:

In [25]:
# First attempt: pair each matching date with its OHLC row as-is, so every
# resulting row has two elements (the date and a packed array).
filtered_data = [list(pair) for pair in zip(dates[mask], ohlc[mask])]

for entry in filtered_data:
    print(entry)

[datetime.datetime(2020, 10, 27, 0, 0), array([115.49  , 117.28  , 114.5399, 116.6   ])]
[datetime.datetime(2020, 10, 21, 0, 0), array([116.67 , 118.705, 116.45 , 116.87 ])]
[datetime.datetime(2020, 10, 20, 0, 0), array([116.2 , 118.98, 115.63, 117.51])]
[datetime.datetime(2020, 10, 16, 0, 0), array([121.28 , 121.548, 118.81 , 119.02 ])]
[datetime.datetime(2020, 10, 15, 0, 0), array([118.72, 121.2 , 118.15, 120.71])]
[datetime.datetime(2020, 10, 14, 0, 0), array([121.  , 123.03, 119.62, 121.19])]
[datetime.datetime(2020, 10, 13, 0, 0), array([125.27, 125.39, 119.65, 121.1 ])]
[datetime.datetime(2020, 10, 12, 0, 0), array([120.06  , 125.18  , 119.2845, 124.4   ])]
[datetime.datetime(2020, 10, 9, 0, 0), array([115.28, 117.  , 114.92, 116.97])]
[datetime.datetime(2020, 10, 5, 0, 0), array([113.91, 116.65, 113.55, 116.5 ])]
[datetime.datetime(2020, 10, 1, 0, 0), array([117.64, 117.72, 115.83, 116.79])]


The problem here is that our resulting rows contain two elements: the date and an array containing that row's OHLC data.

We just need to unpack the numeric items in each row, and zip that up.

Before we do it, let's just look at that concept again and recall how this works using a simpler example.

In [26]:
# A plain-Python stand-in for one (date, OHLC row) pair.
date, numbers = '10/1/2020', [100, 200, 300, 400]

# Placing the list inside another list keeps it nested as a single element.
[date, numbers]

['10/1/2020', [100, 200, 300, 400]]

As you can see, we have the same problem here.

Recall that we can unpack an iterable using `*`:

In [27]:
[date, *numbers]

['10/1/2020', 100, 200, 300, 400]

As you can see, this does exactly what we want, so we can apply that to our problem.

In [28]:
# Flat rows: the date followed by that day's four OHLC values.
filtered_data = [
    [day] + list(prices)
    for day, prices in zip(dates[mask], ohlc[mask])
]

for entry in filtered_data:
    print(entry)

[datetime.datetime(2020, 10, 27, 0, 0), 115.49, 117.28, 114.5399, 116.6]
[datetime.datetime(2020, 10, 21, 0, 0), 116.67, 118.705, 116.45, 116.87]
[datetime.datetime(2020, 10, 20, 0, 0), 116.2, 118.98, 115.63, 117.51]
[datetime.datetime(2020, 10, 16, 0, 0), 121.28, 121.548, 118.81, 119.02]
[datetime.datetime(2020, 10, 15, 0, 0), 118.72, 121.2, 118.15, 120.71]
[datetime.datetime(2020, 10, 14, 0, 0), 121.0, 123.03, 119.62, 121.19]
[datetime.datetime(2020, 10, 13, 0, 0), 125.27, 125.39, 119.65, 121.1]
[datetime.datetime(2020, 10, 12, 0, 0), 120.06, 125.18, 119.2845, 124.4]
[datetime.datetime(2020, 10, 9, 0, 0), 115.28, 117.0, 114.92, 116.97]
[datetime.datetime(2020, 10, 5, 0, 0), 113.91, 116.65, 113.55, 116.5]
[datetime.datetime(2020, 10, 1, 0, 0), 117.64, 117.72, 115.83, 116.79]
