# Numpy

In [93]:
import numpy as np # import convention

np.random.seed(1337) # important if you want to reproduce your code exactly and you are using random functionality

NumPy provides the `ndarray`: a __fixed-type__ array with advanced mathematical and data manipulation functionality.

In [3]:
# dtype forces every element to int64, so the float 3.5 is truncated to 3 (see the output below)
np.array([1,2,2,3.5], dtype='int64')

array([1, 2, 2, 3], dtype=int64)

### How to create a Numpy ndarray? 

* Create a new array, or derive one from an existing object such as a Python list or tuple
* Load data from a .npy file


In [13]:
# Build an ndarray out of an ordinary Python sequence (a list here)
some_list = [1, 2, 3, 4]
new_array = np.array(some_list)

# repr() gives the array(...) form; type() confirms the result is a numpy.ndarray
print(
    'we have: {} which from type: {}'.format(
        repr(new_array),
        type(new_array),
    )
)

we have: array([1, 2, 3, 4]) which from type: <class 'numpy.ndarray'>


In [14]:
# Round-trip the array through a .npy file: np.save is given the name without
# an extension, and the file is read back as 'numpy_array_file.npy'
np.save('numpy_array_file', new_array)
loaded_array = np.load('numpy_array_file.npy')

print(
    'we have: {} which from type: {}'.format(
        repr(loaded_array),
        type(loaded_array),
    )
)

we have: array([1, 2, 3, 4]) which from type: <class 'numpy.ndarray'>


## Typing

In day-to-day usage you typically won't need to care about the types of your ndarrays - that's the Pythonic way!<BR>Once you run something in production, or execute a job that reaches your memory limits, you'll start caring about your types.<BR>You can check the type of the elements in an array with `.dtype`

In [23]:
# .dtype reports the single element type shared by all items
# (int32 in the recorded output; NumPy's default integer width can differ by platform/version)
new_array.dtype

dtype('int32')

You can cast to a different type with `.astype`

In [31]:
# astype produces a converted array for the requested target type
print(new_array)
print(new_array.astype(str))         # Python type instead of the 'str' dtype string
print(new_array.astype(np.float64))  # NumPy scalar type instead of the 'float64' dtype string

[1 2 3 4]
['1' '2' '3' '4']
[1. 2. 3. 4.]


Note that an ndarray holds values of a single type, so NumPy casts all of your values to one common type that can represent every one of them:

In [35]:
# ints mixed with a float: every element is upcast to float (note the trailing dots in the output)
np.array([1,2,3.0])

array([1., 2., 3.])

In [36]:
# a single string turns every element into a fixed-width unicode string (dtype '<U11' in the output)
np.array([1,2,'3'])

array(['1', '2', '3'], dtype='<U11')

In [32]:
# a dict has no common numeric/string type with the numbers, so the array falls back to dtype=object
np.array([1, 2.0, {'1':3}])

array([1, 2.0, {'1': 3}], dtype=object)

## Shapes

An ndarray, as its name suggests, can have n dimensions.

In [60]:
# The shape tuple has four entries, so the array has four axes (lengths 2, 3, 2 and 4)
array_with_4_dim = np.ones(shape=(2, 3, 2, 4))
print(np.ndim(array_with_4_dim))

4


To get the total (flattened) number of elements without modifying the ndarray itself, you can use `array.size`

In [61]:
# total element count across all axes: 2 * 3 * 2 * 4 = 48
array_with_4_dim.size

48

We can change the number of dimensions as we wish with `array.reshape`, which returns a reshaped array - as long as the total number of elements stays the same

In [63]:
# reshape returns a NEW array and leaves array_with_4_dim untouched,
# so keep the result and inspect it rather than the original.
reshaped_array = array_with_4_dim.reshape(12, 4)
print(reshaped_array.ndim)  # 2 dimensions now ...
print(reshaped_array.size)  # ... still holding the same 48 elements

2
48


In [64]:
# The element count must be preserved: 48 elements cannot fill a (13, 1) shape.
# The failure is the point of this cell, so catch it and show the message instead
# of letting the exception stop a "Restart & Run All".
try:
    array_with_4_dim.reshape(13, 1)
except ValueError as err:
    print('{}: {}'.format(type(err).__name__, err))

ValueError: cannot reshape array of size 48 into shape (13,1)

# Indexing & Slicing

If you are already familiar with Python, it's pretty much the same as indexing and slicing lists - you just need to remember that here we often have multiple dimensions.

In [68]:
# 3-D array of shape (100, 20, 30) filled with random samples
array = np.random.randn(100,20,30)
# Printed only for illustration: large arrays are abbreviated with "..." in the output
print(array)

[[[ 0.52938652 -0.83994697  0.32864716 ...  0.3762326   0.156106
   -0.97743426]
  [-0.6594944  -0.45703816 -0.6702949  ...  0.53688367 -0.6240286
   -0.51758273]
  [-1.73807483  2.61795864  1.17218913 ... -0.23255257 -0.40140325
    1.6841075 ]
  ...
  [-1.19714171  0.0035688   0.60915884 ... -0.61207464 -0.32521271
    0.20707459]
  [-0.17937436  0.05275047  0.08467558 ...  0.57802976  2.33252905
    1.649631  ]
  [-0.02209427 -2.11854351 -1.58157123 ...  1.49360365 -0.08060168
    0.42669526]]

 [[-1.16428365  0.12476508  1.47206711 ... -1.26552597 -1.23354345
    0.2221427 ]
  [-0.77463748  0.01185623  0.2165534  ... -1.37177047  1.25073027
    1.45602533]
  [ 2.01100922  1.82152192 -0.61011272 ...  0.45057522  0.43022343
   -0.76058846]
  ...
  [-0.24892789  0.96195701 -0.44650901 ...  0.21069819 -0.06369483
   -0.50684692]
  [-0.06061266 -1.7090016   0.22884191 ...  0.34483726  1.05322149
   -0.3856483 ]
  [-1.39063151 -0.67441875 -0.73880329 ...  0.19005337  1.57958906
    1.923

In [73]:
# indexing the first axis removes it: what is left is a (20, 30) array
array[0].shape

(20, 30)


In [74]:
# a second index peels off one more axis, leaving a 1-D array of 30 values
array[0][0].shape

(30,)

In [75]:
# one index per axis reaches a single scalar value
array[0][0][0]

0.5293865236292322

In [77]:
# first block, row index 1, column index 1
array[0][1][1]

-0.4570381585666605

In [78]:
# a slice on the last axis returns an array: elements 2, 3 and 4 of that row
array[0][1][2:5]

array([-0.6702949 ,  0.01303136,  1.28011943])

In [81]:
# Gotcha: chained slices act on the SAME axis. [3:7] keeps rows 3-6 of the first block,
# then [2:5] slices those four rows again (positions 2 and 3, i.e. original rows 5 and 6),
# so the result is two full rows of 30 values rather than a rows-by-columns window.
array[0][3:7][2:5]

array([[-0.71980177,  0.49225744, -0.65103468,  1.10912565, -1.20717885,
        -1.02946936, -0.20443938, -0.05153465,  0.15509645,  0.6151977 ,
        -0.63514605, -0.39025962, -0.63929214, -0.91052064,  1.84468561,
         0.5271816 ,  0.07433414, -2.75640775,  0.12227863, -0.55646033,
         1.46240417, -2.34231875, -0.47400237, -2.28288816, -1.38167892,
        -0.11479028, -0.27891198, -0.16512914,  0.07448304, -0.68718957],
       [-0.84895969,  0.0148896 ,  0.6619462 ,  0.22310783, -0.86563915,
         1.93451475, -0.36313688, -0.2933794 ,  0.88160805, -1.58928198,
         0.47021477,  0.87359654, -0.82437938, -0.21305495, -0.10447653,
        -0.0891155 ,  0.68734312,  0.35791004, -0.85589621,  1.27919389,
        -1.09601333,  1.27554935, -0.02915743,  0.18691832, -0.55739567,
        -0.98065161,  0.26945734, -0.76153821,  0.37863944,  0.12952512]])

In [83]:
# one comma-separated index per axis: block 0, rows 3-6, columns 2-4 -> a (4, 3) result
array[0, 3:7, 2:5]

array([[-0.51418918, -0.73175332, -0.81510193],
       [ 1.69581392,  0.70176038, -0.23375956],
       [-0.65103468,  1.10912565, -1.20717885],
       [ 0.6619462 ,  0.22310783, -0.86563915]])

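You need to understand exactly what you want to slice: chaining `[3:7][2:5]` slices the same (row) axis twice and returns whole rows, while `[0, 3:7, 2:5]` applies one slice per axis and returns rows 3-6 restricted to columns 2-4.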

# Statistical methods

You can extract various statistics about your arrays with built-in methods:

In [86]:
# 10 x 10 array of random samples used by the statistics examples below
x = np.random.randn(10,10)
print(x)

[[-1.06928945  1.18953254  0.01661448 -1.27747716 -0.71888577  0.69121574
   2.84230309 -1.87252539 -0.05785404 -0.27492886]
 [-0.44030645  0.6921194   2.12919315  0.87100109 -1.05207292  0.01069154
  -1.32661083 -1.67618595 -0.69389513 -1.48547955]
 [ 0.30601961  0.27577203  2.05613796 -0.26384822  0.19154534 -1.18722174
   1.31870991 -0.45889699  1.05948954  0.63837986]
 [-1.03047412 -0.22327308  0.35544532 -0.85283153 -1.48655106  1.20254967
  -0.99943595 -0.7517425  -2.00034688  0.87740231]
 [ 0.73681765  2.20626222  1.5098606  -1.49823601  2.50956105  0.52009767
  -2.59871962  0.08338329 -0.43204939  0.39063443]
 [-0.69621742 -0.46421735  1.26452802 -2.0950217  -0.85515019 -0.72727013
  -0.00367092 -2.06689706 -0.19161956 -0.04291323]
 [ 0.53577508 -1.10685103 -0.53492738 -0.12237227 -0.01074414 -0.09124817
  -0.12477244 -0.89283073 -0.35234412  0.6290123 ]
 [ 0.22089461 -0.30696394 -0.66953404 -0.02152393  0.63247217  0.72829756
  -0.29250438 -0.66768777 -0.06055272  1.39381033]


In [87]:
# no axis given: sum over every element, giving a single scalar
x.sum()

-1.4026516291894564

In [88]:
# axis=1 sums along each row, giving one value per row (10 values)
x.sum(axis=1)

array([-0.53129482, -2.97154565,  3.93608731, -4.90925783,  3.42761188,
       -5.87844954, -2.0713029 ,  0.9567079 ,  4.56713456,  2.07165746])

In [90]:
# sum of the first row only; matches the first entry of x.sum(axis=1)
x[0 ,:].sum()

-0.531294822333365

In [91]:
# variance computed over all elements of the array
x.var()

1.144775233558038

In [92]:
# largest element of the whole array
x.max()

2.8423030862011096

There are others; you can see the full list [here](https://docs.scipy.org/doc/numpy/reference/arrays.ndarray.html#calculation)

# Broadcasting

In general, Python loops are an abomination performance-wise. To handle that, NumPy introduced broadcasting: it applies an operation between arrays of different but compatible shapes without an explicit Python loop.

In [49]:
# 20 x 20 array filled with float zeros
zeros = np.zeros((20,20))
print(zeros)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

Now let's say we want to change the value in the last column to `2`. In a naive way we can:

In [50]:
%%timeit
for i in range(zeros.shape[0]):
    zeros[i][-1] + 2
    

28.6 µs ± 5.35 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [51]:
# One row-shaped vector (length = number of columns of `zeros`): all zeros ...
change_to_2 = np.zeros(zeros.shape[1])
# ... except the last entry, which carries the value to add to the last column
change_to_2[-1] = 2

In [52]:
# confirm that only the last entry is non-zero
print(change_to_2)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2.]


In [53]:
%%timeit
# Broadcasting: the length-20 vector is added to each of the 20 rows of `zeros`
# with no Python loop; the sum is a new array and `zeros` is not modified
zeros + change_to_2
    

4.85 µs ± 440 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Here you can read about the [rules of broadcasting](https://docs.scipy.org/doc/numpy/reference/ufuncs.html#broadcasting)

# Boolean filtering 

We can use boolean filtering to ask questions about our arrays (without loops!)

In [95]:
# fresh 5 x 5 random sample; note that this rebinds `x` from the statistics section
x = np.random.randn(5, 5)

In [98]:
# element-wise comparison: boolean mask marking first-row values above that row's mean
x[0] > x[0].mean()

array([ True,  True, False,  True,  True])

In [99]:
# same comparison on the whole array against the overall mean -> 5 x 5 boolean mask
x > x.mean()

array([[ True,  True, False, False,  True],
       [ True, False,  True, False,  True],
       [False,  True,  True, False,  True],
       [ True, False, False,  True,  True],
       [ True,  True, False,  True, False]])

In [100]:
# with only a condition, np.where returns a tuple of index arrays (one per axis) where it is True
np.where(x[0] > x[0].mean())

(array([0, 1, 3, 4], dtype=int64),)

In [96]:
# 2-D case: the first array holds the row indices and the second the column indices of the True entries
np.where(x > x.mean())

(array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int64),
 array([0, 1, 4, 0, 2, 4, 1, 2, 4, 0, 3, 4, 0, 1, 3], dtype=int64))

# Useful functionality

In [103]:
# Count how many elements are above the mean by counting the True entries of the mask directly.
# (Passing np.where(...) here would count non-zero *index values*, not the matching elements.)
np.count_nonzero(x > x.mean())

15

In [105]:
# Functions for joining arrays (bare names: the cell only displays the last function object)
np.concatenate  # join a sequence of arrays along an existing axis
np.vstack       # stack arrays vertically (row-wise)
np.hstack       # stack arrays horizontally (column-wise)

<function numpy.hstack(tup)>

In [None]:
# Functions for ordering data (bare names only; nothing is computed in this cell)
np.sort     # returns a sorted copy of an array
np.argsort  # returns the indices that would sort an array

Of course we only scratched the surface but that is enough for now :) 