---
### 3. Intro to NumPy
---

In [39]:
'''
NumPy is an extension to the Python programming language, adding support for large, 
multidimensional (numerical) arrays and matrices, along with a large library of high-level 
mathematical functions to operate on these arrays.
'''

import numpy as np

# Create lists
data1 = list(range(1, 5))  # flat list of 4 values (a nested [range(1, 5)] would give shape (1, 4))
data2 = [range(1, 5), range(5, 9)]  # 2 rows of 4 values
data3 = [range(1, 5), range(5, 9), range(9, 13)]  # 3 rows of 4 values

# Create arrays
arr1 = np.array(data1) # 1d array, shape (4,)
arr2 = np.array(data2) # 2d array, shape (2, 4)
arr3 = np.array(data3) # 2d array, shape (3, 4): three rows, not three dimensions

# Convert arrays back to (nested) lists
type(arr1.tolist()), type(arr2.tolist()), arr3.tolist()

(list, list, [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

In [2]:
# All zeros: the shape is passed as a tuple (3 rows, 2 columns)
np.zeros((3, 2))

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

In [3]:
# All ones: the shape is passed as a tuple (2 rows, 6 columns)
np.ones((2, 6))

array([[1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1.]])

In [4]:
# 5 evenly spaced points from 0 to 1 (both endpoints included)
np.linspace(0, 1, 5) 

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [5]:
# 4 points evenly spaced on a log scale, from 10^0 to 10^3 (both endpoints included)
np.logspace(0, 3, 4) 

array([   1.,   10.,  100., 1000.])

In [6]:
# arange is like range - except it returns an array (instead of a range object)
int_array = np.arange(5)
int_array

array([0, 1, 2, 3, 4])

In [7]:
# astype returns a new array converted to the requested dtype
float_array = int_array.astype(float)
float_array.dtype

dtype('float64')

In [8]:
# Examining arrays
arr1.dtype # integer dtype, not float64 (int32 in the run shown; the default integer width is platform-dependent)

dtype('int32')

In [9]:
arr2.ndim # 2 - number of dimensions (axes)

2

In [10]:
arr2.shape # (2, 4) - length of each axis: axis 0 is rows, axis 1 is columns

(2, 4)

In [11]:
arr2.size # 8 - total number of elements (product of the shape)

8

In [12]:
len(arr2) # 2 - size of first dimension (aka axis 0)

2

In [13]:
# Reshaping: the same 10 values laid out first as 2 x 5, then as 5 x 2
flat_values = np.arange(10, dtype=float)
arr = flat_values.reshape((2, 5))
print(arr.shape)
tall = arr.reshape(5, 2)
print(tall)

(2, 5)
[[0. 1.]
 [2. 3.]
 [4. 5.]
 [6. 7.]
 [8. 9.]]


In [14]:
# Add an axis: turn a 1d array of shape (3,) into a column of shape (3, 1)
a = np.array([0, 1, 2])
print('Original \n', a)

# np.newaxis inserts a new length-1 axis at that position
a_col = a[:, np.newaxis]
print('np.newaxis \n', a_col)

# Indexing with None gives the same result as np.newaxis
a_col = a[:, None]
print('None \n', a_col)

Original 
 [0 1 2]
np.newaxis 
 [[0]
 [1]
 [2]]
None 
 [[0]
 [1]
 [2]]


In [15]:
# Transpose: the (3, 1) column becomes a (1, 3) row
a_col.T

array([[0, 1, 2]])

In [16]:
# Flatten: always returns a flat copy of the original array
arr_flt = arr.flatten()
arr_flt[0] = 33 # modifies the copy only

print(arr_flt)
print(arr) # arr is unchanged

[33.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]


In [17]:
# Ravel: returns a view of the original array whenever possible.
arr_flt = arr.ravel()
arr_flt[0] = 33 # writes through the view, so arr changes too
print(arr_flt)
print(arr)

[33.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
[[33.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


In [18]:
# Stack arrays
a = np.array([0, 1])
b = np.array([2, 3])

# Horizontal stacking: the 1d arrays are joined end to end into one 1d array
np.hstack([a, b])

array([0, 1, 2, 3])

In [19]:
# Vertical stacking: each 1d array becomes one row of a 2d array
np.vstack([a, b])

array([[0, 1],
       [2, 3]])

In [20]:
# np.stack joins the arrays along a new axis (axis=0 by default),
# so for 1d inputs the result is the same as vstack
np.stack([a, b])

array([[0, 1],
       [2, 3]])

In [21]:
# Selection

# Single item
arr = np.arange(10, dtype=float).reshape((2, 5))
arr[0] # row 0 (indexing the first axis, like a list)
arr[0, 3] # row 0, column 3: returns 3.0
arr[0][3] # alternative syntax: take row 0, then its element 3

3.0

In [22]:
# Slicing

# Syntax: start:stop:step with start (default 0) stop (default end of axis) step (default 1)
arr[0, :] # row 0: returns 1d array ([0., 1., 2., 3., 4.])
arr[:, 0] # column 0: returns 1d array ([0., 5.])
arr[:, :2] # columns strictly before index 2 (2 first columns)
arr[:, 2:] # columns from index 2 (included) onwards
arr2 = arr[:, 1:4] # columns between index 1 (included) and 4 (excluded)
print(arr2)

[[1. 2. 3.]
 [6. 7. 8.]]


In [23]:
# Slicing returns a view (not a copy): modifying the slice also modifies the original array
arr2[0, 0] = 33
print(arr2)
print(arr)

[[33.  2.  3.]
 [ 6.  7.  8.]]
[[ 0. 33.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


In [24]:
# Row 0 in reverse order (a step of -1 walks the axis backwards)
print(arr[0, ::-1])

[ 4.  3.  2. 33.  0.]


In [25]:
# Fancy indexing: Integer or boolean array indexing - returns a copy not a view.

# Integer array indexing: select columns 1, 2 and 3
arr2 = arr[:, [1, 2, 3]]
print(arr2)
arr2[0, 0] = 44 # changes the copy only
print(arr2)
print(arr) # arr is unchanged

[[33.  2.  3.]
 [ 6.  7.  8.]]
[[44.  2.  3.]
 [ 6.  7.  8.]]
[[ 0. 33.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


In [26]:
# Boolean array indexing
arr2 = arr[arr > 5] # returns a 1d copy of the selected elements
print(arr2)
arr2[0] = 44 # changes the copy only
print(arr2)
print(arr)

# However, when fancy indexing appears on the left-hand side of an assignment,
# it modifies the original array in place
arr[arr > 5] = 0
print(arr)

[33.  6.  7.  8.  9.]
[44.  6.  7.  8.  9.]
[[ 0. 33.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]
[[0. 0. 2. 3. 4.]
 [5. 0. 0. 0. 0.]]


In [27]:
# Boolean array indexing (continued)
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Eduardo'])
names == 'Bob' # element-wise comparison: returns a boolean array

array([ True, False, False,  True, False])

In [28]:
names[names != 'Bob'] # logical selection: keeps the elements where the mask is True

array(['Joe', 'Will', 'Eduardo'], dtype='<U7')

In [29]:
(names == 'Bob') | (names == 'Will') # use | and & (with parentheses): keywords "and/or" don't work with boolean arrays

array([ True, False,  True,  True, False])

In [30]:
names[names != 'Bob'] = 'Joe' # assign based on a logical selection (modifies names in place)

In [31]:
np.unique(names) # sorted unique values (set-like function)

array(['Bob', 'Joe'], dtype='<U7')

In [32]:
# Vectorized operations
# (only the value of the last expression is displayed as the cell output)
nums = np.arange(5)
nums * 10 # multiply each element by 10 (the result is not stored)
nums = np.sqrt(nums) # square root of each element
np.ceil(nums) # also floor, rint (round to nearest int)
np.isnan(nums) # checks for NaN
nums + np.arange(5) # add element-wise
np.maximum(nums, np.array([1, -2, 3, -4, 5])) # element-wise maximum of the two arrays

array([1.        , 1.        , 3.        , 1.73205081, 5.        ])

In [33]:
# Euclidean distance between 2 random vectors:
# square root of the sum of squared coordinate differences
vec1 = np.random.randn(10)
vec2 = np.random.randn(10)
diff = vec1 - vec2
dist = np.sqrt((diff ** 2).sum())

In [34]:
# math and stats
rnd = np.random.randn(4, 2) # random normals in 4x2 array
rnd.mean()
rnd.std()
rnd.argmin() # index of minimum element (position in the flattened array)
rnd.sum()
rnd.sum(axis=0) # sum down each column: one value per column
rnd.sum(axis=1) # sum across each row: one value per row

array([-0.17667841, -1.06077926,  1.69479387,  0.51848914])

In [35]:
# methods for boolean arrays
(rnd > 0).sum() # counts number of positive values (each True counts as 1)
(rnd > 0).any() # checks if any value is True
(rnd > 0).all() # checks if all values are True

False

In [36]:
# random numbers
np.random.seed(12234) # Set the seed so the draws below are reproducible
np.random.rand(2, 3) # 2 x 3 matrix of uniform samples in [0, 1)
np.random.randn(10) # random normals (mean 0, sd 1)
np.random.randint(0, 2, 10) # 10 randomly picked 0 or 1 (the upper bound 2 is excluded)

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1])

In [37]:
# Broadcasting

'''
Rules

Starting with the trailing axis and working backward, Numpy compares arrays dimensions.
    - If two dimensions are equal then continues
    - If one of the operand has dimension 1 stretches it to match the largest one
    - When one of the shapes runs out of dimensions (because it has less dimensions than the other shape), Numpy will use 1 in the comparison process until the other shape’s
dimensions run out as well.
'''

# a has shape (4, 3): 4 rows, 3 columns
a = np.array([[ 0, 0, 0],
              [10, 10, 10],
              [20, 20, 20],
              [30, 30, 30]])

# b has shape (3,): its length matches the trailing axis of a
b = np.array([0, 1, 2])

# b is added to every row of a
print(a + b)              

[[ 0  1  2]
 [10 11 12]
 [20 21 22]
 [30 31 32]]


In [38]:
# Scale (center, normalise) data column-wise:
# mean(axis=0) and std(axis=0) give one value per column, which broadcasts across the rows of a
(a - a.mean(axis=0)) / a.std(axis=0)

array([[-1.34164079, -1.34164079, -1.34164079],
       [-0.4472136 , -0.4472136 , -0.4472136 ],
       [ 0.4472136 ,  0.4472136 ,  0.4472136 ],
       [ 1.34164079,  1.34164079,  1.34164079]])