In [1]:
import numpy as np

# Numpy Array Basics

In [37]:
# This creates a new 1D numpy array
array = np.array(range(5))
array

array([0, 1, 2, 3, 4])

In [13]:
# This creates a new 2D numpy array
array_2d = np.array([range(5), range(5)])
array_2d

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [14]:
# Simple operation of adding 1 to each element
array_2d + 1 

array([[1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5]])

In [17]:
# (5,) means a 1D array (a single axis) with 5 elements
array.shape 

(5,)

In [18]:
# 5 means 5 elements of data
array.size 

5

In [19]:
# (2, 5) means 2 rows and 5 columns of data
array_2d.shape 

(2, 5)

In [21]:
# 10 means 10 elements of data
array_2d.size 

10

In [22]:
# T is the transpose attribute, it will essentially swap our axes (rows become columns)
array_2d.T 

array([[0, 0],
       [1, 1],
       [2, 2],
       [3, 3],
       [4, 4]])

In [23]:
# ndim is the number of dimensions (axes) of the array; 2 means it is two-dimensional
array_2d.ndim 

2

In [24]:
# means what datatype is stored in the array
array_2d.dtype 

dtype('int32')

In [25]:
# Non-integer array
np.array(['I','love','Python']) 

array(['I', 'love', 'Python'], dtype='<U6')

# Array Creation

In [30]:
# Creates an array of ones of a given size, as float by default
np.ones((2,2), 'int32') 

array([[1, 1],
       [1, 1]])

In [44]:
# Creates an array of zeros of a given size, as float by default
np.zeros((2,2), dtype=int) 

array([[0, 0],
       [0, 0]])

In [34]:
# Creates an array of integers with given start & stop values, and a step size (only stop is required, and is not inclusive)
# (start, stop, step) Start is 0 and step is 1 by default
np.arange(1,5,1)

array([1, 2, 3, 4])

In [43]:
# Creates an array of floats with given start & stop values with n elements, separated by a consistent step size (stop is inclusive)
np.linspace(0, 100, 5) 

array([  0.,  25.,  50.,  75., 100.])

In [45]:
# Changes an array into specified dimensions (rows, cols)
np.arange(1,9,2).reshape(2,2) 

array([[1, 3],
       [5, 7]])

In [46]:
# 100 zeros reshaped to 10x10 matrix
np.zeros(100, 'int').reshape(10,10)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [51]:
from numpy.random import default_rng

# Here we create a random number generator with specific seed
rng = default_rng(12345)

# Creating a seed is important to ensure you and others can recreate the work

# Returns n random numbers from a uniform distribution between 0 and 1
random_array = rng.random(10)
random_array

array([0.22733602, 0.31675834, 0.79736546, 0.67625467, 0.39110955,
       0.33281393, 0.59830875, 0.18673419, 0.67275604, 0.94180287])

In [52]:
rng = default_rng(12345)
mean, stddev = 5, 1

# Returns n random numbers from a normal distribution with a given mean and standard deviation
random_normal = rng.normal(mean, stddev, size=10)
random_normal

array([3.57617496, 6.26372846, 4.12933826, 4.74082677, 4.92465669,
       4.25911535, 3.6322073 , 5.6488928 , 5.36105811, 3.04713694])

In [53]:
# 100 random integers from 0 to 9 (the upper bound, 10, is not inclusive)
rng.integers(0, 10, 100)

array([7, 2, 9, 9, 7, 6, 1, 0, 2, 4, 0, 8, 4, 6, 2, 3, 1, 7, 7, 2, 7, 0,
       3, 1, 7, 3, 4, 4, 4, 2, 5, 8, 4, 1, 0, 1, 0, 0, 1, 5, 8, 8, 6, 6,
       3, 9, 6, 7, 7, 8, 7, 9, 5, 5, 2, 9, 5, 4, 3, 2, 6, 4, 5, 6, 8, 3,
       6, 9, 4, 2, 2, 3, 6, 2, 8, 3, 0, 0, 0, 6, 3, 2, 6, 0, 5, 6, 0, 1,
       3, 3, 9, 4, 6, 1, 8, 2, 7, 4, 4, 4], dtype=int64)

# Array Indexing & Slicing

### Indexing & Slicing one-dimensional array
Indexing to access a single element (0-indexed)
`array[index]`

Slicing to access a series of elements (stop is not inclusive)
`array[start:stop:step]`

In [79]:
# Getting elements by indexing
product_array = np.array(['fruits', 'vegetables', 'cereal', 'dairy','eggs',
                          'snacks', 'beverages','coffee','tea', 'spices'])
print(product_array[1])
print(product_array[-1])

vegetables
spices


In [66]:
# Getting elements by slicing
product_array[:5] # Default start point is 0 and default step is 1

array(['fruits', 'vegetables', 'cereal', 'dairy', 'eggs'], dtype='<U10')

In [68]:
# [start:stop:step] here we have [start::step] stop default is the last index
product_array[1::2]

array(['vegetables', 'dairy', 'snacks', 'coffee', 'spices'], dtype='<U10')

### Indexing & Slicing two-dimensional array
Indexing to access a single element (0-indexed) `array[row index, column index]`

Slicing to access a series of elements `array[start:stop:step, start:stop:step]`

In [98]:
# Getting elements by indexing
product_array2D = product_array.reshape(2,5)
product_array2D

# This goes to the second row and grabs the third element
print(product_array2D[1,2])

coffee


In [82]:
# Getting elements by slicing

# This goes all rows and grabs all elements starting from the third in each row
product_array2D[:, 2:]

array([['cereal', 'dairy', 'eggs'],
       ['coffee', 'tea', 'spices']], dtype='<U10')

In [83]:
# This slices from the second row onward (here, only the second row) and grabs all its elements; the result is still 2D
product_array2D[1:, :]

array([['snacks', 'beverages', 'coffee', 'tea', 'spices']], dtype='<U10')

### Exercises

In [84]:
integer_array = np.arange(12)

In [85]:
integer_array

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [86]:
integer_array[0]

0

In [88]:
integer_array[-1]

11

In [89]:
integer_array[:]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [90]:
integer_array[::2]

array([ 0,  2,  4,  6,  8, 10])

In [92]:
new_array = integer_array.reshape(3,4)
new_array

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [93]:
new_array[:, :]

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [94]:
new_array[1:, :]

array([[ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [95]:
new_array[:, 3]

array([ 3,  7, 11])

In [96]:
new_array[:, 1:]

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11]])

In [97]:
new_array[2,1]

9

# Array Operations

Arithmetic operators can be used to perform array operations


In [99]:
sales = [[0,5,155,0,518], [0,1827,616,317,325]]
sales_array = np.array(sales)
sales_array

array([[   0,    5,  155,    0,  518],
       [   0, 1827,  616,  317,  325]])

In [100]:
# This adds 2 to every element in the array
sales_array + 2

array([[   2,    7,  157,    2,  520],
       [   2, 1829,  618,  319,  327]])

In [101]:
# This assigns all elements in the first row to quantity and 
# in the second row to price
# Finally it multiplies the corresponding elements in each array
quantity = sales_array[0, :]
price = sales_array[1, :]

quantity * price

array([     0,   9135,  95480,      0, 168350])

In [102]:
# These array operations are applied via vectorization and broadcasting,
# which eliminates the need to loop through the array's elements

# Filtering Arrays
You can filter arrays by indexing them with a logical test
- Only the array elements in positions where the logical test returns True are returned

In [103]:
sales_array

array([[   0,    5,  155,    0,  518],
       [   0, 1827,  616,  317,  325]])

In [104]:
# Performing a logical test on a NumPy array returns a Boolean array with
# the results of the logical test on each array element
sales_array != 0

array([[False,  True,  True, False,  True],
       [False,  True,  True,  True,  True]])

In [105]:
# So we can use that array to do some filtering operations... :)
sales_array[sales_array != 0]

array([   5,  155,  518, 1827,  616,  317,  325])

In [107]:
sales_array[(sales_array == 616) | (sales_array < 100)]

array([  0,   5,   0,   0, 616])

In [108]:
sales_array[(sales_array > 100) & (sales_array < 500)]

array([155, 317, 325])

In [109]:
# Pro Tip: Store complex filtering criteria in a variable
# Usually known as Boolean mask
mask = (sales_array > 100) & (sales_array < 500)
sales_array[mask]

array([155, 317, 325])

In [136]:
# You can filter arrays based on values in other arrays

sales_arr = [0,5,155,0,518]
sales_filter = np.array(sales_arr)

In [137]:
product_arr = ['fruits','vegetables','cereal','dairy','eggs',]
product_filter = np.array(product_arr)

In [138]:
product_filter[sales_filter > 0]

array(['vegetables', 'cereal', 'eggs'], dtype='<U10')

In [139]:
# You can modify array values by assigning new ones
sales_filter[1] = 25
sales_filter

array([  0,  25, 155,   0, 518])

In [140]:
# Filters the zero values and assigns them a new value of 5
sales_filter[sales_filter == 0] = 5
sales_filter

array([  5,  25, 155,   5, 518])

# The Where Function
The where() NumPy function performs a logical test and returns a given value if the test is True, or another if the test is False

Syntax: np.where(logical test, value if True, value if False)

In [143]:
my_array = np.arange(20)
my_array

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [144]:
np.where(my_array % 2 == 0, 'even', 'odd')

array(['even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd'], dtype='<U4')

In [145]:
np.where(my_array % 2 == 0, 'even', np.where(my_array == 9, my_array, 'odd'))

array(['even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       '9', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd'], dtype='<U11')

# Array Aggregation Methods
Array aggregation methods let you calculate metrics like sum, mean, and max

In [146]:
sales_array

array([[   0,    5,  155,    0,  518],
       [   0, 1827,  616,  317,  325]])

In [147]:
# Returns the sum of all values in an array
sales_array.sum()

3763

In [148]:
# Returns the average of the values in an array
sales_array.mean()

376.3

In [149]:
# Returns the largest value in the array
sales_array.max()

1827

In [159]:
# Returns the smallest value in an array
sales_array.min()

0

In [160]:
# Returns the standard deviation
sales_array.std()

529.1366647662965

In [158]:
# You can also aggregate across rows or columns

# Aggregates across rows (axis=0), returning one sum per column
sales_array.sum(axis=0)

array([   0, 1832,  771,  317,  843])

In [157]:
# Aggregates across columns (axis=1), returning one sum per row
sales_array.sum(axis=1)

array([ 678, 3085])

# Array Functions
Array functions let you perform other aggregations like median and percentiles

In [162]:
sales_array

array([[   0,    5,  155,    0,  518],
       [   0, 1827,  616,  317,  325]])

In [163]:
# Returns the median value in an array
np.median(sales_array)

236.0

In [164]:
# Returns a value in the nth percentile in an array
np.percentile(sales_array, 90)

737.0999999999996

In [165]:
# Returns the unique values in an array
np.unique(sales_array)

array([   0,    5,  155,  317,  325,  518,  616, 1827])

In [167]:
# Returns the square root of each value of the array
np.sqrt(sales_array)

array([[ 0.        ,  2.23606798, 12.4498996 ,  0.        , 22.75961335],
       [ 0.        , 42.74342055, 24.81934729, 17.80449381, 18.02775638]])

## Sorting Arrays
The sort() method will sort arrays in place

In [168]:
sales_array

array([[   0,    5,  155,    0,  518],
       [   0, 1827,  616,  317,  325]])

In [169]:
# Use the axis argument to specify the dimension to sort by
# axis=-1 (the last axis) by default, which sorts a two-dimensional array row by row

sales_array.sort() 
sales_array

array([[   0,    0,    5,  155,  518],
       [   0,  317,  325,  616, 1827]])

In [172]:
sales_array.sort(axis=0)
sales_array

array([[   0,    0,    5,  155,  518],
       [   0,  317,  325,  616, 1827]])

# Vectorization
Vectorization is the process of pushing array operations into optimized C code, which is easier and more efficient than writing for loops

In [180]:
def for_loop_multiply_lists(list1, list2):
    """Multiply two sequences element by element with an explicit Python loop.

    This is the non-vectorized baseline that is timed against the NumPy
    version, so the loop is intentionally kept explicit.

    Parameters:
        list1: first iterable of values.
        list2: second iterable of values.

    Returns:
        A new list holding the pairwise products. Pairing is done with
        zip(), so the result is as long as the shorter input.
    """
    products = []
    for left, right in zip(list1, list2):
        # Each iteration multiplies one pair of items at Python level
        products.append(left * right)
    return products

def multiply_arrays(array1, array2):
    """Multiply two operands with the `*` operator.

    Used as the vectorized counterpart of the loop-based list multiplication:
    when the operands are NumPy arrays the product is computed element-wise
    without a Python-level loop.

    Parameters:
        array1: left operand.
        array2: right operand.

    Returns:
        The result of ``array1 * array2``.
    """
    product = array1 * array2
    return product

In [178]:
list1 = list(range(1000))
list2 = list(range(1000))

In [181]:
%%timeit -r 5 -n 10000
for_loop_multiply_lists(list1, list2)

67.5 µs ± 1.45 µs per loop (mean ± std. dev. of 5 runs, 10,000 loops each)


In [182]:
array1 = np.array(list1)
array2 = np.array(list2)

In [183]:
%%timeit -r 5 -n 10000
multiply_arrays(array1, array2)

1.52 µs ± 80.7 ns per loop (mean ± std. dev. of 5 runs, 10,000 loops each)


In [184]:
# Pro Tip: use vectorized operations whenever possible when manipulating data,
# and avoid writing loops

# Broadcasting
Broadcasting lets you perform vectorized operations with arrays of different sizes, where NumPy will expand the smaller array to 'fit' the larger one

- Single values (scalars) can be broadcast into arrays of any dimension
- Dimensions with a length greater than one must be the same size

In [185]:
test_array = np.array([[1,2,3],[1,2,3],[1,2,3]])
test_array

array([[1, 2, 3],
       [1, 2, 3],
       [1, 2, 3]])

In [186]:
test_array + 1

array([[2, 3, 4],
       [2, 3, 4],
       [2, 3, 4]])

In [187]:
# Behind the scenes, the scalar 1 is treated as if it were a 3x3 matrix filled with 1s
# So the operation can be done by adding both matrices

In [188]:
test_array + np.array([3,2,1])

array([[4, 4, 4],
       [4, 4, 4],
       [4, 4, 4]])

In [189]:
# Behind the scenes, we again fill out a 3x3 matrix with 3,2,1

In [190]:
test_array + np.array([3,2,1]).reshape(3,1)

array([[4, 5, 6],
       [3, 4, 5],
       [2, 3, 4]])

In [191]:
# Behind the scenes, the 3x1 column is stretched across the columns into a 3x3 matrix to match the dimensions

# But what happens when we don't have a matching dimension? 

In [193]:
# Shapes (3, 3) and (2,) cannot be broadcast together, so this addition raises
# a ValueError. Catch it and print the message so the failure is shown on
# purpose and the notebook still survives Restart Kernel -> Run All.
try:
    test_array + np.array([2,1])
except ValueError as error:
    print(f"ValueError: {error}")

# Broadcasting relies on compatible shapes

In [197]:
test_array[0, :] + test_array[:, 1].reshape(3,1)

array([[3, 4, 5],
       [3, 4, 5],
       [3, 4, 5]])

In [None]:
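# Behind the scenes, NumPy conceptually stretches the row [1, 2, 3] and the
# column [[2], [2], [2]] so that both operands become 3x3 matrices

# The important thing is that the shapes are compatible: compared from the last
# dimension backwards, each pair of dimensions must be equal or one of them must be 1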