# NumPy Introduction

In [1]:
import numpy as np
import pandas as pd

## Data Types & Attributes
NumPy's main datatype is an `ndarray`, which stands for **N-Dimensional Array**. Almost everything you'll see in NumPy is just an ndarray.

Here's how an ndarray works:
![](../images/anatomy-of-numpy-array.png)

**1-D Array**:
- Shape(#) with # meaning the number of values in the array
- Axis = 0 is the single row of cells

**2-D Array**:
- Shape(x, y) with x being the number of rows and y being the number of columns
- Axis = 0 refers to x, the rows
- Axis = 1 refers to y, the columns

**3-D Array**:
- Shape (x, y, z) with x being the depth/number of frames, y being the rows, z being the columns
- Axis = 0 refers to x, the depth of frames
- Axis = 1 refers to y, the rows
- Axis = 2 refers to z, the columns

**N-D Array**:
- Shape(n, o, p, q, ..., z)
  - Each number in the shape says how many instances of the next (lower) dimension exist
  - For example, in a 2D array, Shape(2, 3) means that there will be 2 single-dimension arrays (rows) and each of those 1D arrays will have 3 values (columns)
  - Another example, a 3D array with Shape(2, 3, 4) means that there are 2 2D arrays (frames), each with 3 1D arrays (rows) of 4 values (columns)
  - So in an N-D array, any value x in Shape() refers to how many instances exist of the dimension 1 below x
- Axis = 0 refers to the highest (outermost) dimension
- Axis = 1 refers to the second highest dimension
- Axis = n-2 refers to the rows
- Axis = n-1 refers to the columns (axes are numbered 0 through n-1 for an n-dimensional array)

In [2]:
# NumPy's main datatype is ndarray (N-Dimensional Array)
a1 = np.array([1, 2, 3])
a1

array([1, 2, 3])

In [3]:
type(a1)

numpy.ndarray

## Demonstrating Shape and Dimension

In [17]:
# 2-D array: 2 rows x 3 columns (the single 6.5 makes the whole array float)
a2 = np.array([
    [1, 2, 3],
    [4, 5, 6.5],
])

# 3-D array: 2 frames, each holding 3 rows of 4 columns
a3 = np.array([
    [[1, 2, 3, 4],
     [5, 6, 7, 8],
     [9, 10, 11, 12]],
    [[13, 14, 15, 16],
     [17, 18, 19, 20],
     [21, 22, 23, 24]],
])

In [5]:
a2

array([[1. , 2. , 3. ],
       [4. , 5. , 6.5]])

In [6]:
a3

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]]])

In [37]:
a1.shape

(3,)

In [21]:
a2.shape

(2, 3)

In [23]:
a3.shape

(2, 3, 4)

In [30]:
a1.ndim, a2.ndim, a3.ndim

(1, 2, 3)

In [33]:
# a1 and a3 hold only integers, so they keep an integer dtype.
# a2 contains a single float (6.5), so NumPy upcasts every element of a2 to float64.
a1.dtype, a2.dtype, a3.dtype

(dtype('int32'), dtype('float64'), dtype('int32'))

In [35]:
# How many elements in array?
a1.size, a2.size, a3.size

(3, 6, 24)

In [41]:
# Create a DataFrame from a NumPy array
df = pd.DataFrame(a2)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,5.0,6.5


## Creating NumPy Arrays

In [43]:
sample_array = np.array([1,2,3])
sample_array

array([1, 2, 3])

In [44]:
# Create array of 1s in specified shape of dimensions
ones = np.ones((2,3))
ones

array([[1., 1., 1.],
       [1., 1., 1.]])

In [46]:
# Also works for 0s
zeros = np.zeros((4,2,2))
zeros

array([[[0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.]]])

In [47]:
# arange(x, y, z) creates an array of values in the half-open range [x, y), stepping by z (z is optional)
range_array = np.arange(0, 10, 2)
range_array

array([0, 2, 4, 6, 8])

In [48]:
# np.random provides functions that fill arrays with random numbers; each function draws a different kind of number
# randint(x, y, size=(a, b, c)) draws integers from the half-open range [x, y) and returns an array of Shape(a, b, c)
# Here: integers in [0, 10) arranged in a 3x5 array
random_array = np.random.randint(0, 10, size=(3, 5))
random_array

array([[5, 5, 0, 9, 3],
       [2, 8, 6, 5, 7],
       [8, 8, 4, 6, 3]])

In [49]:
# random() fills an array of the given shape with floats in the half-open interval [0.0, 1.0)
np.random.random((5,3))

array([[0.92965781, 0.81860938, 0.17450964],
       [0.61972379, 0.89295967, 0.04786846],
       [0.21341156, 0.23012243, 0.4470598 ],
       [0.1004651 , 0.82117943, 0.93324163],
       [0.05658621, 0.07766017, 0.03003696]])

## Random Number Seeding

In [60]:
# Pseudo-random numbers can be seeded: after seeding with the same value,
# the same sequence of "random" numbers is produced on every run
np.random.seed(seed=0)
random_array_4 = np.random.randint(10, size=(5,3))
random_array_4

array([[5, 0, 3],
       [3, 7, 9],
       [3, 5, 2],
       [4, 7, 6],
       [8, 8, 1]])

## Viewing Arrays and Matrices

In [61]:
# unique() will show each value that is used in the array
np.unique(random_array_4)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [63]:
a1

array([1, 2, 3])

In [64]:
a2

array([[1. , 2. , 3. ],
       [4. , 5. , 6.5]])

In [65]:
a3

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]]])

### Indexing An Array By Dimension
Given a NumPy ndarray like a1, you can access the values inside it with ordinary square-bracket indexing. The indices are given in the same order as `Shape()`, starting with the highest (outermost) dimension.

In [66]:
# For 1D array, indexing grabs the value at specific location
a1[0]

1

In [67]:
# For 2D array, indexing with a single value grabs the row (2nd dimension) at the specific location
a2[0]

array([1., 2., 3.])

In [68]:
# For a 3D array, indexing with a single value grabs the 2D Matrix at the specific location
a3[0]

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [69]:
# Slice notation to get the first 2 values of each dimension
a3[:2, :2, :2]

array([[[ 1,  2],
        [ 5,  6]],

       [[13, 14],
        [17, 18]]])

In [70]:
a4 = np.random.randint(10, size=(2, 3, 4, 5))
a4

array([[[[6, 7, 7, 8, 1],
         [5, 9, 8, 9, 4],
         [3, 0, 3, 5, 0],
         [2, 3, 8, 1, 3]],

        [[3, 3, 7, 0, 1],
         [9, 9, 0, 4, 7],
         [3, 2, 7, 2, 0],
         [0, 4, 5, 5, 6]],

        [[8, 4, 1, 4, 9],
         [8, 1, 1, 7, 9],
         [9, 3, 6, 7, 2],
         [0, 3, 5, 9, 4]]],


       [[[4, 6, 4, 4, 3],
         [4, 4, 8, 4, 3],
         [7, 5, 5, 0, 1],
         [5, 9, 3, 0, 5]],

        [[0, 1, 2, 4, 2],
         [0, 3, 2, 0, 7],
         [5, 9, 0, 2, 7],
         [2, 9, 2, 3, 3]],

        [[2, 3, 4, 1, 2],
         [9, 1, 4, 6, 8],
         [2, 3, 0, 0, 6],
         [0, 6, 3, 3, 8]]]])

In [71]:
a4.shape, a4.ndim

((2, 3, 4, 5), 4)

In [73]:
# Get the first 2 numbers of the innermost array
a4[:,:,:,:2]

array([[[[6, 7],
         [5, 9],
         [3, 0],
         [2, 3]],

        [[3, 3],
         [9, 9],
         [3, 2],
         [0, 4]],

        [[8, 4],
         [8, 1],
         [9, 3],
         [0, 3]]],


       [[[4, 6],
         [4, 4],
         [7, 5],
         [5, 9]],

        [[0, 1],
         [0, 3],
         [5, 9],
         [2, 9]],

        [[2, 3],
         [9, 1],
         [2, 3],
         [0, 6]]]])

## Manipulating Arrays

### Arithmetic
- For simple operations on arrays of the same size, the operation is applied to elements in the same position and a new array is returned
- For arrays that are not the same size, the smaller array is `Broadcast` across the larger array so they have the same shape
  - See [NumPy Broadcast Docs](https://numpy.org/doc/stable/user/basics.broadcasting.html#:~:text=The%20term%20broadcasting%20describes%20how%20numpy%20treats%20arrays,that%20looping%20occurs%20in%20C%20instead%20of%20Python.)
  - Broadcasting has some rules though. Shapes are compared dimension by dimension starting from the trailing (rightmost) one, and two dimensions are compatible only when they are equal or one of them is 1
- You can use the operator to perform the arithmetic (`a1 + a2`), but NumPy also has the operations built in so you can do something like `np.add(a1, a2)`

In [74]:
a1

array([1, 2, 3])

In [75]:
ones

array([[1., 1., 1.],
       [1., 1., 1.]])

In [76]:
# Add elements in the same position across both arrays
a1 + ones

array([[2., 3., 4.],
       [2., 3., 4.]])

In [77]:
a1 - ones

array([[0., 1., 2.],
       [0., 1., 2.]])

In [78]:
a1 * ones

array([[1., 2., 3.],
       [1., 2., 3.]])

In [79]:
a2

array([[1. , 2. , 3. ],
       [4. , 5. , 6.5]])

In [80]:
# a1 is only 1 row, but that row was applied to both rows of a2
a1 * a2

array([[ 1. ,  4. ,  9. ],
       [ 4. , 10. , 19.5]])

In [81]:
a3

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]]])

In [82]:
# Broadcasting doesn't work here: a2 is (2, 3) and a3 is (2, 3, 4), and the
# trailing dimensions (3 vs 4) are neither equal nor 1.
# The error is caught and printed so "Restart & Run All" can continue past this demonstration.
try:
    a2 * a3
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (2,3) (2,3,4) 

In [83]:
a1 / ones

array([[1., 2., 3.],
       [1., 2., 3.]])

In [85]:
a2 / a1

array([[1.        , 1.        , 1.        ],
       [4.        , 2.5       , 2.16666667]])

In [86]:
# Floor division (//) in Python performs the division and floors the answer
a2 // a1

array([[1., 1., 1.],
       [4., 2., 2.]])

In [87]:
a2 ** 2

array([[ 1.  ,  4.  ,  9.  ],
       [16.  , 25.  , 42.25]])

In [88]:
np.square(a2)

array([[ 1.  ,  4.  ,  9.  ],
       [16.  , 25.  , 42.25]])

### Aggregation
Aggregation = combining a number of values into a single summary value (sum, mean, standard deviation, ...). With NumPy, you can aggregate over every element in the array

- You should use Python's built-in operations on lists and other Python data types
- For NumPy data types (`ndarray`), you should use NumPy's version

In [89]:
a1

array([1, 2, 3])

In [90]:
# Python built-in. Use this on Python data types
sum(a1)

6

In [91]:
# NumPy sum method. Use NumPy methods on ndarrays
np.sum(a1)

6

In [94]:
# Create a massive NumPy array
massive = np.random.random(100000)
massive.size

100000

In [97]:
massive[:10]

array([0.59816399, 0.17370251, 0.49752936, 0.51231935, 0.41529741,
       0.44150892, 0.96844105, 0.23242417, 0.90336451, 0.35172075])

In [101]:
# Demonstration about how much faster NumPy's functions are
%timeit sum(massive) # Python's sum()
%timeit np.sum(massive) # NumPy's np.sum()

7.65 ms ± 82.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
30.6 µs ± 334 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [102]:
np.mean(a2)

3.5833333333333335

In [103]:
# Standard Deviation = a measure of how spread out a group of numbers is from the mean
# Standard deviation is the square root of variance
np.std(a2)

1.8352262954621035

In [104]:
# Variance = measure of the average degree to which each number is different from the mean
# Higher variance = wider range of numbers
# Lower variance = lower range of numbers (tighter spread)
np.var(a2)

3.368055555555556

### Reshape and Transpose

Changing the shape of an ndarray

In [105]:
a2.shape, a3.shape

((2, 3), (2, 3, 4))

In [106]:
# This does not work because of incompatible shapes: (2, 3) cannot be broadcast against (2, 3, 4).
# The error is caught and printed so "Restart & Run All" can continue past this demonstration.
try:
    a2 * a3
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (2,3) (2,3,4) 

In [107]:
# Use NumPy reshape(x, y, z) to reshape an array into the given Shape dimensions
a2.reshape(2, 3, 1)

array([[[1. ],
        [2. ],
        [3. ]],

       [[4. ],
        [5. ],
        [6.5]]])

In [108]:
# Multiplication works after reshaping to compatible shape
a2_reshape = a2.reshape(2,3,1)
a2_reshape * a3

array([[[  1. ,   2. ,   3. ,   4. ],
        [ 10. ,  12. ,  14. ,  16. ],
        [ 27. ,  30. ,  33. ,  36. ]],

       [[ 52. ,  56. ,  60. ,  64. ],
        [ 85. ,  90. ,  95. , 100. ],
        [136.5, 143. , 149.5, 156. ]]])

In [109]:
# Transposing an ndarray reverses the order of its axes
# (for a 2-D array this swaps rows and columns)
# Transpose with .T
a2.shape, a2.T.shape

((2, 3), (3, 2))

In [111]:
a3.shape, a3.T.shape

((2, 3, 4), (4, 3, 2))

## Dot Product

In [113]:
# Seed the global NumPy RNG so both matrices come out the same on every run
np.random.seed(0)
# Two 5x3 integer matrices with entries drawn from [0, 10)
mat1 = np.random.randint(low=0, high=10, size=(5, 3))
mat2 = np.random.randint(low=0, high=10, size=(5, 3))

mat1, mat2

(array([[5, 0, 3],
        [3, 7, 9],
        [3, 5, 2],
        [4, 7, 6],
        [8, 8, 1]]),
 array([[6, 7, 7],
        [8, 1, 5],
        [9, 8, 9],
        [4, 3, 0],
        [3, 5, 0]]))

In [114]:
# Element-wise multiplication (Hadamard Product)
# Works since both are the same Shape
mat1 * mat2

array([[30,  0, 21],
       [24,  7, 45],
       [27, 40, 18],
       [16, 21,  0],
       [24, 40,  0]])

In [117]:
# Dot Product
# mat1 and mat2 are both (5, 3), so their inner dimensions (3 and 5) don't match.
# Transposing mat2 to (3, 5) makes the shapes compatible: (5, 3) . (3, 5) -> (5, 5)
mat3 = np.dot(mat1, mat2.T)
mat3

array([[ 51,  55,  72,  20,  15],
       [130,  76, 164,  33,  44],
       [ 67,  39,  85,  27,  34],
       [115,  69, 146,  37,  47],
       [111,  77, 145,  56,  64]])

![](../images/dot-product.png)

Website to [practice matrix multiplication](https://matrixmultiplication.xyz)

In [119]:
# Showing shapes after transpose and dot product
mat1.shape, mat2.T.shape, mat3.shape

((5, 3), (3, 5), (5, 5))

## Dot Product Example (Nut Butter Sales)

In [121]:
# Fix the seed so the example sales figures are reproducible
np.random.seed(0)
# Number of jars sold: 5 days (rows) x 3 products (columns), each count in [0, 20)
sales_amount = np.random.randint(low=0, high=20, size=(5, 3))
sales_amount

array([[12, 15,  0],
       [ 3,  3,  7],
       [ 9, 19, 18],
       [ 4,  6, 12],
       [ 1,  6,  7]])

In [122]:
# Wrap the raw sales counts in a labelled DataFrame: one row per weekday, one column per product
weekly_sales = pd.DataFrame(
    sales_amount,
    index=["Mon", "Tues", "Wed", "Thurs", "Fri"],
    columns=["Almond Butter", "Peanut Butter", "Cashew Butter"],
)
weekly_sales

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
Mon,12,15,0
Tues,3,3,7
Wed,9,19,18
Thurs,4,6,12
Fri,1,6,7


In [123]:
# Create Prices Array
prices = np.array([10, 8, 12])
prices

array([10,  8, 12])

In [125]:
# Build the butter_prices DataFrame: a single "Price" row with one column per product
butter_prices = pd.DataFrame(
    prices.reshape(1, 3),
    index=["Price"],
    columns=["Almond Butter", "Peanut Butter", "Cashew Butter"],
)
butter_prices

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
Price,10,8,12


In [126]:
sales_amount.shape, prices.shape

((5, 3), (3,))

In [129]:
# prices is (3,) and sales_amount is (5, 3), so prices.dot(sales_amount) is not aligned.
# Transposing sales_amount to (3, 5) lines up the 3 prices with the 3 products,
# giving one total per day
total_sales = prices.dot(sales_amount.T)
total_sales

array([240, 138, 458, 232, 142])

In [130]:
# Checking what weekly_sales looks like
weekly_sales

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
Mon,12,15,0
Tues,3,3,7
Wed,9,19,18
Thurs,4,6,12
Fri,1,6,7


In [131]:
# Checking what butter_prices looks like
butter_prices

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
Price,10,8,12


In [132]:
# Comparing shapes to see if we can dot product
weekly_sales.shape, butter_prices.shape

((5, 3), (1, 3))

In [134]:
# Inner numbers need to match, so we should transpose butter_prices before doing dot product
daily_sales = weekly_sales.dot(butter_prices.T)
daily_sales

Unnamed: 0,Price
Mon,240
Tues,138
Wed,458
Thurs,232
Fri,142


In [135]:
# Add total sales as a new column.
# daily_sales is a one-column DataFrame, so its "Price" column is selected explicitly:
# assigning a Series aligns on the weekday index and does not rely on pandas
# coercing a whole DataFrame into a single column.
weekly_sales["Total ($)"] = daily_sales["Price"]
weekly_sales

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter,Total ($)
Mon,12,15,0,240
Tues,3,3,7,138
Wed,9,19,18,458
Thurs,4,6,12,232
Fri,1,6,7,142


### Comparison Operators

In [136]:
a1

array([1, 2, 3])

In [137]:
a2

array([[1. , 2. , 3. ],
       [4. , 5. , 6.5]])

In [138]:
a1 > a2

array([[False, False, False],
       [False, False, False]])

In [140]:
bool_array = a1 >= a2
bool_array

array([[ True,  True,  True],
       [False, False, False]])

In [141]:
type(bool_array), bool_array.dtype

(numpy.ndarray, dtype('bool'))

In [142]:
a1 > 5

array([False, False, False])

In [143]:
a1 < 5

array([ True,  True,  True])

In [144]:
a1 == a1

array([ True,  True,  True])

In [145]:
a1 == a2

array([[ True,  True,  True],
       [False, False, False]])