# Numpy

NumPy is used here to bring R-style vector and matrix functionality to Python, but the two are not the same!

In [1]:
import numpy as np

# Creating Numpy arrays

### Creating one dimensional array

In [2]:
a = np.array([1, 2, 3])
print(a)

[1 2 3]


### Creating arrays with other data types

In [3]:
a_float = np.array([1, 2, 3], dtype=np.float64)

### seq in R -> np.arange in Python

In [4]:
# Integer range: the stop value (10) is excluded.
one_dim_range = np.arange(0, 10)
print(one_dim_range)

# Fractional step: the stop value (1) is still excluded. NumPy's documentation
# recommends np.linspace for non-integer steps, because floating-point rounding
# can change how many elements np.arange produces.
one_dim_float = np.arange(0, 1, 0.1)
print(one_dim_float)

[0 1 2 3 4 5 6 7 8 9]
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]


### seq(from, to, length.out = n) in R -> np.linspace(start, stop, n) in Python.

In [5]:
one_dim_custom_range = np.linspace(0, 20, 8)
print(one_dim_custom_range)

[ 0.          2.85714286  5.71428571  8.57142857 11.42857143 14.28571429
 17.14285714 20.        ]


### rep(x, each = n) in R -> np.repeat(x, n) in Python. (rep(x, times = n) corresponds to np.tile(x, n).)

In [6]:
np.repeat(3, 4)

array([3, 3, 3, 3])

In [7]:
np.repeat([1, 2], 4)

array([1, 1, 1, 1, 2, 2, 2, 2])

### Row - column vectors

### The default numpy arrays are like "geometric" vectors, not row or column vectors like in linear algebra. Creating row and column vectors:

In [8]:
row = np.array([    # 1 row, with 3 elements each
   [1, 2, 3]
])

In [9]:
column = np.array([  # 3 rows, with 1 element each
    [1],
    [2],
    [3]
])

### Or, in a shorter way:

In [10]:
row = np.r_['r', [1, 2, 3]]

In [11]:
column = np.r_['c', [1, 2, 3]]

### These return the "matrix" class, which NumPy no longer recommends and may remove in the future. Do this instead:

In [12]:
row = np.array([[1, 2, 3]])
column = row.reshape(3, 1)

### Creating matrices and more multidimensional arrays

In [13]:
b = np.array([[9.0, 8.0, 7.0], [6.0, 5.0, 4.0]])
print(b)

[[9. 8. 7.]
 [6. 5. 4.]]


In [14]:
c = np.array([
    [[1, 2, 3],
     [4, 5, 6]],

    [[7, 8, 9],
     [10, 11, 12]]
    ])
print(c)

[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]


### matrix(data, nrow, ncol, byrow) in R -> np.reshape in Python. Numpy is row major by default. R is column major.

In [15]:
one_to_two_dim = np.reshape(one_dim_custom_range, (4, 2))
print(one_to_two_dim)

[[ 0.          2.85714286]
 [ 5.71428571  8.57142857]
 [11.42857143 14.28571429]
 [17.14285714 20.        ]]


### Getting a column-major matrix: the 2 x 4 result below is what matrix(data, nrow = 2, ncol = 4) returns in R.

In [16]:
# order="F" fills the result column by column (Fortran / R order), so the
# 2 x 4 shape is requested directly instead of reshaping to (4, 2) and swapping axes.
np.reshape(one_dim_custom_range, (2, 4), order="F")

array([[ 0.        ,  5.71428571, 11.42857143, 17.14285714],
       [ 2.85714286,  8.57142857, 14.28571429, 20.        ]])

### t() in R -> np.transpose in Python.

In [17]:
two_dim_transpoze = np.transpose(b)
print(two_dim_transpoze)

[[9. 6.]
 [8. 5.]
 [7. 4.]]


### identity matrix

In [18]:
identity_matrix = np.eye(5)
print(identity_matrix)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


### Create a large matrix with random numbers.

In [19]:
# Use a seeded Generator so that Restart & Run All reproduces the same matrix
# (np.random.rand draws from the unseeded global state, so every run differed).
rng = np.random.default_rng(42)
large_matrix = rng.random((20, 5))  # uniform samples in [0, 1), shape (20, 5)
print(large_matrix)

[[0.01450485 0.46935426 0.95468631 0.06753839 0.90455841]
 [0.09387146 0.66036727 0.08284177 0.03835385 0.39657769]
 [0.69938588 0.71258694 0.30304754 0.03373264 0.13651504]
 [0.23498198 0.89310614 0.83581501 0.07222996 0.079412  ]
 [0.73140092 0.86070634 0.3921439  0.20889967 0.54337815]
 [0.94459809 0.03177107 0.55335102 0.34726017 0.72609543]
 [0.63089613 0.92904222 0.66931024 0.62187252 0.17801659]
 [0.35746421 0.04393292 0.40306159 0.35992502 0.0349381 ]
 [0.71832143 0.41914372 0.54647839 0.20491556 0.4775229 ]
 [0.54097873 0.72038657 0.98425477 0.41073367 0.28756995]
 [0.75395661 0.63899878 0.44673242 0.82681947 0.63395598]
 [0.63291818 0.48822844 0.22514152 0.31748971 0.2672816 ]
 [0.85007628 0.52551704 0.87582169 0.57391274 0.77748673]
 [0.59419333 0.67771757 0.78281156 0.1588898  0.51140268]
 [0.78927128 0.44309971 0.62088324 0.0600722  0.39624365]
 [0.2487962  0.59562283 0.48702903 0.38873877 0.30251831]
 [0.65700721 0.17714735 0.73957356 0.09648378 0.35907713]
 [0.19754972 0

# Array Analysis and Cleaning

### Number of dimensions in an array. Plain vectors have no dim attribute in R (dim(x) is NULL), but 1 dimension in NumPy.

In [20]:
a.ndim

1

In [21]:
b.ndim

2

In [22]:
row.ndim

2

In [23]:
column.ndim

2

In [24]:
c.ndim

3

### shape is length of dimensions. The output is like:
### - for normal vectors: (length, )
### - for 2 dimensions: # of rows, # of columns (like in linear algebra)
### - for more dimensions: (x, y, z) -> there are x matrices, each with y rows of z elements

In [25]:
a.shape

(3,)

In [26]:
row.shape

(1, 3)

In [27]:
column.shape

(3, 1)

In [28]:
b.shape

(2, 3)

In [29]:
c.shape

(2, 2, 3)

### len works as expected for 1-dim arrays, but for > 1 dims it only returns the length of the first axis (shape[0]), i.e. the number of "first children of the parent".

In [30]:
len(a)

3

In [31]:
len(b)

2

In [32]:
len(row)

1

In [33]:
len(column)

3

In [34]:
len(c)

2

### size returns the # of numbers in array

In [35]:
a.size

3

In [36]:
b.size

6

### Accessing elements

In [37]:
a[0]

1

### In R, the code below would return all elements but the first. In NumPy, a negative index counts from the end; slicing and indexing work like built-in lists. More slicing examples are in the 1st notebook.

In [38]:
a[-1]

3

### Reversing

In [39]:
a[::-1]

array([3, 2, 1])

### Turn an array of numbers into an array of booleans by checking, for each element, whether it satisfies a condition.

In [40]:
large_matrix > 0.5

array([[False, False,  True, False,  True],
       [False,  True, False, False, False],
       [ True,  True, False, False, False],
       [False,  True,  True, False, False],
       [ True,  True, False, False,  True],
       [ True, False,  True, False,  True],
       [ True,  True,  True,  True, False],
       [False, False, False, False, False],
       [ True, False,  True, False, False],
       [ True,  True,  True, False, False],
       [ True,  True, False,  True,  True],
       [ True, False, False, False, False],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True, False,  True],
       [ True, False,  True, False, False],
       [False,  True, False, False, False],
       [ True, False,  True, False, False],
       [False,  True,  True,  True, False],
       [ True,  True, False,  True, False],
       [ True,  True,  True, False, False]])

### Flatten the matrix and return all elements matching a condition

In [41]:
print(large_matrix[large_matrix > 0.5])

[0.95468631 0.90455841 0.66036727 0.69938588 0.71258694 0.89310614
 0.83581501 0.73140092 0.86070634 0.54337815 0.94459809 0.55335102
 0.72609543 0.63089613 0.92904222 0.66931024 0.62187252 0.71832143
 0.54647839 0.54097873 0.72038657 0.98425477 0.75395661 0.63899878
 0.82681947 0.63395598 0.63291818 0.85007628 0.52551704 0.87582169
 0.57391274 0.77748673 0.59419333 0.67771757 0.78281156 0.51140268
 0.78927128 0.62088324 0.59562283 0.65700721 0.73957356 0.90846994
 0.93002858 0.80272808 0.83007884 0.71718235 0.53414053 0.99871578
 0.64982384 0.5509869 ]


### Picking rows and columns. Tuples can be used for specific indices, and slices for ranges.

In [42]:
print(large_matrix[(1, 3), 0:2])

[[0.09387146 0.66036727]
 [0.23498198 0.89310614]]


In [43]:
print(large_matrix[1, :])

[0.09387146 0.66036727 0.08284177 0.03835385 0.39657769]


In [44]:
# print(large_matrix[, 2]) -> error.
print(large_matrix[:, 2])

[0.95468631 0.08284177 0.30304754 0.83581501 0.3921439  0.55335102
 0.66931024 0.40306159 0.54647839 0.98425477 0.44673242 0.22514152
 0.87582169 0.78281156 0.62088324 0.48702903 0.73957356 0.93002858
 0.03472283 0.5509869 ]


### Dropping rows or columns with np.delete. The second argument is the indices to be dropped and the third argument is the axis: 0 is rows and 1 is columns. In the code below, the second argument is a tuple of manually specified indices and the axis is 0, so rows 1 and 2 are dropped.

In [45]:
# axis=0 makes the indices refer to rows, so rows 1 and 2 are removed
# (axis=1 would remove columns instead).
large_matrix_dropped = np.delete(large_matrix, (1, 2), axis=0)
print(large_matrix_dropped)

[[0.01450485 0.46935426 0.95468631 0.06753839 0.90455841]
 [0.23498198 0.89310614 0.83581501 0.07222996 0.079412  ]
 [0.73140092 0.86070634 0.3921439  0.20889967 0.54337815]
 [0.94459809 0.03177107 0.55335102 0.34726017 0.72609543]
 [0.63089613 0.92904222 0.66931024 0.62187252 0.17801659]
 [0.35746421 0.04393292 0.40306159 0.35992502 0.0349381 ]
 [0.71832143 0.41914372 0.54647839 0.20491556 0.4775229 ]
 [0.54097873 0.72038657 0.98425477 0.41073367 0.28756995]
 [0.75395661 0.63899878 0.44673242 0.82681947 0.63395598]
 [0.63291818 0.48822844 0.22514152 0.31748971 0.2672816 ]
 [0.85007628 0.52551704 0.87582169 0.57391274 0.77748673]
 [0.59419333 0.67771757 0.78281156 0.1588898  0.51140268]
 [0.78927128 0.44309971 0.62088324 0.0600722  0.39624365]
 [0.2487962  0.59562283 0.48702903 0.38873877 0.30251831]
 [0.65700721 0.17714735 0.73957356 0.09648378 0.35907713]
 [0.19754972 0.90846994 0.93002858 0.80272808 0.10772805]
 [0.83007884 0.71718235 0.03472283 0.53414053 0.47165162]
 [0.99871578 0

### Dropping columns with a slice object, built with the np.s_ index helper (np.s_[1:3] is slice(1, 3)).

In [46]:
large_matrix_dropped = np.delete(large_matrix, np.s_[1:3], 1)
print(large_matrix_dropped)

[[0.01450485 0.06753839 0.90455841]
 [0.09387146 0.03835385 0.39657769]
 [0.69938588 0.03373264 0.13651504]
 [0.23498198 0.07222996 0.079412  ]
 [0.73140092 0.20889967 0.54337815]
 [0.94459809 0.34726017 0.72609543]
 [0.63089613 0.62187252 0.17801659]
 [0.35746421 0.35992502 0.0349381 ]
 [0.71832143 0.20491556 0.4775229 ]
 [0.54097873 0.41073367 0.28756995]
 [0.75395661 0.82681947 0.63395598]
 [0.63291818 0.31748971 0.2672816 ]
 [0.85007628 0.57391274 0.77748673]
 [0.59419333 0.1588898  0.51140268]
 [0.78927128 0.0600722  0.39624365]
 [0.2487962  0.38873877 0.30251831]
 [0.65700721 0.09648378 0.35907713]
 [0.19754972 0.80272808 0.10772805]
 [0.83007884 0.53414053 0.47165162]
 [0.99871578 0.13546956 0.32088381]]


### min - max and the index of min - max elements. argmax and argmin return the index into the flattened (row-major) array unless an axis is given.

In [47]:
np.max(two_dim_transpoze)

9.0

In [48]:
np.min(two_dim_transpoze)

4.0

In [49]:
np.argmax(two_dim_transpoze)

0

In [50]:
np.argmin(two_dim_transpoze)

5

### Sum of all elements

In [51]:
print(np.sum(large_matrix))

48.67291416906743


### Sum of each column (axis=0). For each column, the cells in all rows are summed. A vector with size = # of columns is returned.

In [52]:
print(np.sum(large_matrix, axis=0))

[11.51896711 11.5622053  11.41872188  6.26020608  7.91281381]


### Sum of each row (axis=1). For each row, the cells in all columns are summed. A vector with size = # of rows is returned.

In [53]:
print(np.sum(large_matrix, axis=1))

[2.41064221 1.27201204 1.88526805 2.11554509 2.73652897 2.60307577
 3.0291377  1.19932184 2.366382   2.9439237  3.30046324 1.93105945
 3.60281449 2.72501493 2.30957008 2.02270514 2.02928903 2.94650436
 2.58777617 2.6558799 ]


In [54]:
print(np.mean(large_matrix))

0.4867291416906743


In [55]:
print(np.std(large_matrix))  # population standard deviation (default ddof=0); R's sd() divides by n - 1, i.e. ddof=1

0.2809320040244828


In [56]:
print(np.var(large_matrix))  # population variance (default ddof=0); R's var() divides by n - 1, i.e. ddof=1

0.07892279088521204


# Arithmetic Operations

The sign operators +, -, * and / are elementwise; @ is the exception and performs matrix multiplication. All of them have function counterparts (add, subtract, multiply, divide, matmul).

## With normal vectors

In [57]:
normal1 = np.arange(10, 20)

In [58]:
print(normal1)

[10 11 12 13 14 15 16 17 18 19]


In [59]:
normal2 = np.arange(20, 30)

In [60]:
print(normal2)

[20 21 22 23 24 25 26 27 28 29]


### Multiplication: * and multiply are elementwise, dot, matmul and @ are dot product. Order is not important for any of them.

In [61]:
normal1 * normal2

array([200, 231, 264, 299, 336, 375, 416, 459, 504, 551])

In [62]:
print(np.multiply(normal1, normal2))

[200 231 264 299 336 375 416 459 504 551]


In [63]:
normal1.dot(normal2)

3635

In [64]:
np.matmul(normal1, normal2)

3635

In [65]:
normal1 @ normal2

3635

### Addition, subtraction and division are all elementwise.

In [66]:
normal1 + normal2

array([30, 32, 34, 36, 38, 40, 42, 44, 46, 48])

In [67]:
print(np.add(normal1, normal2))

[30 32 34 36 38 40 42 44 46 48]


In [68]:
normal1 - normal2

array([-10, -10, -10, -10, -10, -10, -10, -10, -10, -10])

In [69]:
print(np.subtract(normal1, normal2))

[-10 -10 -10 -10 -10 -10 -10 -10 -10 -10]


In [70]:
normal1 / normal2

array([0.5       , 0.52380952, 0.54545455, 0.56521739, 0.58333333,
       0.6       , 0.61538462, 0.62962963, 0.64285714, 0.65517241])

In [71]:
print(np.divide(normal1, normal2))

[0.5        0.52380952 0.54545455 0.56521739 0.58333333 0.6
 0.61538462 0.62962963 0.64285714 0.65517241]


## With row - column vectors

### Multiplication: * and multiply try to do elementwise multiplication, but the shapes (1, 3) and (3, 1) do not match. So NumPy broadcasts them (https://numpy.org/doc/stable/user/basics.broadcasting.html) to a common (3, 3) shape and then multiplies elementwise. dot, @ and matmul are matrix multiplication.

In [72]:
print(np.multiply(column, row))

[[1 2 3]
 [2 4 6]
 [3 6 9]]


In [73]:
print(np.multiply(row, column))

[[1 2 3]
 [2 4 6]
 [3 6 9]]


In [74]:
print(row * column)

[[1 2 3]
 [2 4 6]
 [3 6 9]]


In [75]:
print(column * row)

[[1 2 3]
 [2 4 6]
 [3 6 9]]


In [76]:
print(row.dot(column))

[[14]]


In [77]:
print(column.dot(row))

[[1 2 3]
 [2 4 6]
 [3 6 9]]


In [78]:
row @ column

array([[14]])

In [79]:
column @ row

array([[1, 2, 3],
       [2, 4, 6],
       [3, 6, 9]])

In [80]:
np.matmul(row, column)

array([[14]])

In [81]:
np.matmul(column, row)

array([[1, 2, 3],
       [2, 4, 6],
       [3, 6, 9]])

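### Addition, subtraction and division are elementwise after broadcasting (not matrix operations): element (i, j) of the result combines the j-th element of row with the i-th element of column. E.g. 1st element of row + 2nd element of column -> (2, 1) of the matrix.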

In [82]:
row + column

array([[2, 3, 4],
       [3, 4, 5],
       [4, 5, 6]])

In [83]:
print(np.add(row, column))

[[2 3 4]
 [3 4 5]
 [4 5 6]]


In [84]:
row - column

array([[ 0,  1,  2],
       [-1,  0,  1],
       [-2, -1,  0]])

In [85]:
print(np.subtract(row, column))

[[ 0  1  2]
 [-1  0  1]
 [-2 -1  0]]


In [86]:
row / column

array([[1.        , 2.        , 3.        ],
       [0.5       , 1.        , 1.5       ],
       [0.33333333, 0.66666667, 1.        ]])

In [87]:
print(np.divide(row, column))

[[1.         2.         3.        ]
 [0.5        1.         1.5       ]
 [0.33333333 0.66666667 1.        ]]


## With matrices and multidimensional arrays

### Multiplication: * and multiply are elementwise, so the order is not important. dot, matmul and @ are matrix multiplication, so the order is important.

In [88]:
x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])

In [89]:
print(x)

[[1 2]
 [3 4]]


In [90]:
print(y)

[[5 6]
 [7 8]]


In [91]:
x * y

array([[ 5, 12],
       [21, 32]])

In [92]:
y * x

array([[ 5, 12],
       [21, 32]])

In [93]:
print(np.multiply(x, y))

[[ 5 12]
 [21 32]]


In [94]:
print(np.multiply(y, x))

[[ 5 12]
 [21 32]]


In [95]:
x.dot(y)

array([[19, 22],
       [43, 50]])

In [96]:
y.dot(x)

array([[23, 34],
       [31, 46]])

In [97]:
np.matmul(x, y)

array([[19, 22],
       [43, 50]])

In [98]:
np.matmul(y, x)

array([[23, 34],
       [31, 46]])

In [99]:
x @ y

array([[19, 22],
       [43, 50]])

In [100]:
y @ x

array([[23, 34],
       [31, 46]])

### Addition, subtraction and division are all elementwise.

In [101]:
x + y

array([[ 6,  8],
       [10, 12]])

In [102]:
print(np.add(x, y))

[[ 6  8]
 [10 12]]


In [103]:
x - y

array([[-4, -4],
       [-4, -4]])

In [104]:
print(np.subtract(x, y))

[[-4 -4]
 [-4 -4]]


In [105]:
x / y

array([[0.2       , 0.33333333],
       [0.42857143, 0.5       ]])

In [106]:
print(np.divide(x, y))

[[0.2        0.33333333]
 [0.42857143 0.5       ]]


### Scalar multiplication

In [107]:
normal1 * 2

array([20, 22, 24, 26, 28, 30, 32, 34, 36, 38])

In [108]:
row * 2

array([[2, 4, 6]])

In [109]:
column * 2

array([[2],
       [4],
       [6]])

In [110]:
x * 2

array([[2, 4],
       [6, 8]])

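### No recycling: unlike R, NumPy does not repeat a shorter vector to match a longer one; shapes that cannot be broadcast raise an error.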

In [111]:
# NumPy does not recycle the shorter operand the way R does: shapes (2,) and (4,)
# cannot be broadcast, so the addition raises ValueError. The error is caught and
# printed so the notebook still survives Restart & Run All.
try:
    np.array([1, 2]) + np.array([1, 2, 3, 4])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (2,) (4,) 