# Data Science with Python SOLOLEARN
### Getting started with Numpy

Data of past presidents of the United States

In [1]:
# Heights of the 45 presidents, one entry per president
# (values look like centimetres — TODO confirm units).
heights = [
    189, 170, 189, 163, 183, 171, 185, 168, 173, 183,
    173, 173, 175, 178, 183, 193, 178, 173, 174, 183,
    183, 180, 168, 180, 170, 178, 182, 180, 183, 178,
    182, 188, 175, 179, 183, 193, 182, 183, 177, 185,
    188, 188, 182, 185, 191,
]

In [2]:
import numpy as np

# Convert the plain Python list into a NumPy array so comparisons are element-wise.
heights_arr = np.array(heights)

# The comparison yields a boolean array; True counts as 1 when summed,
# so the total is the number of heights strictly greater than 188.
is_over_188 = heights_arr > 188
print(is_over_188.sum())

5


For a 1-D array, `array.size` gives the same result as `len(list)`; in general it is the total number of elements across all dimensions.

In [3]:
# Total number of elements stored in the array.
heights_arr.size

45

In [4]:
# Tuple with the length of each axis; a 1-D array has a single entry.
heights_arr.shape

(45,)

In [5]:
heights_arr.ndim  # number of dimensions (axes); equals len(heights_arr.shape)

1

In [6]:
# Ages of the same 45 presidents, in the same order as `heights`
# (presumably age in years — TODO confirm what point in time they refer to).
ages = [
    57, 61, 57, 57, 58, 57, 61, 54, 68, 51,
    49, 64, 50, 48, 65, 52, 56, 46, 54, 49,
    51, 47, 55, 55, 54, 42, 51, 56, 55, 51,
    54, 51, 60, 62, 43, 55, 56, 61, 52, 69,
    64, 46, 54, 47, 70,
]

In [7]:
# Join the two lists end to end: all 45 heights followed by all 45 ages.
heights_and_ages = [*heights, *ages]
heights_and_ages_arr = np.array(heights_and_ages)

# Display the 90 values as 2 rows of 45. The result is not assigned,
# so heights_and_ages_arr itself is still 1-D after this cell.
heights_and_ages_arr.reshape(2, 45)

array([[189, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173, 175,
        178, 183, 193, 178, 173, 174, 183, 183, 180, 168, 180, 170, 178,
        182, 180, 183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177,
        185, 188, 188, 182, 185, 191],
       [ 57,  61,  57,  57,  58,  57,  61,  54,  68,  51,  49,  64,  50,
         48,  65,  52,  56,  46,  54,  49,  51,  47,  55,  55,  54,  42,
         51,  56,  55,  51,  54,  51,  60,  62,  43,  55,  56,  61,  52,
         69,  64,  46,  54,  47,  70]])

Use `-1` as a placeholder for one dimension; NumPy infers its length from the total number of elements and the other dimensions.

In [8]:
# -1 lets NumPy work out the row count: 90 elements / 45 columns -> 2 rows.
# This time the reshaped array is kept under the same name.
heights_and_ages_arr = heights_and_ages_arr.reshape(-1, 45)
heights_and_ages_arr

array([[189, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173, 175,
        178, 183, 193, 178, 173, 174, 183, 183, 180, 168, 180, 170, 178,
        182, 180, 183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177,
        185, 188, 188, 182, 185, 191],
       [ 57,  61,  57,  57,  58,  57,  61,  54,  68,  51,  49,  64,  50,
         48,  65,  52,  56,  46,  54,  49,  51,  47,  55,  55,  54,  42,
         51,  56,  55,  51,  54,  51,  60,  62,  43,  55,  56,  61,  52,
         69,  64,  46,  54,  47,  70]])

In [9]:
# Element type shared by every entry of the array.
heights_and_ages_arr.dtype

dtype('int64')

Numpy arrays are **homogeneous**, which means all elements are of the same type

In [10]:
# Same values as `heights`, except the first entry is written as a float (189.0)
# while every other entry stays a plain int.
heights_float = [
    189.0, 170, 189, 163, 183, 171, 185, 168, 173, 183,
    173, 173, 175, 178, 183, 193, 178, 173, 174, 183,
    183, 180, 168, 180, 170, 178, 182, 180, 183, 178,
    182, 188, 175, 179, 183, 193, 182, 183, 177, 185,
    188, 188, 182, 185, 191,
]

In [11]:
# Build an array from a list that mixes one float with many ints.
heights_float_arr = np.array(heights_float)
print(heights_float_arr)
# A single float in the input is enough for every element to be stored as a float.
heights_float_arr.dtype

[189. 170. 189. 163. 183. 171. 185. 168. 173. 183. 173. 173. 175. 178.
 183. 193. 178. 173. 174. 183. 183. 180. 168. 180. 170. 178. 182. 180.
 183. 178. 182. 188. 175. 179. 183. 193. 182. 183. 177. 185. 188. 188.
 182. 185. 191.]


dtype('float64')

Without special declaration, NumPy adapts the data type to fit all the data (here a single float upcasts every element to `float64`).

---
### Indexing and slicing

In [12]:
# Indexing is zero-based: this is the first element.
heights_arr[0]

189

In a 2-D array there are two axes, axis 0 and axis 1. Axis 0 runs vertically down the rows, whereas axis 1 runs horizontally across the columns. Elements are addressed as `arr[row, column]`.

In [13]:
# Row 1 (the ages row), column 2 -> the third age.
heights_and_ages_arr[1,2]

57

In [14]:
# Row 0 (the heights row), columns 0, 1 and 2; the stop index 3 is excluded.
heights_and_ages_arr[0, 0:3]

array([189, 170, 189])

A start index of 0 can be omitted, so the slice above is equivalent to `heights_and_ages_arr[0, :3]`.

In [15]:
# Every row of column 0: the first height together with the first age.
heights_and_ages_arr[:, 0]

array([189,  57])

Use `:` on its own as a placeholder to select everything along that axis (here, every row of column 0).
NumPy slicing follows the conventions of Python list slicing.

### Assigning values

In [16]:
# Show the array before the change.
print(heights_arr)
# Overwrite the element at index 3 in place; heights_arr keeps this new value
# for every later cell.
heights_arr[3] = 165
heights_arr

[189 170 189 163 183 171 185 168 173 183 173 173 175 178 183 193 178 173
 174 183 183 180 168 180 170 178 182 180 183 178 182 188 175 179 183 193
 182 183 177 185 188 188 182 185 191]


array([189, 170, 189, 165, 183, 171, 185, 168, 173, 183, 173, 173, 175,
       178, 183, 193, 178, 173, 174, 183, 183, 180, 168, 180, 170, 178,
       182, 180, 183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177,
       185, 188, 188, 182, 185, 191])

In [17]:
# Same single-element assignment on the 2-D array: row 0, column 3.
heights_and_ages_arr[0, 3] = 165
heights_and_ages_arr

array([[189, 170, 189, 165, 183, 171, 185, 168, 173, 183, 173, 173, 175,
        178, 183, 193, 178, 173, 174, 183, 183, 180, 168, 180, 170, 178,
        182, 180, 183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177,
        185, 188, 188, 182, 185, 191],
       [ 57,  61,  57,  57,  58,  57,  61,  54,  68,  51,  49,  64,  50,
         48,  65,  52,  56,  46,  54,  49,  51,  47,  55,  55,  54,  42,
         51,  56,  55,  51,  54,  51,  60,  62,  43,  55,  56,  61,  52,
         69,  64,  46,  54,  47,  70]])

In [18]:
# Assigning a scalar to a slice broadcasts it: every entry in row 0 becomes 180.
# NOTE: this overwrites the heights row of heights_and_ages_arr in place.
heights_and_ages_arr[0,:] = 180
heights_and_ages_arr

array([[180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180,
        180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180,
        180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180,
        180, 180, 180, 180, 180, 180],
       [ 57,  61,  57,  57,  58,  57,  61,  54,  68,  51,  49,  64,  50,
         48,  65,  52,  56,  46,  54,  49,  51,  47,  55,  55,  54,  42,
         51,  56,  55,  51,  54,  51,  60,  62,  43,  55,  56,  61,  52,
         69,  64,  46,  54,  47,  70]])

In [19]:
# Keep the ages in their own 1-D array so they can be combined with heights_arr.
ages_arr = np.array(ages)
ages_arr.shape

(45,)

In [20]:
# Check that heights_arr has the same length as ages_arr before stacking them.
heights_arr.shape

(45,)

In [21]:
# Stack the two 1-D arrays as rows: row 0 holds the heights, row 1 the ages (2 x 45).
height_age_arr = np.vstack((heights_arr, ages_arr))
height_age_arr

array([[189, 170, 189, 165, 183, 171, 185, 168, 173, 183, 173, 173, 175,
        178, 183, 193, 178, 173, 174, 183, 183, 180, 168, 180, 170, 178,
        182, 180, 183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177,
        185, 188, 188, 182, 185, 191],
       [ 57,  61,  57,  57,  58,  57,  61,  54,  68,  51,  49,  64,  50,
         48,  65,  52,  56,  46,  54,  49,  51,  47,  55,  55,  54,  42,
         51,  56,  55,  51,  54,  51,  60,  62,  43,  55,  56,  61,  52,
         69,  64,  46,  54,  47,  70]])

In [22]:
# Turn each 1-D array into a single column (45 x 1); -1 lets NumPy infer the row count.
heights_col = heights_arr.reshape(-1, 1)
ages_col = ages_arr.reshape(-1, 1)

# Place the two columns side by side. NOTE: this rebinds height_age_arr to a
# 45 x 2 array (column 0 heights, column 1 ages), replacing the earlier 2 x 45 one.
height_age_arr = np.hstack((heights_col, ages_col))
height_age_arr

array([[189,  57],
       [170,  61],
       [189,  57],
       [165,  57],
       [183,  58],
       [171,  57],
       [185,  61],
       [168,  54],
       [173,  68],
       [183,  51],
       [173,  49],
       [173,  64],
       [175,  50],
       [178,  48],
       [183,  65],
       [193,  52],
       [178,  56],
       [173,  46],
       [174,  54],
       [183,  49],
       [183,  51],
       [180,  47],
       [168,  55],
       [180,  55],
       [170,  54],
       [178,  42],
       [182,  51],
       [180,  56],
       [183,  55],
       [178,  51],
       [182,  54],
       [188,  51],
       [175,  60],
       [179,  62],
       [183,  43],
       [193,  55],
       [182,  56],
       [183,  61],
       [177,  52],
       [185,  69],
       [188,  64],
       [188,  46],
       [182,  54],
       [185,  47],
       [191,  70]])

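`np.concatenate` can reproduce both stacking helpers, but the 1-D arrays must be reshaped to 2-D first. Calling it on the 1-D arrays directly gives one flat 90-element array with `axis=0` and raises an error with `axis=1`.

```python
height_age_arr = np.concatenate((heights_arr.reshape(1, -1), ages_arr.reshape(1, -1)), axis=0)  # same as the vstack above
height_age_arr = np.concatenate((heights_arr.reshape(-1, 1), ages_arr.reshape(-1, 1)), axis=1)  # same as the hstack above
```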

Math operation on arrays

In [24]:
# Multiply every entry of column 0 (heights) by 0.0328084, the number of feet in
# one centimetre (assumes the heights are stored in cm — TODO confirm).
height_age_arr[:,0] * 0.0328084

array([6.2007876, 5.577428 , 6.2007876, 5.413386 , 6.0039372, 5.6102364,
       6.069554 , 5.5118112, 5.6758532, 6.0039372, 5.6758532, 5.6758532,
       5.74147  , 5.8398952, 6.0039372, 6.3320212, 5.8398952, 5.6758532,
       5.7086616, 6.0039372, 6.0039372, 5.905512 , 5.5118112, 5.905512 ,
       5.577428 , 5.8398952, 5.9711288, 5.905512 , 6.0039372, 5.8398952,
       5.9711288, 6.1679792, 5.74147  , 5.8727036, 6.0039372, 6.3320212,
       5.9711288, 6.0039372, 5.8070868, 6.069554 , 6.1679792, 6.1679792,
       5.9711288, 6.069554 , 6.2664044])

In [26]:
# With no axis given, sum() adds up every element (heights and ages together).
height_age_arr.sum()

10577

In [29]:
# axis=0 sums down the rows, giving one total per column: [sum of heights, sum of ages].
height_age_arr.sum(axis=0)

array([8102, 2475])

In [30]:
# Element-wise comparison on column 1 (ages): True where the age is below 55.
height_age_arr[:, 1] < 55

array([False, False, False, False, False, False, False,  True, False,
        True,  True, False,  True,  True, False,  True, False,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
       False, False,  True,  True,  True, False, False,  True, False,
       False, False,  True, False, False,  True,  True,  True, False])

In [35]:
# True counts as 1 when summed, so this is the number of ages equal to 55.
(height_age_arr[:, 1] == 55).sum()

4

In [40]:
# Boolean mask: True for rows whose height (column 0) is at least 182.
mask = height_age_arr[:, 0] >= 182
# True counts as 1, so the sum is the number of rows that satisfy the condition.
mask.sum()

23

In [41]:
# Boolean-mask indexing keeps only the rows whose mask entry is True;
# both columns (height and age) are retained for each selected row.
tall_presidents = height_age_arr[mask]
print(tall_presidents.shape)
tall_presidents

(23, 2)


array([[189,  57],
       [189,  57],
       [183,  58],
       [185,  61],
       [183,  51],
       [183,  65],
       [193,  52],
       [183,  49],
       [183,  51],
       [182,  51],
       [183,  55],
       [182,  54],
       [188,  51],
       [183,  43],
       [193,  55],
       [182,  56],
       [183,  61],
       [185,  69],
       [188,  64],
       [188,  46],
       [182,  54],
       [185,  47],
       [191,  70]])

In [45]:
# Build each condition as its own boolean array, then combine them element-wise
# with & (Python's `and` does not work on arrays).
height_ok = height_age_arr[:, 0] >= 182
age_ok = height_age_arr[:, 1] <= 50
mask = height_ok & age_ok

# Rows that satisfy both conditions.
height_age_arr[mask]

array([[183,  49],
       [183,  43],
       [188,  46],
       [185,  47]])

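* `|` — element-wise "or"
* `&` — element-wise "and" (wrap each comparison in parentheses, because `&` and `|` bind more tightly than comparison operators such as `>=` and `<=`)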