In [1]:
# Import two external packages: numpy, which handles multi-dimensional arrays, and pandas, which manages indexed data frames
import pandas as pd
import numpy as np

# Numpy

## Operations on arrays

In [2]:
arr1 = np.array([1, 2, 3])
arr2 = np.array([2, 4, 6])
arr1

array([1, 2, 3])

### Use the reshape method to transform it into a 'column' vector
The first argument is the number of lines, the second is the number of columns. '-1' tells Numpy to infer that dimension from the size of the array: here, the number of lines is deduced from the number of columns.

In [3]:
arr1.reshape((-1, 1))

array([[1],
       [2],
       [3]])

### Use the reshape method to transform it into a 'line' vector

In [4]:
arr1.reshape((1, -1))

array([[1, 2, 3]])

### Product of two 1-D arrays : element-wise multiplication (both vectors must have the same size)

In [5]:
arr1 * arr2

array([ 2,  8, 18])

Try division, addition, subtraction, power of a scalar... anything

### Matrix multiplication ('@' symbol) : scalar product.

In [6]:
arr1 @ arr2  # 1*2 + 2*4 + 3*6 = 28

28

### Column vector times line vector : makes a matrix
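With '*', Numpy multiplies element-wise and broadcasts the (3, 1) and (1, 3) shapes to (3, 3), so this is the outer product, not a matrix product. Doing line vector times column vector with '*' gives the transposed matrix, which is identical here only because arr2 is a multiple of arr1 (the matrix is symmetric); to get the scalar product from a line vector and a column vector, use '@' instead. Can also use the 'numpy.outer' function.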

In [7]:
# '*' is element-wise: the (3, 1) and (1, 3) shapes are broadcast to (3, 3),
# so element [i, j] of the result is arr1[i] * arr2[j]
arr1.reshape((-1, 1)) * arr2.reshape((1, -1))

array([[ 2,  4,  6],
       [ 4,  8, 12],
       [ 6, 12, 18]])

In [8]:
# Still element-wise with broadcasting: element [i, j] of the result is arr2[i] * arr1[j]
arr1.reshape((1, -1)) * arr2.reshape((-1, 1))

array([[ 2,  4,  6],
       [ 4,  8, 12],
       [ 6, 12, 18]])

In [9]:
np.outer(arr1, arr2)

array([[ 2,  4,  6],
       [ 4,  8, 12],
       [ 6, 12, 18]])

Numpy is not limited to 2-D arrays ! Try 3D, or more, try different array sizes...

In [10]:
arr3 = np.array([1, 2])
arr3

array([1, 2])

In [11]:
# np.outer flattens its inputs, so the outer call gives a (2, 9) array; reshaping it to (2, 3, 3)
# gives one 3x3 block per element of arr3
np.outer(arr3, np.outer(arr1, arr2)).reshape((2, 3, 3))

array([[[ 2,  4,  6],
        [ 4,  8, 12],
        [ 6, 12, 18]],

       [[ 4,  8, 12],
        [ 8, 16, 24],
        [12, 24, 36]]])

In [12]:
# reshape only regroups the 18 values, in unchanged order, as 3 blocks of 2 lines and 3 columns
np.outer(arr3, np.outer(arr1, arr2)).reshape((3, 2, 3))

array([[[ 2,  4,  6],
        [ 4,  8, 12]],

       [[ 6, 12, 18],
        [ 4,  8, 12]],

       [[ 8, 16, 24],
        [12, 24, 36]]])

In [13]:
# reshape only regroups the 18 values, in unchanged order, as 3 blocks of 3 lines and 2 columns
np.outer(arr3, np.outer(arr1, arr2)).reshape((3, 3, 2))

array([[[ 2,  4],
        [ 6,  4],
        [ 8, 12]],

       [[ 6, 12],
        [18,  4],
        [ 8, 12]],

       [[ 8, 16],
        [24, 12],
        [24, 36]]])

In [14]:
np.outer(arr1, arr2)

array([[ 2,  4,  6],
       [ 4,  8, 12],
       [ 6, 12, 18]])

In [15]:
np.outer(arr1, arr2).T  # Transpose (the result looks unchanged because this matrix is symmetric)

array([[ 2,  4,  6],
       [ 4,  8, 12],
       [ 6, 12, 18]])

## Useful init methods with Numpy

### Create a table full of zeros

In [16]:
np.zeros(shape=(10, 5))  # 10 lines, 5 columns; the values are floats

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

### Create a table full of ones

In [17]:
np.ones(shape=(10, 5))  # 10 lines, 5 columns; the values are floats

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

### Create a table full of whatever float you want

In [18]:
# np.full builds the array already filled with the requested value,
# instead of allocating an array of ones and then multiplying it
np.full(shape=(10, 5), fill_value=2.31)

array([[2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31],
       [2.31, 2.31, 2.31, 2.31, 2.31]])

### Create a linear space

In [19]:
np.linspace(11, 15, 1000)  # 1000 points evenly spaced, from 11 to 15

array([11.        , 11.004004  , 11.00800801, 11.01201201, 11.01601602,
       11.02002002, 11.02402402, 11.02802803, 11.03203203, 11.03603604,
       11.04004004, 11.04404404, 11.04804805, 11.05205205, 11.05605606,
       11.06006006, 11.06406406, 11.06806807, 11.07207207, 11.07607608,
       11.08008008, 11.08408408, 11.08808809, 11.09209209, 11.0960961 ,
       11.1001001 , 11.1041041 , 11.10810811, 11.11211211, 11.11611612,
       11.12012012, 11.12412412, 11.12812813, 11.13213213, 11.13613614,
       11.14014014, 11.14414414, 11.14814815, 11.15215215, 11.15615616,
       11.16016016, 11.16416416, 11.16816817, 11.17217217, 11.17617618,
       11.18018018, 11.18418418, 11.18818819, 11.19219219, 11.1961962 ,
       11.2002002 , 11.2042042 , 11.20820821, 11.21221221, 11.21621622,
       11.22022022, 11.22422422, 11.22822823, 11.23223223, 11.23623624,
       11.24024024, 11.24424424, 11.24824825, 11.25225225, 11.25625626,
       11.26026026, 11.26426426, 11.26826827, 11.27227227, 11.27

In [20]:
# 2 matrices of 2 lines and 5 columns. The first is filled with values from 0 to 10 along columns, the second is filled with values from -1 to -5 along lines
np.meshgrid(
    np.linspace(0, 10, 5),
    np.linspace(-1, -5, 2)
)

[array([[ 0. ,  2.5,  5. ,  7.5, 10. ],
        [ 0. ,  2.5,  5. ,  7.5, 10. ]]),
 array([[-1., -1., -1., -1., -1.],
        [-5., -5., -5., -5., -5.]])]

### Create random numbers

In [21]:
np.random.normal()  # Single sample drawn from a Gaussian distribution of mean 0 and std 1 (no seed is set, so the value changes at every run)

-0.8621685942143492

In [22]:
np.random.normal(10, 100)  # Single sample drawn from a Gaussian distribution of mean 10 and std 100

-49.25752872383347

In [23]:
np.random.normal(10, 100, (10, 10))  # 10x10 samples drawn from a Gaussian distribution of mean 10 and std 100

array([[ 264.72271038,   13.25642172,  165.33850793,  -13.67488754,
         121.41333336, -128.70355058,  -80.9471369 ,  -79.79136844,
         -92.90481739,  142.36818958],
       [ -38.8218019 ,  -59.60735469,  -20.07797954,   88.22203267,
        -144.62763208,  -74.61220579,   17.48053144,  -46.44694088,
          80.22308271,    3.45168518],
       [ 100.42055005,  -45.75331778,  -69.90469876, -118.45267992,
          78.26682604, -122.27204057,   65.51110986, -178.91192514,
          23.04232591,  138.88128463],
       [  77.01654017,   14.76524028, -101.30574667,  -82.1757426 ,
         -33.09157525,  -19.91225501,   26.7667035 ,  -46.72400009,
          63.04886874,  127.02798274],
       [ -24.81200857,  -91.08396531,  -67.93939686,  -25.8673151 ,
          40.54303771,  -32.74448035,   79.9886998 ,   98.90692371,
         165.87975673,  131.51457106],
       [ -85.36805823,   94.49868253,    5.19334447,   56.20709711,
         -17.39383819,   24.17606952,  -37.76682103,   61

You can try out other distributions

## Accessing data in a table

In [24]:
matrix = np.random.randint(0, 10, size=(4, 3))  # 4 lines, 3 columns of random integers from 0 to 9 (the upper bound 10 is excluded)

In [25]:
matrix

array([[7, 2, 5],
       [3, 3, 5],
       [6, 4, 1],
       [5, 0, 6]])

In [26]:
matrix[0]  # First line

array([7, 2, 5])

In [27]:
matrix[:, 0]  # First column

array([7, 3, 6, 5])

In [28]:
matrix[1:4]  # Lines 1 to 3 (line 1 is the second line, line 0 is the first line)

array([[3, 3, 5],
       [6, 4, 1],
       [5, 0, 6]])

In [29]:
matrix[1:4, 0:2]  # Same lines but only the first two columns

array([[3, 3],
       [6, 4],
       [5, 0]])

In [30]:
matrix[::-1, ::-1]  # Reverse the order of both lines and columns

array([[6, 0, 5],
       [1, 4, 6],
       [5, 3, 3],
       [5, 2, 7]])

In [31]:
matrix[::-1]  # Reverse only the order of lines

array([[5, 0, 6],
       [6, 4, 1],
       [3, 3, 5],
       [7, 2, 5]])

In [32]:
matrix[:, ::-1]  # Reverse only the order of columns

array([[5, 2, 7],
       [5, 3, 3],
       [1, 4, 6],
       [6, 0, 5]])

In [33]:
matrix[::-2]  # Reverse the order of lines, and only take one line every two lines (starting from the last one)

array([[5, 0, 6],
       [3, 3, 5]])

In [34]:
matrix > 3  # Same array with values replaced by True where the condition is met and False everywhere else.

array([[ True, False,  True],
       [False, False,  True],
       [ True,  True, False],
       [ True, False,  True]])

In [35]:
matrix[matrix > 3]  # only keep elements greater than 3 (flattens the array)

array([7, 5, 5, 6, 4, 5, 6])

In [36]:
(matrix > 3) * 1  # Same as matrix > 3 but with 1 instead of True and 0 instead of False

array([[1, 0, 1],
       [0, 0, 1],
       [1, 1, 0],
       [1, 0, 1]])

In [37]:
(matrix > 3) * matrix  # Original matrix with 0 where the value is not greater than 3

array([[7, 0, 5],
       [0, 0, 5],
       [6, 4, 0],
       [5, 0, 6]])

## Modifying data in a table

In [38]:
arr1[0] = 10  # In-place assignment: arr1 itself is modified
arr1

array([10,  2,  3])

In [39]:
matrix[0] = [1, 1, 1]  # Replace the whole first line
matrix

array([[1, 1, 1],
       [3, 3, 5],
       [6, 4, 1],
       [5, 0, 6]])

In [40]:
matrix[:, 0] = [1, 1, 1, 1]  # Replace the whole first column
matrix

array([[1, 1, 1],
       [1, 3, 5],
       [1, 4, 1],
       [1, 0, 6]])

# Pandas

## Series and DataFrames
Pandas supports 1-D arrays (called "Series") and 2-D arrays (called "DataFrames"). The difference with numpy: the columns and lines can have labels.
In a DataFrame, the line labels are called the index, but both line labels and column labels are instances of the pd.Index class.

Pandas is *very* useful when working with time series

### Series

In [41]:
s1 = pd.Series(data=arr1)  # Without an explicit index, the lines are labelled 0, 1, 2

In [42]:
s1

0    10
1     2
2     3
dtype: int64

In [43]:
s1.values

array([10,  2,  3])

In [44]:
s1.index

RangeIndex(start=0, stop=3, step=1)

In [45]:
# Custom labels: one label per element of arr1
s1 = pd.Series(data=arr1, index=["a", "b", "c"])

In [46]:
s1

a    10
b     2
c     3
dtype: int64

In [47]:
s1.index

Index(['a', 'b', 'c'], dtype='object')

In [48]:
s1["a"]

10

In [49]:
# Dates as labels, wrapped in a DatetimeIndex
s1 = pd.Series(data=arr1, index=pd.DatetimeIndex(["2020-01-01", "2020-01-03", "2020-01-06"]))

In [50]:
s1

2020-01-01    10
2020-01-03     2
2020-01-06     3
dtype: int64

In [51]:
s1[s1.index > "2020-01-03"]  # Pandas recognizes "2020-01-03" as a date

2020-01-06    3
dtype: int64

### DataFrames
They are just multiple Series stacked together

In [52]:
df = pd.DataFrame(data=[arr1, arr2])  # arr1 is the first line, arr2 the second

In [53]:
df

Unnamed: 0,0,1,2
0,10,2,3
1,2,4,6


In [54]:
# Same data, with labelled lines ("A", "B") and dates as column labels
df = pd.DataFrame(
    data=[arr1, arr2],
    index=["A", "B"],
    columns=pd.DatetimeIndex(["2020-01-01", "2020-01-03", "2020-01-06"]),
)

In [55]:
df

Unnamed: 0,2020-01-01,2020-01-03,2020-01-06
A,10,2,3
B,2,4,6


In [56]:
s2 = pd.Series(data=arr2, index=pd.DatetimeIndex(["2020-01-01", "2020-01-03", "2020-01-06"]))
# The name of each Series becomes its line label in the DataFrame
s1.name = "A"
s2.name = "B"
df = pd.DataFrame(data=[s1, s2])

In [57]:
df

Unnamed: 0,2020-01-01,2020-01-03,2020-01-06
A,10,2,3
B,2,4,6


In [58]:
df.T  # More useful in this form

Unnamed: 0,A,B
2020-01-01,10,2
2020-01-03,2,4
2020-01-06,3,6


In [59]:
pd.concat([s1, s2], axis=1)  # axis=1 puts the Series side by side: one column per Series, named after it

Unnamed: 0,A,B
2020-01-01,10,2
2020-01-03,2,4
2020-01-06,3,6


In [60]:
pd.concat([s1, s2], axis=1).T

Unnamed: 0,2020-01-01,2020-01-03,2020-01-06
A,10,2,3
B,2,4,6


In [61]:
df.loc["A"]  # Select a line by its label

2020-01-01    10
2020-01-03     2
2020-01-06     3
Name: A, dtype: int64

In [62]:
df["2020-01-01"]  # Select a column by its label

A    10
B     2
Name: 2020-01-01 00:00:00, dtype: int64

In [63]:
df.loc["A", "2020-01-01"]  # Select a single value: line label first, then column label

10