## Numpy

From [cs231n note](http://cs231n.github.io/python-numpy-tutorial/).

In [None]:
import numpy as np

# Build a one-dimensional ("rank 1") array from a Python list and inspect it.
a = np.array([1, 2, 3])   # Create a rank 1 array
print(type(a))            # Prints "<class 'numpy.ndarray'>"
print(a.shape)            # Prints "(3,)" -- a 1-tuple: three entries along the only axis
print(a[0], a[1], a[2])   # Prints "1 2 3"

In [None]:
# Arrays are mutable; this cell modifies the `a` created in the previous cell.
a[0] = 5                  # Change an element of the array
print(a)                  # Prints "[5 2 3]" (NumPy separates elements with spaces, not commas)

# A nested list gives a two-dimensional array; index it with [row, column].
b = np.array([[1, 2, 3],[4, 5, 6]])    # Create a rank 2 array
print(b.shape)                     # Prints "(2, 3)"
print(b[0, 0], b[0, 1], b[1, 0])   # Prints "1 2 4"

In [None]:
# The shape is passed as a tuple; the trailing dots in the output show the
# elements are floating point.
a = np.zeros((2,2))   # Create an array of all zeros
print(a)              # Prints "[[ 0.  0.]
                      #          [ 0.  0.]]"

b = np.ones((1,2))    # Create an array of all ones
print(b)              # Prints "[[ 1.  1.]]"

In [None]:
# np.full takes its dtype from the fill value, so an integer fill gives an
# integer array (pass 7.0 or dtype=float to get floats).
c = np.full((2,2), 7)  # Create a constant array
print(c)               # Prints "[[7 7]
                       #          [7 7]]"

d = np.eye(2)         # Create a 2x2 identity matrix
print(d)              # Prints "[[ 1.  0.]
                      #          [ 0.  1.]]"

# No seed is set here, so the values differ on every run.
e = np.random.random((2,2))  # Create an array filled with random values
print(e)                     # Might print "[[ 0.91940167  0.08143941]
                             #               [ 0.68744134  0.87236687]]"

In [None]:
# Create the following rank 2 array with shape (3, 4)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])

# Use slicing to pull out the subarray consisting of the first 2 rows
# and columns 1 and 2 (slice ends are exclusive); b is the following
# array of shape (2, 2):
# [[2 3]
#  [6 7]]
b = a[:2, 1:3]

# A slice of an array is a view into the same data, so modifying it
# will modify the original array. Call .copy() on the slice when an
# independent array is needed.
print(a[0, 1])   # Prints "2"
b[0, 0] = 77     # b[0, 0] is the same piece of data as a[0, 1]
print(a[0, 1])   # Prints "77"

In [None]:
# Basic arithmetic works elementwise on arrays. Each operator has an
# equivalent NumPy function, and both forms are printed so the results can be
# compared. Note that `*` is an elementwise product, not matrix multiplication.
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce the array
# [[ 6.0  8.0]
#  [10.0 12.0]]
print(x + y)
print(np.add(x, y))

# Elementwise difference; both produce the array
# [[-4.0 -4.0]
#  [-4.0 -4.0]]
print(x - y)
print(np.subtract(x, y))

# Elementwise product; both produce the array
# [[ 5.0 12.0]
#  [21.0 32.0]]
print(x * y)
print(np.multiply(x, y))

# Elementwise division; both produce the array
# [[ 0.2         0.33333333]
#  [ 0.42857143  0.5       ]]
print(x / y)
print(np.divide(x, y))

# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
print(np.sqrt(x))

In [None]:
# `dot` computes inner products and matrix products. It is available both as
# an array method and as a module-level function. x and y are redefined here
# without an explicit dtype, so they hold integers and the results print
# without decimal points.
x = np.array([[1,2],[3,4]])
y = np.array([[5,6],[7,8]])

v = np.array([9,10])
w = np.array([11, 12])

# Inner product of vectors; both produce 219  (9*11 + 10*12)
print(v.dot(w))
print(np.dot(v, w))

# Matrix / vector product; both produce the rank 1 array [29 67]
print(x.dot(v))
print(np.dot(x, v))

# Matrix / matrix product; both produce the rank 2 array
# [[19 22]
#  [43 50]]
print(x.dot(y))
print(np.dot(x, y))

In [None]:
# `axis` names the dimension that is summed away: axis=0 adds down the rows
# (one total per column), axis=1 adds across the columns (one total per row).
x = np.array([[1,2],[3,4]])

print(np.sum(x))  # Compute sum of all elements; prints "10"
print(np.sum(x, axis=0))  # Compute sum of each column; prints "[4 6]"
print(np.sum(x, axis=1))  # Compute sum of each row; prints "[3 7]"

### Optional: Advanced Numpy

In [None]:
# Check http://cs231n.github.io/python-numpy-tutorial/

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Compute the x and y coordinates for points on a sine curve
# (x runs from 0 up to, but not including, 3*pi in steps of 0.1)
x = np.arange(0, 3 * np.pi, 0.1)
y = np.sin(x)

# Plot the points using matplotlib
plt.plot(x, y)
plt.show()  # Required when run as a script; in a notebook it also keeps the Line2D repr out of the cell output.

In [None]:
# Sample both curves on a shared grid: 0 up to 3*pi in steps of 0.1
x = np.arange(0, 3 * np.pi, 0.1)
y_sin, y_cos = np.sin(x), np.cos(x)

# Draw each curve with its legend entry attached, then decorate the axes
plt.plot(x, y_sin, label='Sine')
plt.plot(x, y_cos, label='Cosine')
plt.title('Sine and Cosine')
plt.xlabel('x axis label')
plt.ylabel('y axis label')
plt.legend()
plt.show()

In [None]:
# Compute the x and y coordinates for points on sine and cosine curves
x = np.arange(0, 3 * np.pi, 0.1)
y_sin = np.sin(x)
y_cos = np.cos(x)

# Set up a subplot grid that has height 2 and width 1,
# and set the first such subplot as active.
plt.subplot(2, 1, 1)

# Make the first plot
plt.plot(x, y_sin)
plt.title('Sine')

# Set the second subplot as active, and make the second plot.
plt.subplot(2, 1, 2)
plt.plot(x, y_cos)
plt.title('Cosine')

# Re-space the two axes; with the default spacing the lower title ('Cosine')
# runs into the tick labels of the upper plot.
plt.tight_layout()

# Show the figure.
plt.show()

## Pandas - powerful Python data analysis toolkit

Pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python.

http://pandas.pydata.org/pandas-docs/stable/index.html

When you need to install a package, you can do it from inside the notebook as well:

```python
import sys
!{sys.executable} -m pip install numpy
```

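This runs

`$ python -m pip install <package>`

using the same Python interpreter as the notebook kernel (`sys.executable`), rather than

`$ pip install <package>`

which may install the package into a different Python environment than the one the notebook is running in.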


In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Report the pandas version in use (the recorded output shows 0.22.0);
# display formats and some APIs differ between versions.
print("pandas version: ", pd.__version__)

pandas version:  0.22.0


In [10]:
# First let's create a DataFrame.
# Its row index will be six consecutive daily timestamps starting 2017-01-01.

dates = pd.date_range('20170101', periods=6)
dates

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# 6x4 frame of standard-normal samples, rows labelled by `dates` and columns
# A-D. No seed is set, so re-running this cell produces different numbers
# from the ones recorded in the outputs.
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2017-01-01,-0.104268,0.160382,-0.342482,0.952325
2017-01-02,-0.299426,-0.791099,2.076924,0.511261
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699
2017-01-04,-1.610679,-0.259682,0.951348,0.302981
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664
2017-01-06,0.241598,0.475707,1.022993,-0.204034


In [14]:
# Build a frame from a dict: every key becomes a column. The scalar entries
# ('A', 'B', 'F') are repeated for all four rows set by the array-like columns.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

In [15]:
# Display the frame: the scalar columns (A, B, F) were repeated on every row
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [16]:
# Unlike a NumPy array, each column of a DataFrame carries its own dtype
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [18]:
### show available "column names" and "public attributes"

# df2.<Press Tab>

In [19]:
# First rows of the frame (five when no count is given)
df.head()

Unnamed: 0,A,B,C,D
2017-01-01,-0.104268,0.160382,-0.342482,0.952325
2017-01-02,-0.299426,-0.791099,2.076924,0.511261
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699
2017-01-04,-1.610679,-0.259682,0.951348,0.302981
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664


In [21]:
# Last three rows of the frame
df.tail(3)

Unnamed: 0,A,B,C,D
2017-01-04,-1.610679,-0.259682,0.951348,0.302981
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664
2017-01-06,0.241598,0.475707,1.022993,-0.204034


In [22]:
# Row labels: the DatetimeIndex that was passed as `index=dates`
df.index

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06'],
              dtype='datetime64[ns]', freq='D')

In [23]:
# Column labels
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [24]:
# The underlying data as a plain NumPy array; row and column labels are dropped
df.values

array([[-0.10426804,  0.16038224, -0.3424818 ,  0.95232544],
       [-0.29942581, -0.79109943,  2.07692409,  0.51126098],
       [-0.35429399, -1.73747962, -0.60349297,  0.7446988 ],
       [-1.61067908, -0.25968172,  0.95134791,  0.3029807 ],
       [-0.24430433, -0.39754876, -0.00593623, -1.20566374],
       [ 0.2415979 ,  0.47570731,  1.02299288, -0.20403441]])

In [25]:
# Quick per-column summary statistics: count, mean, std, min, quartiles, max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.395229,-0.424953,0.516559,0.183595
std,0.632639,0.779572,1.014468,0.788378
min,-1.610679,-1.73748,-0.603493,-1.205664
25%,-0.340577,-0.692712,-0.258345,-0.077281
50%,-0.271865,-0.328615,0.472706,0.407121
75%,-0.139277,0.055366,1.005082,0.686339
max,0.241598,0.475707,2.076924,0.952325


In [26]:
# Transpose: rows become columns and vice versa (df itself is not modified)
df.T

Unnamed: 0,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00
A,-0.104268,-0.299426,-0.354294,-1.610679,-0.244304,0.241598
B,0.160382,-0.791099,-1.73748,-0.259682,-0.397549,0.475707
C,-0.342482,2.076924,-0.603493,0.951348,-0.005936,1.022993
D,0.952325,0.511261,0.744699,0.302981,-1.205664,-0.204034


In [27]:
# The original frame is unchanged by the transpose above
df

Unnamed: 0,A,B,C,D
2017-01-01,-0.104268,0.160382,-0.342482,0.952325
2017-01-02,-0.299426,-0.791099,2.076924,0.511261
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699
2017-01-04,-1.610679,-0.259682,0.951348,0.302981
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664
2017-01-06,0.241598,0.475707,1.022993,-0.204034


In [33]:
# Sort by the labels along an axis (axis=0: row index, axis=1: column names);
# here the rows are put in descending date order.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2017-01-06,0.241598,0.475707,1.022993,-0.204034
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664
2017-01-04,-1.610679,-0.259682,0.951348,0.302981
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699
2017-01-02,-0.299426,-0.791099,2.076924,0.511261
2017-01-01,-0.104268,0.160382,-0.342482,0.952325


In [37]:
# Order the rows by the values in column B, smallest first
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699
2017-01-02,-0.299426,-0.791099,2.076924,0.511261
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664
2017-01-04,-1.610679,-0.259682,0.951348,0.302981
2017-01-01,-0.104268,0.160382,-0.342482,0.952325
2017-01-06,0.241598,0.475707,1.022993,-0.204034


### Selection

In [43]:
# Selecting a single column by name returns a Series
df['A']

2017-01-01   -0.104268
2017-01-02   -0.299426
2017-01-03   -0.354294
2017-01-04   -1.610679
2017-01-05   -0.244304
2017-01-06    0.241598
Freq: D, Name: A, dtype: float64

In [47]:
# A slice inside [] selects rows by position: here the first three rows
df[0:3]

Unnamed: 0,A,B,C,D
2017-01-01,-0.104268,0.160382,-0.342482,0.952325
2017-01-02,-0.299426,-0.791099,2.076924,0.511261
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699


In [49]:
# Slicing by index label (date strings); unlike positional slices, both endpoints are included
df['20170102':'20170104']

Unnamed: 0,A,B,C,D
2017-01-02,-0.299426,-0.791099,2.076924,0.511261
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699
2017-01-04,-1.610679,-0.259682,0.951348,0.302981


In [51]:
# Selection by label: the row for the first date, returned as a Series
# indexed by column name
df.loc[dates[0]]

A   -0.104268
B    0.160382
C   -0.342482
D    0.952325
Name: 2017-01-01 00:00:00, dtype: float64

In [54]:
# Selection by integer position: rows 3-4 and columns 0-1
# (slice ends are exclusive, as with Python lists)
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2017-01-04,-1.610679,-0.259682
2017-01-05,-0.244304,-0.397549


#### Boolean Indexing
Using a single column's values to select data

In [55]:
# `df.A > 0` is a boolean Series; indexing with it keeps only the rows where it is True
df[df.A > 0]

Unnamed: 0,A,B,C,D
2017-01-06,0.241598,0.475707,1.022993,-0.204034


In [56]:
# Attribute access to a column; gives the same Series as df['A']
df.A

2017-01-01   -0.104268
2017-01-02   -0.299426
2017-01-03   -0.354294
2017-01-04   -1.610679
2017-01-05   -0.244304
2017-01-06    0.241598
Freq: D, Name: A, dtype: float64

In [58]:
# Masking with a boolean frame keeps the shape; entries that fail the condition become NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2017-01-01,,0.160382,,0.952325
2017-01-02,,,2.076924,0.511261
2017-01-03,,,,0.744699
2017-01-04,,,0.951348,0.302981
2017-01-05,,,,
2017-01-06,0.241598,0.475707,1.022993,


In [59]:
# Set-up for the isin() example: work on a copy so that df itself is not
# changed, and add a column of string labels. Note that this rebinds the name
# df2, which referred to a different frame earlier in the notebook.

df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2017-01-01,-0.104268,0.160382,-0.342482,0.952325,one
2017-01-02,-0.299426,-0.791099,2.076924,0.511261,one
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699,two
2017-01-04,-1.610679,-0.259682,0.951348,0.302981,three
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664,four
2017-01-06,0.241598,0.475707,1.022993,-0.204034,three


In [60]:
# isin() builds a boolean mask; keep the rows whose E value is 'two' or 'four'
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699,two
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664,four


In [74]:
# A Series indexed by six daily dates starting 2017-01-02, i.e. shifted one
# day later than df's index.
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20170102', periods=6))
s1

2017-01-02    1
2017-01-03    2
2017-01-04    3
2017-01-05    4
2017-01-06    5
2017-01-07    6
Freq: D, dtype: int64

In [75]:
# Assigning a Series as a new column aligns it on the index: 2017-01-01 has no
# entry in s1 and so becomes NaN, while s1's 2017-01-07 entry is dropped. The
# column is shown as floats because of the NaN. Note this modifies df in place.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2017-01-01,-0.104268,0.160382,-0.342482,0.952325,
2017-01-02,-0.299426,-0.791099,2.076924,0.511261,1.0
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699,2.0
2017-01-04,-1.610679,-0.259682,0.951348,0.302981,3.0
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664,4.0
2017-01-06,0.241598,0.475707,1.022993,-0.204034,5.0


In [78]:
# Set a single value by label: row dates[0] (2017-01-01), column 'A'
df.at[dates[0],'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2017-01-01,0.0,0.160382,-0.342482,0.952325,
2017-01-02,-0.299426,-0.791099,2.076924,0.511261,1.0
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699,2.0
2017-01-04,-1.610679,-0.259682,0.951348,0.302981,3.0
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664,4.0
2017-01-06,0.241598,0.475707,1.022993,-0.204034,5.0


In [81]:
# Set a single value by integer position: row 0, column 1 (column B)
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2017-01-01,0.0,0.0,-0.342482,0.952325,
2017-01-02,-0.299426,-0.791099,2.076924,0.511261,1.0
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699,2.0
2017-01-04,-1.610679,-0.259682,0.951348,0.302981,3.0
2017-01-05,-0.244304,-0.397549,-0.005936,-1.205664,4.0
2017-01-06,0.241598,0.475707,1.022993,-0.204034,5.0


### Missing Data

pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations. 

Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.


In [83]:
# Keep the first four dates and add a new column 'E', which starts out
# entirely NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# Label slices include both endpoints, so this fills E for the first two rows
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2017-01-01,0.0,0.0,-0.342482,0.952325,,1.0
2017-01-02,-0.299426,-0.791099,2.076924,0.511261,1.0,1.0
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699,2.0,
2017-01-04,-1.610679,-0.259682,0.951348,0.302981,3.0,


In [84]:
# Drop every row that contains at least one missing value; returns a new frame, df1 is unchanged
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2017-01-02,-0.299426,-0.791099,2.076924,0.511261,1.0,1.0


In [85]:
# Replace every missing value with 5; returns a new frame, df1 is unchanged
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2017-01-01,0.0,0.0,-0.342482,0.952325,5.0,1.0
2017-01-02,-0.299426,-0.791099,2.076924,0.511261,1.0,1.0
2017-01-03,-0.354294,-1.73748,-0.603493,0.744699,2.0,5.0
2017-01-04,-1.610679,-0.259682,0.951348,0.302981,3.0,5.0


In [86]:
# Boolean mask of the same shape as df1: True where a value is missing
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2017-01-01,False,False,False,False,True,False
2017-01-02,False,False,False,False,False,False
2017-01-03,False,False,False,False,False,True
2017-01-04,False,False,False,False,False,True


### Stats

In [87]:
# Mean of each column. Missing values are skipped: F averages its five
# non-missing entries (1..5), giving 3.0.
df.mean()

A   -0.377851
B   -0.451684
C    0.516559
D    0.183595
F    3.000000
dtype: float64

In [88]:
# Same statistic along the other axis: the mean across the columns of each row
df.mean(axis=1)

2017-01-01    0.152461
2017-01-02    0.499532
2017-01-03    0.009886
2017-01-04    0.476794
2017-01-05    0.429309
2017-01-06    1.307253
Freq: D, dtype: float64

### Optional Reading

How to visualize and preprocess data

https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling?scriptVersionId=1416377