In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the HR dataset from a CSV file located in the current working directory.
# NOTE(review): `hr` is not referenced by any cell visible in this part of the
# notebook — confirm it is used later, otherwise this load can be dropped.
hr = pd.read_csv('HR_comma_sep.csv')

### Intro to data structures
* Series
* DataFrame
  <hr/>

### Series

1. From ndarray

In [None]:
# Constructor template only: `data` and `index` are placeholders.
# NOTE(review): neither name is assigned in an earlier visible cell and this
# cell has no execution count, so running it top-to-bottom on a fresh kernel
# presumably raises NameError — treat it as an illustration of the call shape.
s = pd.Series(data, index=index)

In [20]:
# Draw the five values from a seeded generator so Restart & Run All yields the
# same Series every time (the unseeded np.random.randn(5) gave new numbers on
# each run). Outputs saved from an earlier unseeded run will differ until the
# notebook is re-executed.
rng = np.random.default_rng(0)
s1 = pd.Series(rng.standard_normal(5), index=['a', 'b', 'c', 'd', 'e'])
s1

a   -0.078632
b   -0.397386
c   -0.202743
d    0.136333
e    1.788507
dtype: float64

2. From dict

In [9]:
# Source mapping for the Series; insertion order is b, a, c.
d = dict(b=1, a=0, c=2)
# Only the label 'c' is present in `d`; the other requested labels have no value.
s2 = pd.Series(data=d, index=list('zxcv'))

* Series is ndarray-like

- Indexing & Slicing

In [23]:
# s1 is labelled with strings, so positional access goes through .iloc.
# Plain s1[0] / s1[[4, 3, 1]] relied on the deprecated fallback that treats
# integer keys as positions when the index is not integer-typed.
s1.iloc[0]            # first element
s1.iloc[1:]           # everything after the first element
s1.iloc[:3]           # first three elements
s1[s1 >= s1.median()]  # boolean mask: values at or above the median
s1.iloc[[4, 3, 1]]    # pick positions 4, 3, 1 in that order
# Only this last expression is displayed as the cell output.
np.exp(s1)

a    0.924380
b    0.672075
c    0.816488
d    1.146063
e    5.980516
dtype: float64

In [24]:
# dtype of the values stored in s1.
s1.dtype

dtype('float64')

In [25]:
# Values of s1 as a NumPy array; the index labels are not part of the result.
s1.to_numpy()

array([-0.07863187, -0.3973858 , -0.20274263,  0.13633262,  1.7885069 ])

* Series is dict-like

In [28]:
# Dict-style assignment: 'z' is not one of the labels s1 was created with, so
# this adds a new entry. Note that it mutates s1 in place, so every later cell
# sees the extra 'z' row.
s1['z'] = 12
s1

a    -0.078632
b    -0.397386
c    -0.202743
d     0.136333
e     1.788507
z    12.000000
dtype: float64

In [29]:
# 'f' is not a label of s1, so the supplied default (np.nan) is returned.
s1.get('f', np.nan)

nan

In [31]:
# 'z' is present, so its stored value is returned and the default is not used.
s1.get('z', np.nan)

12.0

### Vectorized operations and label alignment with Series

In [32]:
# Element-wise addition; both operands have the same labels, so every label gets a value.
s1 + s1

a    -0.157264
b    -0.794772
c    -0.405485
d     0.272665
e     3.577014
z    24.000000
dtype: float64

In [34]:
# Multiplying by a scalar applies to every element.
s1 * 3

a    -0.235896
b    -1.192157
c    -0.608228
d     0.408998
e     5.365521
z    36.000000
dtype: float64

In [35]:
# A NumPy ufunc applied to a Series returns a Series with the same labels.
np.exp(s1)

a         0.924380
b         0.672075
c         0.816488
d         1.146063
e         5.980516
z    162754.791419
dtype: float64

In [36]:
# Operands are aligned by label, not by position: s1[1:] lacks the first label
# and s1[:-1] lacks the last one, so the result covers the union of labels with
# NaN wherever one side has no value.
s1[1:] + s1[:-1]

a         NaN
b   -0.794772
c   -0.405485
d    0.272665
e    3.577014
z         NaN
dtype: float64

* Name attribute

In [39]:
# Five standard-normal draws from NumPy's global generator, stored under a Series name.
s = pd.Series(data=np.random.standard_normal(5), name='something')
# Display the name that was passed to the constructor.
s.name

'something'

In [42]:
# rename() with a scalar yields a Series whose name is that scalar.
# NOTE(review): this rebinds `s2`, which an earlier cell used for the
# dict-built Series — consider a distinct name.
s2 = s.rename("different")
s2.name

'different'

   * <hr />

### DataFrame

* From dict of Series or dicts

In [43]:
# Two Series of different lengths: 'one' covers labels a-c, 'two' covers a-d.
d = {
    'one': pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    'two': pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
}

In [44]:
# Build a DataFrame from the dict of Series `d`. The rows are labelled a-d;
# 'one' has no entry for 'd', so that cell is NaN in the displayed result.
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [45]:
# index= selects the listed row labels and fixes their order (d, b, a).
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [46]:
# columns= keeps 'two' and requests 'three'; 'three' is not a key of `d`,
# so that column is entirely NaN.
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [47]:
# Row labels of df.
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [48]:
# Column labels of df.
df.columns

Index(['one', 'two'], dtype='object')

* From dict of ndarrays / lists

In [49]:
# Plain lists of equal length, one per column.
d = dict(
    one=[1.0, 2.0, 3.0, 4.0],
    two=[4.0, 3.0, 2.0, 1.0],
)

In [50]:
# With no index given, the rows are labelled 0..3.
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [51]:
# Same data with explicit row labels; four labels for the four values per column.
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


* From a structured or record array

In [52]:
# Structured array with three named fields: A (int32), B (float32) and
# C (10-byte string). 'S10' replaces the original 'a10': the "a" type code is
# a deprecated alias for the same bytes dtype, so the stored data is unchanged.
data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])

# Fill both records at once; the str values end up as bytes in field C.
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]

# Each field of the structured array becomes a DataFrame column.
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [53]:
# Explicit row labels for the two records.
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [54]:
# columns= puts the structured-array fields in the requested order (C, A, B).
pd.DataFrame(data, columns=['C', 'A', 'B'])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


* From a list of dicts

In [55]:
# One dict per row; the second row carries an extra key 'c'.
data2 = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
# Keys become columns; the first row has no 'c', so that cell is NaN.
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [56]:
# Same list of dicts with explicit row labels.
pd.DataFrame(data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [57]:
# columns= keeps only 'a' and 'b'; the 'c' key of the second dict is left out.
pd.DataFrame(data2, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


* From a dict of tuples

In [61]:
# Nested mapping keyed by 2-tuples: outer keys label the columns and inner keys
# label the rows of the frame built from it.
# NOTE(review): the name `tuple` shadows the built-in type for the rest of the
# session. It is kept here because the following cell reads it — rename both
# together (e.g. to `nested`).
tuple = dict([
    (('a', 'b'), {('A', 'B'): 1, ('A', 'C'): 2}),
    (('a', 'a'), {('A', 'C'): 3, ('A', 'B'): 4}),
    (('a', 'c'), {('A', 'B'): 5, ('A', 'C'): 6}),
    (('b', 'a'), {('A', 'C'): 7, ('A', 'B'): 8}),
    (('b', 'b'), {('A', 'D'): 9, ('A', 'B'): 10}),
])

In [63]:
# Tuple keys produce two-level labels: the outer dict keys form the columns and
# the inner dict keys form the rows. Combinations absent from an inner dict
# show up as NaN in the displayed frame.
res = pd.DataFrame(tuple)
res

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


### Missing data
* np.nan

In [66]:
# Look up the top-level column key 'b'; np.nan is the fallback for a missing key.
res.get('b', default=np.nan)

Unnamed: 0,Unnamed: 1,a,b
A,B,8.0,10.0
A,C,7.0,
A,D,,9.0


  * <hr />

### 10 minutes to pandas

### Viewing data
* head()
* tail()
* index
* columns
* to_numpy()
* describe()
* T = Transposing
* sort_index(axis=1, ascending=False)
* sort_values(by='')

### Selection

### Setting

In [None]:

    - df['F'] = s1                             => Setting a new column automatically aligns the data by the indexes
    - df.at[dates[0], 'A'] = 0                 => Setting values by label
    - df.iat[0, 1] = 0                         => Setting values by position
    - df.loc[:, 'D'] = np.array([5] * len(df)) => Setting by assigning with a NumPy array
    - df2[df2 > 0] = -df2                      => A where operation with setting

### Missing data
* dropna()
* fillna()
* isna()

### Operations
* Stats
    - mean()
    - shift()
    - sub()
* apply()
* Histogramming
    - value_counts()
* String Methods
    - cat()
    - split()
    - rsplit()
    - get()
    - join()
    - contains()
    - repeat()
    - replace()
    - pad()
    - center()
    - slice()
    - count()
    - lower()
    - upper()

### Merge
* concat()
* Join
   - merge(d1,d2)

### Grouping
* split-apply-combine