In [2]:
import numpy as np
np.__version__

'1.18.1'

In [3]:
import pandas as pd
pd.__version__

'1.0.2'

## Pandas Series
### Constructing Series objects
> pd.Series(data, index=index)

In [4]:
# Build a Series from a plain list; pandas supplies a default integer index.
data = pd.Series([0.25, 0.5, 0.75, 1.0])
# Show the Series, then its two components: the value array and the index.
for view in (data, data.values, data.index):
    print(view)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [5]:
labels = ['a', 'b', 'c', 'd']

# A scalar is repeated once for every index label.
data = pd.Series(1, index=labels)
print(data)

# A list of values is paired position by position with the explicit index.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=labels)
print(data)

# Passing an existing Series plus an index picks out (and reorders) those labels.
subset = pd.Series(data, index=['c', 'a'])
print(subset)


a    1
b    1
c    1
d    1
dtype: int64
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
c    0.75
a    0.25
dtype: float64


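> When creating a `pd.Series` from a list or array, the index must have the <span style="color:Crimson">same length as the data</span>.
>
> *data = pd.Series([0.25, 0.5, 0.75, 1.0], index =['a', 'b'])* would raise a `ValueError`.
>
> However, when the data is an existing Series, the `index` argument may be shorter: it selects (and reorders) entries by label, as `pd.Series(data, index=['c', 'a'])` does above.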
> 

#### data can be a dictionary, in which case the index defaults to the dictionary keys in insertion order (not sorted)

In [6]:
# The dict keys become the index, in the order the dict lists them.
print(pd.Series({2: 'a', 1: 'b', 3: 'c'}))

# State name -> area value.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
print(area)

# State name -> population value.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)
print(population)

2    a
1    b
3    c
dtype: object
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64


## Pandas DataFrame
### 
#### DataFrame as a generalized NumPy array

In [7]:
# Same state data as in the Series section, rebuilt so this cell stands alone.
state_names = ['California', 'Texas', 'New York', 'Florida', 'Illinois']

area_dict = dict(zip(state_names, [423967, 695662, 141297, 170312, 149995]))
area = pd.Series(area_dict)
print(area)

population_dict = dict(
    zip(state_names, [38332521, 26448193, 19651127, 19552860, 12882135])
)
population = pd.Series(population_dict)
print(population)

# Each Series becomes one column; rows are matched on the shared state labels.
states = pd.DataFrame({'population': population, 'area': area})
states

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64


Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


#### DataFrame as specialized dictionary

In [8]:
# Dictionary-style access: each column name maps to a Series.
for column in ('area', 'population'):
    print(states[column])

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64


### Constructing DataFrame objects
#### From a single Series object.

In [9]:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


#### From a list of dicts

In [10]:
# One dict per row; the keys 'a' and 'b' become the column names.
data = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


#### From a two-dimensional NumPy array

In [11]:
pd.DataFrame(np.random.rand(3, 2),
columns=['foo', 'bar'],
index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.205062,0.146681
b,0.629234,0.111652
c,0.827486,0.679403


#### From a NumPy structured array

In [12]:
# Structured dtype with an 8-byte integer field 'A' and an 8-byte float field 'B'.
record_dtype = np.dtype([('A', 'i8'), ('B', 'f8')])
A = np.zeros(3, dtype=record_dtype)
print(A)

# Each field of the structured array becomes a column.
Data = pd.DataFrame(A)
print(Data)

# Same data with explicit row labels instead of the default 0..2.
pd.DataFrame(A, index=['a', 'b', 'c'])

[(0, 0.) (0, 0.) (0, 0.)]
   A    B
0  0  0.0
1  0  0.0
2  0  0.0


Unnamed: 0,A,B
a,0,0.0
b,0,0.0
c,0,0.0


### The Pandas Index Object

#### Data Selection in Series

In [13]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data)
print(data['b'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5


#### Data Selection in DataFrame

In [14]:
# Two Series that share the same state labels.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})

# Combine them into a two-column frame keyed by state.
data = pd.DataFrame(dict(area=area, pop=pop))
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [15]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [16]:
print(data.values)

[[4.23967000e+05 3.83325210e+07 9.04139261e+01]
 [6.95662000e+05 2.64481930e+07 3.80187404e+01]
 [1.41297000e+05 1.96511270e+07 1.39076746e+02]
 [1.70312000e+05 1.95528600e+07 1.14806121e+02]
 [1.49995000e+05 1.28821350e+07 8.58837628e+01]]


In [17]:
# First row of the underlying NumPy array (all three columns, as floats).
print(data.values[0])
# Label-based slice: the end label 'Florida' is included in the result.
print(data.loc["California":"Florida"])
# Position-based slice: rows 0 to 3; the stop position 4 is excluded.
print(data.iloc[0:4])
# First three rows and first two columns, selected by position.
data.iloc[:3, :2]

[4.23967000e+05 3.83325210e+07 9.04139261e+01]
              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [18]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [19]:
# The .ix indexer (Series.ix and DataFrame.ix) is deprecated and has been removed from recent pandas releases,
# so the following line is kept only for reference and is not executed:
# data.ix[:3, :'pop']

In [20]:
data.loc[data.density > 100, ['area', 'pop', 'density']]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [21]:
data.loc[data.density > 100, ['area', 'pop', 'density']]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [22]:
# Rows: states whose density exceeds 100.
# Columns: those whose value in the 'New York' row exceeds 140, so 'area' and
# 'pop' are kept while 'density' (139.08 for New York) is dropped.
data.loc[data.density > 100, data.loc['New York'] > 140]

Unnamed: 0,area,pop
New York,141297,19651127
Florida,170312,19552860


In [23]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [24]:
data['Florida':'Illinois']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [25]:
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


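> Take care: plain `[]` indexing on a DataFrame selects rows <span style="color:Crimson">only when given a slice</span>.
> A single integer such as `data[1]` is looked up as a column label instead, so it does not return the second row (here it raises a `KeyError`); use `data.iloc[1]` for that.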

## Operating on Data in Pandas
### UFuncs: Index Alignment

In [26]:
# The two Series share only the index labels 1 and 2.
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
# Addition aligns on the index; labels present in only one operand give NaN.
print(A + B)
# fill_value=0 stands in for the missing operand, so labels 0 and 3 keep their single value.
print(A.add(B, fill_value=0))

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64
0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64


In [27]:
# Seeded generator, so the random frames are the same on every run.
rng = np.random.RandomState(42)
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=['A','B'])
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
print(A)
print(B)
# Alignment happens on both index and columns; cells missing from A become NaN.
print(A+B)
# Mean of every value in A (stack() flattens the frame into a single Series).
fill = A.stack().mean()
print(fill)
# Use that mean wherever A has no entry, so no cell of the result is NaN.
A.add(B, fill_value=fill)

    A   B
0   6  19
1  14  10
   B  A  C
0  7  4  6
1  9  2  6
2  7  4  3
      A     B   C
0  10.0  26.0 NaN
1  16.0  19.0 NaN
2   NaN   NaN NaN
12.25


Unnamed: 0,A,B,C
0,10.0,26.0,18.25
1,16.0,19.0,18.25
2,16.25,19.25,15.25


### Ufuncs: Operations Between DataFrame and Series

In [28]:
# Seeded 3x4 array of integers below 10.
rng = np.random.RandomState(42)
A = rng.randint(10, size=(3, 4))
print(A)
# NumPy broadcasting: the first row is subtracted from every row.
A-A[0]

[[6 3 7 4]
 [6 9 2 6]
 [7 4 3 7]]


array([[ 0,  0,  0,  0],
       [ 0,  6, -5,  2],
       [ 1,  1, -4,  3]])

In [29]:
# NOTE: index=[list('ABC')] wraps the labels in an outer list, so pandas builds
# a one-level MultiIndex (labels ('A',), ('B',), ('C',)) rather than a flat
# index; index=list('ABC') would give a flat index.
df = pd.DataFrame(A, index=[list('ABC')], columns=list('QRST'))
print(df)
# df.iloc[0] is a Series, so the subtraction is applied to every row.
print(df - df.iloc[0])
# df.iloc[0:2] is a DataFrame: alignment is on both axes, so row C has no partner and becomes NaN.
print(df - df.iloc[0:2])
# The Series is named ('A',): the tuple reflects the MultiIndex described above.
print(df.iloc[0])
# On this MultiIndex, .loc['A'] returns a one-row DataFrame rather than a Series ...
print(df.loc['A'])
# ... so this is DataFrame-minus-DataFrame alignment again, and rows B and C become NaN.
print(df - df.loc['A'])

   Q  R  S  T
A  6  3  7  4
B  6  9  2  6
C  7  4  3  7
   Q  R  S  T
A  0  0  0  0
B  0  6 -5  2
C  1  1 -4  3
     Q    R    S    T
A  0.0  0.0  0.0  0.0
B  0.0  0.0  0.0  0.0
C  NaN  NaN  NaN  NaN
Q    6
R    3
S    7
T    4
Name: (A,), dtype: int64
   Q  R  S  T
A  6  3  7  4
     Q    R    S    T
A  0.0  0.0  0.0  0.0
B  NaN  NaN  NaN  NaN
C  NaN  NaN  NaN  NaN


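> * Subtracting a **Series** from a DataFrame broadcasts row by row. `df.iloc[n]` returns a Series, so `df - df.iloc[0]` follows the same broadcasting rule as NumPy.
>
> * Subtracting a **DataFrame** <span style="color:Crimson">DOES NOT </span> broadcast: both the index and the columns are aligned, and cells without a partner become NaN (the missing-data rules). That is the case for a multi-row slice such as `df.iloc[0:2]`, and also for `df.loc['A']` in this example, because `index=[list('ABC')]` created a one-level MultiIndex, on which `.loc['A']` returns a one-row DataFrame rather than a Series.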

In [30]:
# axis=0 matches the Series df['R'] against the row index, so column R is subtracted from every column.
print(df.subtract(df['R'], axis=0))
# Subtracting a two-column DataFrame aligns on columns; S and T have no partner and become NaN.
print(df - df.iloc[:,0:2])

   Q  R  S  T
A  3  0  4  1
B -3  0 -7 -3
C  3  0 -1  3
   Q  R   S   T
A  0  0 NaN NaN
B  0  0 NaN NaN
C  0  0 NaN NaN


In [31]:
# Time the sum of one million elements stored with dtype 'object' versus dtype 'int'.
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()

dtype = object
37.9 ms ± 583 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
dtype = int
1.2 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [32]:
# An array containing None is printed with the None kept as-is.
vals1 = np.array([1, None, 3, 4])
print(vals1)
# With np.nan the array is a regular float64 array.
vals2 = np.array([1, np.nan, 3, 4])
print(vals2)
print(vals2.dtype)
# Ordinary aggregations propagate NaN ...
print(vals2.sum(), vals2.min(), vals2.max())
# ... while the nan-prefixed versions ignore it.
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2))

[1 None 3 4]
[ 1. nan  3.  4.]
float64
nan nan nan
8.0 1.0 4.0


In [33]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3,      5],
                   [np.nan, 4,6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [34]:
# Default dropna(): drop every row that contains at least one NaN.
print(df.dropna())
print()
# axis="columns": drop every column that contains at least one NaN.
print(df.dropna(axis="columns"))
# Add a column that is entirely NaN (this modifies df in place for the rest of the cell).
df[3] = np.nan
print()
print(df)
print()
# how='all': drop only the columns in which every value is NaN.
print(df.dropna(axis='columns', how='all'))
print()
# thresh=3: keep only the columns that have at least three non-NaN values.
print(df.dropna(axis='columns', thresh=3))
print()
# Replace every NaN with 0.
print(df.fillna(0))
print()
# Forward-fill down each column. DataFrame.ffill is the direct spelling of
# fillna(method='ffill'); the `method` argument of fillna is deprecated in
# newer pandas releases, while ffill gives the same result on old and new versions.
print(df.ffill(axis=0))

     0    1  2
1  2.0  3.0  5

   2
0  2
1  5
2  6

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6

   2
0  2
1  5
2  6

     0    1  2    3
0  1.0  0.0  2  0.0
1  2.0  3.0  5  0.0
2  0.0  4.0  6  0.0

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  2.0  4.0  6 NaN


## Hierarchical Indexing
### Methods of MultiIndex Creation

In [35]:
# A list of two label arrays as the index creates a two-level row MultiIndex.
# The data comes from an unseeded np.random.rand, so the values change between runs.
df = pd.DataFrame(np.random.rand(4, 2), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
                  columns=['data1', 'data2'])
# The columns remain an ordinary flat Index.
print(df.columns)
df

Index(['data1', 'data2'], dtype='object')


Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.725767,0.158989
a,2,0.612323,0.133591
b,1,0.198056,0.910229
b,2,0.978353,0.05873


In [36]:
# (value for 2000, value for 2010) for each state.
state_counts = {
    'California': (33871648, 37253956),
    'Texas': (20851820, 25145561),
    'New York': (18976457, 19378102),
}

# Flatten into {(state, year): value}; the tuple keys become a two-level index.
data = {
    (state, year): count
    for state, counts in state_counts.items()
    for year, count in zip((2000, 2010), counts)
}
Data = pd.Series(data)
Data

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

#### MultiIndex level names

In [37]:
Data.index.names = ['state', 'year']
Data

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

#### Explicit MultiIndex constructors

In [38]:
# Four ways to build the same four-entry MultiIndex.
# 1. From parallel arrays, one array per level (with level names).
print(pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
                                names=['Alpha', 'Num']))
print()
# 2. From a list of tuples, one tuple per entry (with level names).
print(pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)],
                               names=['Alpha', 'Num']))
print()
# 3. From the Cartesian product of the level values (no names given).
print(pd.MultiIndex.from_product([['a', 'b'], [1, 2]]))
print()
# 4. Directly from the level values plus integer codes that index into them.
print(pd.MultiIndex(levels=[['a', 'b'], [1, 2]], 
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['Alpha', 'Num'])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['Alpha', 'Num'])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


#### Pandas MultiIndex

In [39]:
# (state, year) pairs; `index` starts as a list of tuples and is converted to a MultiIndex below.
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,18976457, 19378102,20851820, 25145561]
index = pd.MultiIndex.from_tuples(index)
pop = pd.Series(populations, index=index)
print(pop)
# unstack() moves the inner level (year) to the columns, giving a state-by-year table.
pop_df = pop.unstack()
print()
print(pop_df)
# pop_df is rebound here: a frame on the same MultiIndex with a second column supplied as a plain list.
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,4687374, 4318033,5906301, 6879014]})
print()
print(pop_df)
# Ratio of the two columns for every (state, year), reshaped to a state-by-year table.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561

                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014


Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Indexing and Slicing a MultiIndex
#### Multiply indexed DataFrames slicing

In [40]:
# Row index: (year, visit); column index: (subject, type).
index = pd.MultiIndex.from_product([[2013, 2014], ['a', 'b']],names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])
# Unseeded standard-normal noise rounded to one decimal; the values differ on every run.
data = np.round(np.random.randn(4, 6), 1)
# Scale every other column (positions 0, 2, 4, i.e. the 'HR' columns) by 10 ...
data[:, ::2] *= 10
# ... then shift every value by 37.
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,a,38.0,38.1,50.0,37.2,23.0,36.7
2013,b,27.0,36.0,39.0,37.3,33.0,36.0
2014,a,42.0,35.3,29.0,36.1,44.0,38.0
2014,b,43.0,37.1,27.0,34.5,31.0,37.1


In [41]:
# A (subject, type) pair selects a single column of the column MultiIndex.
print(health_data['Guido', 'HR'])
print()
# Positional slice: first two rows, first three columns.
print(health_data.iloc[:2, :3])
print()
# IndexSlice allows slice syntax inside each level of a MultiIndex.
idx = pd.IndexSlice
# Rows: every year, visit 'b'. Columns: every subject, type 'HR'.
health_data.loc[idx[:, 'b'], idx[:, 'HR']]


year  visit
2013  a        50.0
      b        39.0
2014  a        29.0
      b        27.0
Name: (Guido, HR), dtype: float64

subject      Bob       Guido
type          HR  Temp    HR
year visit                  
2013 a      38.0  38.1  50.0
     b      27.0  36.0  39.0



Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,b,27.0,39.0,33.0
2014,b,43.0,27.0,31.0


> <span style="color:Crimson">pd.IndexSlice</span> is a convenient way to slice a multiply indexed DataFrame

### Data Aggregations on Multi-Indices

In [47]:
def make_df(cols, ind):
    """Build a small labelled DataFrame for demonstrations.

    Every cell holds the column label followed by the row label, e.g.
    ``'A1'`` for column ``'A'`` and row ``1``.

    Parameters
    ----------
    cols : iterable
        Column labels. Iterating a string yields one column per character.
    ind : iterable
        Row labels; also used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``cols`` and one row per entry of ``ind``.
    """
    columns = {}
    for col in cols:
        # One string per row: str(column label) + str(row label).
        columns[col] = [''.join((str(col), str(row))) for row in ind]
    return pd.DataFrame(columns, index=ind)

In [42]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,a,38.0,38.1,50.0,37.2,23.0,36.7
2013,b,27.0,36.0,39.0,37.3,33.0,36.0
2014,a,42.0,35.3,29.0,36.1,44.0,38.0
2014,b,43.0,37.1,27.0,34.5,31.0,37.1


In [43]:
# Average the visits within each year. groupby(level=...) replaces the
# `level=` argument of DataFrame.mean, which is deprecated and no longer
# accepted by newer pandas releases; the result is the same.
data_mean = health_data.groupby(level='year').mean()
print(data_mean)
print()
# Average across subjects for each measurement type: transpose so the column
# levels become row levels, group on 'type', then transpose back.
print(data_mean.T.groupby(level='type').mean().T)
print()
# Plain row-wise mean over all columns.
print(data_mean.mean(axis=1))

subject   Bob        Guido          Sue       
type       HR   Temp    HR   Temp    HR   Temp
year                                          
2013     32.5  37.05  44.5  37.25  28.0  36.35
2014     42.5  36.20  28.0  35.30  37.5  37.55

type    HR       Temp
year                 
2013  35.0  36.883333
2014  36.0  36.350000

year
2013    35.941667
2014    36.175000
dtype: float64


## Combining Datasets: Concat and Append

In [44]:
# Two Series with non-overlapping index labels.
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
# concat stacks them end to end, keeping each original index label.
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [57]:
# Same columns, different rows: concat stacks the frames vertically.
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
for frame in (df1, df2, pd.concat([df1, df2])):
    print(frame)

# Same rows, different columns: axis="columns" places the frames side by side.
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
for frame in (df3, df4, pd.concat([df3, df4], axis="columns")):
    print(frame)

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
