In [None]:
import numpy as np
np.__version__

In [168]:
import pandas as pd
pd.__version__

'1.0.2'

## Pandas Series
### Constructing Series objects
> pd.Series(data, index=index)

In [None]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)
print(data.values)
print(data.index)

In [None]:
data = pd.Series(1, index =['a', 'b', 'c', 'd'])
print(data)
data = pd.Series([0.25, 0.5, 0.75, 1.0], index =['a', 'b', 'c', 'd'])
print(data)
print(pd.Series(data, index = ['c','a']))


> When creating pd Series, index array should be the <span style="color:Crimson">same lengths as data array </span>.
>
> *data = pd.Series([0.25, 0.5, 0.75, 1.0], index =['a', 'b'])*  would reture ERROR 
>
> However, we could set index array to indexing a created serise
> 

#### data can be a dictionary, in which index defaults to the sorted dictionary keys

In [None]:
print(pd.Series({2:'a', 1:'b', 3:'c'}))
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
print(area)
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
print(population)

## Pandas DataFrame
### 
#### DataFrame as a generalized NumPy array

In [None]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
print(area)
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
print(population)
states = pd.DataFrame({'population': population, 'area': area})
states

#### DataFrame as specialized dictionary

In [None]:
print(states['area'])
print(states['population'])

### Constructing DataFrame objects
#### From a single Series object.

In [None]:
pd.DataFrame(population, columns=['population'])

#### From a list of dicts

In [None]:
data = [{'a': i, 'b': 2 * i}
for i in range(3)]
pd.DataFrame(data)

#### From a two-dimensional NumPy array

In [None]:
pd.DataFrame(np.random.rand(3, 2),
columns=['foo', 'bar'],
index=['a', 'b', 'c'])

#### From a NumPy structured array

In [None]:
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')],)
print(A)
Data = pd.DataFrame(A)
print(Data)
pd.DataFrame(A,index=['a', 'b', 'c'])

### The Pandas Index Object

#### Data Selection in Series

In [None]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data)
print(data['b'])

#### Data Selection in DataFrame

In [None]:
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

In [None]:
data['density'] = data['pop'] / data['area']
data

In [None]:
print(data.values)

In [None]:
print(data.values[0])
print(data.loc["California":"Florida"])
print(data.iloc[0:4])
data.iloc[:3, :2]

In [None]:
data.loc[:'Illinois', :'pop']

In [None]:
#Indexers: Series.ix and Series.ix and DataFrame.ix  are deprecated and removed in latest pandas 
#data.ix[:3, :'pop'] 

In [None]:
data.loc[data.density > 100, ['area', 'pop', 'density']]

In [None]:
data.loc[data.density > 100, ['area', 'pop', 'density']]

In [None]:
data.loc[data.density > 100, data.loc['New York'] > 140]

In [None]:
data

In [None]:
data['Florida':'Illinois']

In [None]:
data[1:3]

> take care, <span style="color:Crimson">only one row slicing</span> 
>, like data[1], is not correct 

## Operating on Data in Pandas
### UFuncs: Index Alignment

In [None]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A + B)
print(A.add(B, fill_value=0))

In [None]:
rng = np.random.RandomState(42)
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=['A','B'])
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
print(A)
print(B)
print(A+B)
fill = A.stack().mean()
print(fill)
A.add(B, fill_value=fill)

### Ufuncs: Operations Between DataFrame and Series

In [None]:
rng = np.random.RandomState(42)
A = rng.randint(10, size=(3, 4))
print(A)
A-A[0]

In [None]:
df = pd.DataFrame(A, index=[list('ABC')], columns=list('QRST'))
print(df)
print(df - df.iloc[0])
print(df - df.iloc[0:2])
print(df.iloc[0])
print(df.loc['A'])
print(df - df.loc['A'])

> * df.iloc[n] iPhython native one row slicing support broadcasting rules
>
> * df.iloc[ ] multiple rows slicing and df.loc[ ] <span style="color:Crimson">DO NOT </span> support broadcast, they operate according to  missing data rules

In [None]:
print(df.subtract(df['R'], axis=0))
print(df - df.iloc[:,0:2])

In [None]:
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()

In [None]:
vals1 = np.array([1, None, 3, 4])
print(vals1)
vals2 = np.array([1, np.nan, 3, 4])
print(vals2)
print(vals2.dtype)
print(vals2.sum(), vals2.min(), vals2.max())
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2))

In [None]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3,      5],
                   [np.nan, 4,6]])
df

In [None]:
print(df.dropna())
print()
print(df.dropna(axis="columns"))
df[3] = np.nan
print()
print(df)
print()
print(df.dropna(axis='columns', how='all'))
print()
print(df.dropna(axis='columns', thresh=3))
print()
print(df.fillna(0))
print()
print(df.fillna(method='ffill', axis=0))

## Hierarchical Indexing
### Methods of MultiIndex Creation

In [None]:
df = pd.DataFrame(np.random.rand(4, 2), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
                  columns=['data1', 'data2'])
print(df.columns)
df

In [None]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
Data = pd.Series(data)
Data

#### MultiIndex level names

In [None]:
Data.index.names = ['state', 'year']
Data

#### Explicit MultiIndex constructors

In [175]:
print(pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
                                names=['Alpha', 'Num']))
print()
print(pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)],
                               names=['Alpha', 'Num']))
print()
print(pd.MultiIndex.from_product([['a', 'b'], [1, 2]]))
print()
print(pd.MultiIndex(levels=[['a', 'b'], [1, 2]], 
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['Alpha', 'Num'])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['Alpha', 'Num'])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


#### Pandas MultiIndex

In [188]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,18976457, 19378102,20851820, 25145561]
index = pd.MultiIndex.from_tuples(index)
pop = pd.Series(populations, index=index)
print(pop)
pop_df = pop.unstack()
print()
print(pop_df)
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,4687374, 4318033,5906301, 6879014]})
print()
print(pop_df)
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561

                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014


Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Indexing and Slicing a MultiIndex
#### Multiply indexed DataFrames slicing

In [197]:
index = pd.MultiIndex.from_product([[2013, 2014], ['a', 'b']],names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,a,43.0,37.2,26.0,39.2,27.0,36.5
2013,b,28.0,36.8,57.0,36.0,44.0,38.0
2014,a,33.0,38.2,35.0,36.4,24.0,38.8
2014,b,44.0,35.4,40.0,37.1,46.0,36.5


In [199]:
print(health_data['Guido', 'HR'])
print()
print(health_data.iloc[:2, :3])
print()
idx = pd.IndexSlice
health_data.loc[idx[:, 'b'], idx[:, 'HR']]


year  visit
2013  a        26.0
      b        57.0
2014  a        35.0
      b        40.0
Name: (Guido, HR), dtype: float64

subject      Bob       Guido
type          HR  Temp    HR
year visit                  
2013 a      43.0  37.2  26.0
     b      28.0  36.8  57.0



Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,b,28.0,57.0,44.0
2014,b,44.0,40.0,46.0


> <span style="color:Crimson">pd.IndexSlice</span> is a good way for multiple indexed DataFrame slicing

### Data Aggregations on Multi-Indices

In [200]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,a,43.0,37.2,26.0,39.2,27.0,36.5
2013,b,28.0,36.8,57.0,36.0,44.0,38.0
2014,a,33.0,38.2,35.0,36.4,24.0,38.8
2014,b,44.0,35.4,40.0,37.1,46.0,36.5


In [202]:
data_mean = health_data.mean(level='year')
print(data_mean)
print()
print(data_mean.mean(axis=1, level='type'))
print()
print(data_mean.mean(axis=1))

subject   Bob       Guido          Sue       
type       HR  Temp    HR   Temp    HR   Temp
year                                         
2013     35.5  37.0  41.5  37.60  35.5  37.25
2014     38.5  36.8  37.5  36.75  35.0  37.65

type    HR       Temp
year                 
2013  37.5  37.283333
2014  37.0  37.066667

year
2013    37.391667
2014    37.033333
dtype: float64


## Combining Datasets: Concat and Append