### Selection in DataFrame


In [1]:
import pandas as pd
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'Illinois': 12882135, 'New York': 19651127, 'Florida': 19552860})
data = pd.DataFrame({'area': area, 'pop': pop})
data  # aligned and sorted


Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [2]:
# loc, iloc again
data.loc[:'Florida', :'pop']


Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860


In [3]:
# implicit iloc
data.iloc[:3, :2]


Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [4]:
# selection using > < etc. (row); fancy indexing (column)
data['density'] = data['pop'] / data['area']
data.loc[data.density > 100, ['pop', 'density']]


Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [5]:
# Note for []:
# indexing refers to columns, slicing refers to rows
print(data['Florida':'Illinois'], '\n')
print(data['area'])


            area       pop     density
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763 

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64


In [6]:
# refer to rows by implicit number rather than by index
data[1:3]


Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [7]:
# refer to rows for > < etc.
data[data.density > 100]


Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


In [8]:
copy_df = data.drop('Florida')
copy_df


Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [9]:
del copy_df['pop']  # error if does not exist
copy_df


Unnamed: 0,area,density
California,423967,90.413926
Illinois,149995,85.883763
New York,141297,139.076746
Texas,695662,38.01874


In [10]:
# assign None does not remove
copy_df['density'] = None
copy_df


Unnamed: 0,area,density
California,423967,
Illinois,149995,
New York,141297,
Texas,695662,


### Index alignment


### Series

The resulting array contains the union of indices of the two input arrays; <br>
Any item for which one or the other does not have an entry is marked with NaN, or "Not a Number".


In [11]:
import pandas as pd
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
population / area


Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [12]:
area.index.union(population.index)


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [13]:
area.index.intersection(population.index)


Index(['Texas', 'California'], dtype='object')

In [14]:
# another example:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

If using NaN values is not the desired behavior, we can modify the fill value using appropriate object methods in place of the operators. For example, calling A.add(B) is equivalent to calling A + B, but allows optional explicit specification of the fill value for any elements in A or B that might be missing:


In [15]:
A.add(B, fill_value=0)


0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### DataFrame


In [16]:
import pandas as pd
import numpy as np
A = pd.DataFrame(np.random.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A


Unnamed: 0,A,B
0,0,1
1,11,10


In [17]:
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B


Unnamed: 0,B,A,C
0,3,0,3
1,2,5,4
2,9,0,4


In [18]:
# indices are aligned correctly irrespective of their order
A + B


Unnamed: 0,A,B,C
0,0.0,4.0,
1,16.0,12.0,
2,,,


In [19]:
Amean = A.stack().mean()  # stack by rows
Amean


5.5

In [20]:
A.stack()


0  A     0
   B     1
1  A    11
   B    10
dtype: int64

In [21]:
A.add(B, fill_value=Amean)


Unnamed: 0,A,B,C
0,0.0,4.0,8.5
1,16.0,12.0,9.5
2,5.5,14.5,9.5


**Pandas methods** <br>
`+` add() <br>
`-` sub(), subtract() <br>
`*` mul(), multiply() <br>
`/` truediv(), div(), divide() <br>
`//` floordiv() <br>
`%` mod() <br>
`**` pow()


**Operations between Series and DataFrames**


In [22]:
A = np.random.randint(10, size=(3, 4))
print(A, '\n')
A - A[0]


[[8 2 0 0]
 [1 2 6 9]
 [3 0 2 9]] 



array([[ 0,  0,  0,  0],
       [-7,  0,  6,  9],
       [-5, -2,  2,  9]])

In [23]:
# by default: operate on rows
df = pd.DataFrame(A, columns=list('QRST'))
df


Unnamed: 0,Q,R,S,T
0,8,2,0,0
1,1,2,6,9
2,3,0,2,9


In [24]:
df - df.iloc[0]


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-7,0,6,9
2,-5,-2,2,9


In [25]:
# specify axis for column-wise operations
df.subtract(df['R'], axis=0)


Unnamed: 0,Q,R,S,T
0,6,0,-2,-2
1,-1,0,4,7
2,3,0,2,9


In [26]:
df


Unnamed: 0,Q,R,S,T
0,8,2,0,0
1,1,2,6,9
2,3,0,2,9


In [27]:
# automatically align indices
print(df.index)
halfrow = df.iloc[0, ::2]
halfrow


RangeIndex(start=0, stop=3, step=1)


Q    8
S    0
Name: 0, dtype: int64

In [28]:
df - halfrow


Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-7.0,,6.0,
2,-5.0,,2.0,


### Pandas: Missing data


NaN , not a number, is a numeric data type used to represent any value that is undefined or unpresentable. <br>
It is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation. <br>
NaN is also assigned to variables, in a computation, that do not have values and have yet to be computed.


In [29]:
numbers = np.array([1, 2, None])
print(pd.Series(numbers))


0       1
1       2
2    None
dtype: object


In [30]:
# none returns error
numbers.sum()


TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [31]:
numbers = np.array([1, 2, np.nan])
print(pd.Series(numbers))


0    1.0
1    2.0
2    NaN
dtype: float64


In [32]:
# nan doesn't return errors, but not useful
numbers.sum()


nan

In [33]:
1 + np.nan


nan

In [34]:
0 * np.nan


nan

In [35]:
# use special aggregations
np.nansum(numbers), np.nanmin(numbers), np.nanmax(numbers)


(3.0, 1.0, 2.0)

One has to be mindful that in Python (and NumPy), the nan's don’t compare equal, but None's do.


In [36]:
import numpy as np
np.nan == None


False

In [37]:
np.nan == np.nan


False

In [38]:
np.isnan(np.nan)


True

### NaN in Pandas

For types that don’t have an available sentinel value, Pandas automatically type-casts when NA values are present. <br>
If we set a value in an integer array to np.nan, it will automatically be upcast to a floating-point type to accommodate the NA. <br>


In [39]:
pd.Series([1, np.nan, 2, None])


0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [40]:
x = pd.Series(range(2), dtype=int)
print(x)


0    0
1    1
dtype: int64


In [41]:
x[0] = None
x


0    NaN
1    1.0
dtype: float64

**Pandas handing NAs by type** <br>
floating **No change** np.nan <br>
object **No change** None or np.nan <br>
integer **Cast to float64** np.nan <br>
boolean **Cast to object** None or np.nan <br>


### Operation on Null Values


**isnull()** <br>
Generate a Boolean mask indicating missing values<br>
**notnull()**<br>
Opposite of isnull()<br>
**dropna()**<br>
Return a filtered version of the data<br>
**fillna()**<br>
Return a copy of the data with missing values filled or imputed<br>


In [42]:
data = pd.Series([1, np.nan, 'hello', None])
data


0        1
1      NaN
2    hello
3     None
dtype: object

In [43]:
data.isnull()


0    False
1     True
2    False
3     True
dtype: bool

In [44]:
data.notnull()


0     True
1    False
2     True
3    False
dtype: bool

In [45]:
data[data.notnull()]


0        1
2    hello
dtype: object

In [46]:
data.dropna()


0        1
2    hello
dtype: object

In [47]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
df


Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [48]:
# dropna() will drop all rows in which
# any null value is present:
df.dropna()


Unnamed: 0,0,1,2
1,2.0,3.0,5


In [49]:
df.dropna(axis='columns')  # or axis = 1


Unnamed: 0,2
0,2
1,5
2,6


In [50]:
df.dropna(axis=1)


Unnamed: 0,2
0,2
1,5
2,6


In [51]:
# default is any NA present
df[3] = np.nan
df


Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [52]:
# change to how = 'all'
df.dropna(axis='columns', how='all')


Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [53]:
# thresh: require at least n non-null values
df.dropna(axis='rows', thresh=3)
# the number of non-null values fewer than 3 will be removed


Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


**Fill null values**


In [54]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data


a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [55]:
data.fillna(0)


a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [56]:
# forward-fill
data.fillna(method='ffill')


a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [57]:
# back-fill
data.fillna(method='bfill')


a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [58]:
df


Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [59]:
df.fillna(method='bfill', axis=1)


Unnamed: 0,0,1,2,3
0,1.0,2.0,2.0,
1,2.0,3.0,5.0,
2,4.0,4.0,6.0,


In [60]:
df.fillna(method='bfill', axis=0)


Unnamed: 0,0,1,2,3
0,1.0,3.0,2,
1,2.0,3.0,5,
2,,4.0,6,


### MultiIndex


In [61]:
index = [('California', '2000'), ('California', '2010'),
         ('New York', '2000'), ('New York', '2010'),
         ('Texas', '2000'), ('Texas', '2010')]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
# NOT recommended
pop = pd.Series(populations, index=index)
pop


(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [62]:
pop[[i for i in pop.index if i[1] == '2010']]


(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [63]:
# Pandas MultiIndex
index = pd.MultiIndex.from_tuples(index)
index


MultiIndex([('California', '2000'),
            ('California', '2010'),
            (  'New York', '2000'),
            (  'New York', '2010'),
            (     'Texas', '2000'),
            (     'Texas', '2010')],
           )

In [64]:
pop2 = pd.Series(populations, index=index)
pop2


California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [65]:
# reindex also works
pop = pop.reindex(index)
pop


California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [66]:
# selection based on index, i.e. 2010
pop[:, '2010']


California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [67]:
pop['California']


2000    33871648
2010    37253956
dtype: int64

In [68]:
# extra dimension to a conventionally indexed DataFrame
pop_df = pop.unstack()
pop_df


Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [69]:
pop_df.stack()  # by rows


California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

#### MultiIndex Creation


In [70]:
# pass a list of two or more index arrays
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df


Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.571514,0.808336
a,2,0.958098,0.694308
b,1,0.690151,0.208586
b,2,0.644219,0.224597


In [71]:
# pass a dictionary with appropriate tuples as keys
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)


California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

#### Construct MultiIndex


In [72]:
# array
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])


MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [73]:
# tuple
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])


MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [74]:
# product
pd.MultiIndex.from_product([['a', 'b'], [1, 2, 3]])


MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3)],
           )

In [75]:
# MultiIndex with names: keep track of the meaning of index values
pop.index.names = ['state', 'year']
pop


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

#### MultiIndex for Columns


In [76]:
# hierarchical indices
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
index


MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])

In [77]:
# hierarchical columns
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'],
                                      ['HR', 'Temp']],
                                     names=['subject', 'type'])
columns


MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [78]:
# mock some data
data = np.round(np.random.randn(4, 6), 1)  # standard normal; round to 0.1 (Important!)
data


array([[ 0.6,  0.3,  1.2, -1.1, -1. , -1.1],
       [ 1.1, -0.2,  0.5, -0.7,  3.1,  0. ],
       [ 0.5, -0.5,  0. , -0.3, -0.2, -1.7],
       [-1. , -0.6,  0.1,  3.2, -0.5,  0.2]])

In [79]:
data[:, ::2] *= 10
data += 37
data


array([[43. , 37.3, 49. , 35.9, 27. , 35.9],
       [48. , 36.8, 42. , 36.3, 68. , 37. ],
       [42. , 36.5, 37. , 36.7, 35. , 35.3],
       [27. , 36.4, 38. , 40.2, 32. , 37.2]])

In [80]:
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data


Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,43.0,37.3,49.0,35.9,27.0,35.9
2013,2,48.0,36.8,42.0,36.3,68.0,37.0
2014,1,42.0,36.5,37.0,36.7,35.0,35.3
2014,2,27.0,36.4,38.0,40.2,32.0,37.2


#### Indexing and Slicing


For Series:


In [81]:
pop


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [82]:
pop['California', '2000']


33871648

In [83]:
pop['California']


year
2000    33871648
2010    37253956
dtype: int64

In [84]:
# perform partial indexing on lower levels by passing an
# empty slice in the first index
pop[:, '2000']


state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [85]:
# selection based on Boolean
pop[pop > 22000000]


state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [86]:
# fancy indexing
pop[['California', 'Texas']]


state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

#### For DataFrame:


In [87]:
health_data


Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,43.0,37.3,49.0,35.9,27.0,35.9
2013,2,48.0,36.8,42.0,36.3,68.0,37.0
2014,1,42.0,36.5,37.0,36.7,35.0,35.3
2014,2,27.0,36.4,38.0,40.2,32.0,37.2


In [88]:
health_data['Guido', 'HR']


year  visit
2013  1        49.0
      2        42.0
2014  1        37.0
      2        38.0
Name: (Guido, HR), dtype: float64

In [89]:
#loc, iloc
health_data.iloc[:2, 2:4]


Unnamed: 0_level_0,subject,Guido,Guido
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,49.0,35.9
2013,2,42.0,36.3


In [90]:
health_data.loc[:, ('Bob', 'Temp')]


year  visit
2013  1        37.3
      2        36.8
2014  1        36.5
      2        36.4
Name: (Bob, Temp), dtype: float64

In [91]:
# building the desired slice explicitly using an IndexSlice
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]


Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,43.0,49.0,27.0
2014,1,42.0,37.0,35.0


#### Rearranging Multi_indices


In [92]:
# data example
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]],
                                   names=['char', 'int'])
data = pd.Series(np.random.rand(6), index=index)
# alternative way to specify index names
#data.index.names = ['char', 'int']
data


char  int
a     1      0.486286
      2      0.370967
c     1      0.944255
      2      0.098909
b     1      0.367670
      2      0.941575
dtype: float64

In [93]:
# does not work for unsorted index
data['a':'b']


UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [94]:
# sorting
data = data.sort_index()
data


char  int
a     1      0.486286
      2      0.370967
b     1      0.367670
      2      0.941575
c     1      0.944255
      2      0.098909
dtype: float64

In [95]:
# now working
data['a':'b']


char  int
a     1      0.486286
      2      0.370967
b     1      0.367670
      2      0.941575
dtype: float64

#### stacking and unstacking


In [96]:
pop


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [97]:
pop.unstack(level=0)  # specify the level


state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [98]:
pop.unstack(level=1)


year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [99]:
pop.unstack(level=1).stack()


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [100]:
pop.unstack().stack()


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [101]:
pop.unstack(level=0).stack()


year  state     
2000  California    33871648
      New York      18976457
      Texas         20851820
2010  California    37253956
      New York      19378102
      Texas         25145561
dtype: int64

#### Index setting and resetting


In [102]:
pop


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

reset_index: turn the index labels into columns;<br>
result in a DataFrame with columns holding the information that was formerly in the index.


In [103]:
# optionally specify the name of the data for the column representation
pop_flat = pop.reset_index(name='population')
pop_flat


Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [104]:
# on the other hand
# build a MultiIndex from the column values
pop_flat.set_index(['state', 'year'])


Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


#### Data Aggregation


In [105]:
health_data


Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,43.0,37.3,49.0,35.9,27.0,35.9
2013,2,48.0,36.8,42.0,36.3,68.0,37.0
2014,1,42.0,36.5,37.0,36.7,35.0,35.3
2014,2,27.0,36.4,38.0,40.2,32.0,37.2


In [106]:
# name the index level
health_data.mean(level='year')


  health_data.mean(level='year')


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,45.5,37.05,45.5,36.1,47.5,36.45
2014,34.5,36.45,37.5,38.45,33.5,36.25


In [107]:
# axis=1: columns
health_data.mean(axis=1, level='type')


  health_data.mean(axis=1, level='type')


Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,39.666667,36.366667
2013,2,52.666667,36.7
2014,1,38.0,36.166667
2014,2,32.333333,37.933333


In [108]:
health_data.mean(level='year')


  health_data.mean(level='year')


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,45.5,37.05,45.5,36.1,47.5,36.45
2014,34.5,36.45,37.5,38.45,33.5,36.25


In [109]:
# first year then type
health_data.mean(level='year').mean(axis=1, level='type')


  health_data.mean(level='year').mean(axis=1, level='type')
  health_data.mean(level='year').mean(axis=1, level='type')


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,46.166667,36.533333
2014,35.166667,37.05


In [110]:
# first type then year: same
health_data.mean(axis=1, level='type').mean(level='year')


  health_data.mean(axis=1, level='type').mean(level='year')
  health_data.mean(axis=1, level='type').mean(level='year')


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,46.166667,36.533333
2014,35.166667,37.05
