In [1]:
import pandas as pd
import numpy as np

# Pivot Tables

In [2]:
#We have seen how the GroupBy abstraction lets us explore relationships within a
#dataset. A pivot table is a similar operation that is commonly seen in spreadsheets and
#other programs that operate on tabular data. The pivot table takes simple column-wise
#data as input, and groups the entries into a two-dimensional table that provides
#a multidimensional summarization of the data. The difference between pivot tables
#and GroupBy can sometimes cause confusion; it helps me to think of pivot tables as
#essentially a multidimensional version of GroupBy aggregation. That is, you
#split-apply-combine, but both the split and the combine happen across not a
#one-dimensional index, but across a two-dimensional grid.

In [4]:
import seaborn as sns

In [5]:
# Load seaborn's bundled "titanic" example dataset into a DataFrame.
titanic = sns.load_dataset('titanic')

In [6]:
# Preview the first five rows to see which columns are available.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Pivot Tables by Hand

In [7]:
#To start learning more about this data, we might begin by grouping it according to
#gender, survival status, or some combination thereof. If you have read the previous
#section, you might be tempted to apply a GroupBy operation—for example, let’s look
#at survival rate by gender:

In [9]:
# Survival rate per sex: `survived` is a 0/1 flag, so its group mean is the
# fraction of passengers in that group who survived.
(
    titanic
    .groupby('sex')['survived']
    .mean()
)

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [10]:
#This immediately gives us some insight: overall, three of every four females on board
#survived, while only one in five males survived!
#This is useful, but we might like to go one step deeper and look at survival by both sex
#and, say, class. Using the vocabulary of GroupBy, we might proceed using something
#like this: we group by class and gender, select survival, apply a mean aggregate,
#combine the resulting groups, and then unstack the hierarchical index to reveal the hidden
#multidimensionality. In code:

In [12]:
# Pivot table "by hand": split on (sex, class), average the survival flag,
# then move the inner index level (class) into the columns.
(
    titanic
    .groupby(['sex', 'class'])['survived']
    .agg('mean')
    .unstack()
)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


# Pivot Table Syntax

In [13]:
# Same sex-by-class survival table as the groupby/unstack version, expressed
# directly as a pivot table (the default aggregation is the mean).
titanic.pivot_table(values='survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [14]:
#This is eminently more readable than the GroupBy approach, and produces the same result.

# Multilevel pivot tables

In [15]:
#Just as in the GroupBy, the grouping in pivot tables can be specified with multiple
#levels, and via a number of options. For example, we might be interested in looking at
#age as a third dimension. We’ll bin the age using the pd.cut function:

In [17]:
# Bin passenger ages into two intervals, (0, 18] and (18, 80];
# rows with a missing age stay NaN.
age = pd.cut(titanic['age'], bins=[0, 18, 80])

In [18]:
# Display the binned ages: a categorical Series of intervals aligned with titanic's rows.
age

0      (18.0, 80.0]
1      (18.0, 80.0]
2      (18.0, 80.0]
3      (18.0, 80.0]
4      (18.0, 80.0]
           ...     
886    (18.0, 80.0]
887    (18.0, 80.0]
888             NaN
889    (18.0, 80.0]
890    (18.0, 80.0]
Name: age, Length: 891, dtype: category
Categories (2, interval[int64]): [(0, 18] < (18, 80]]

In [20]:
# Two-level row index (sex, then age bin) against passenger class in the columns.
# `age` is the binned Series, not a column name; it is used directly as a grouper.
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns='class',
)

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [21]:
#We can apply this same strategy when working with the columns as well; let’s add info
#on the fare paid using pd.qcut to automatically compute quantiles:

In [23]:
# Split fares into two quantile-based bins (below / above the median fare).
fare = pd.qcut(titanic['fare'], q=2)

In [24]:
# Display the binned fares: a categorical Series of intervals aligned with titanic's rows.
fare

0       (-0.001, 14.454]
1      (14.454, 512.329]
2       (-0.001, 14.454]
3      (14.454, 512.329]
4       (-0.001, 14.454]
             ...        
886     (-0.001, 14.454]
887    (14.454, 512.329]
888    (14.454, 512.329]
889    (14.454, 512.329]
890     (-0.001, 14.454]
Name: fare, Length: 891, dtype: category
Categories (2, interval[float64]): [(-0.001, 14.454] < (14.454, 512.329]]

In [25]:
# Four-dimensional summary: rows are (sex, age bin), columns are (fare bin, class).
# Cells with no matching passengers come out as NaN.
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns=[fare, 'class'],
)

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


# Additional pivot table options

In [26]:
#The full call signature of the pivot_table method of DataFrames is as follows:
# call signature as of Pandas 0.18
#DataFrame.pivot_table(data, values=None, index=None, columns=None,
#aggfunc='mean', fill_value=None, margins=False,
#dropna=True, margins_name='All')
#(When called as a method, data is the DataFrame itself.) We’ve already seen examples
#of the values, index, and columns arguments; here we’ll take a quick look
#at the remaining ones. Two of the options, fill_value and dropna, have to do with
#missing data and are fairly straightforward; we will not show examples of them here.
#The aggfunc keyword controls what type of aggregation is applied, which is a mean
#by default. As in the GroupBy, the aggregation specification can be a string
#representing one of several common choices ('sum', 'mean', 'count', 'min', 'max', etc.) or a
#function that implements an aggregation (np.sum(), min(), sum(), etc.). Additionally,
#it can be specified as a dictionary mapping a column to any of the above desired
#options:

In [27]:
# Per-column aggregation: count survivors (sum of the 0/1 flag) and average the
# fare for every sex/class combination. `values` is inferred from the dict keys.
# The aggregations are named by string rather than passing the builtin `sum`:
# recent pandas emits a FutureWarning for builtin callables and recommends the
# string form, which also matches the 'mean' spelling used alongside it.
titanic.pivot_table(
    index='sex',
    columns='class',
    aggfunc={'survived': 'sum', 'fare': 'mean'},
)

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


In [28]:
#Notice also here that we’ve omitted the values keyword; when you’re specifying a
#mapping for aggfunc, this is determined automatically.
#At times it’s useful to compute totals along each grouping. This can be done via the
#margins keyword:

In [29]:
# margins=True appends an 'All' row and column holding the aggregate over each
# whole grouping (here, the overall survival rates).
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    margins=True,
)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


# Thank You