# 8.1 Hierarchical Indexing

Hierarchical indexing is an important feature of pandas that enables you to have multiple (two or more) index levels on an axis. It provides a way for you to work with higher-dimensional data in a lower-dimensional form.

In [2]:
import numpy as np
import pandas as pd

# Nine random values indexed by two parallel label lists: the first list is the
# outer level ('a'..'d'), the second the inner level (1..3). Passing a list of
# arrays as `index` produces a MultiIndex.
# Note: the draw is unseeded, so the numbers change on every run.
data = pd.Series(np.random.randn(9),
                 index = [['a','a','a','b','b','c','c','d','d'],
                          [1,2,3,1,3,1,2,2,3]])
data

a  1   -0.214941
   2    2.147522
   3    0.564280
b  1    1.059833
   3   -1.104780
c  1    0.210634
   2    1.423999
d  2   -1.256163
   3   -1.129026
dtype: float64

In [3]:
data.shape

(9,)

In [4]:
data.ndim

1

In [5]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [6]:
data['b']

1    1.059833
3   -1.104780
dtype: float64

In [7]:
data['b':'c']

b  1    1.059833
   3   -1.104780
c  1    0.210634
   2    1.423999
dtype: float64

In [8]:
data.loc[['b','d']]

b  1    1.059833
   3   -1.104780
d  2   -1.256163
   3   -1.129026
dtype: float64

In [9]:
data.loc[:,2] # inner level selection a2, c2, d2

a    2.147522
c    1.423999
d   -1.256163
dtype: float64

Hierarchical indexing plays an important role in reshaping data and group-based operations like forming a pivot table

In [10]:
data.unstack()

Unnamed: 0,1,2,3
a,-0.214941,2.147522,0.56428
b,1.059833,,-1.10478
c,0.210634,1.423999,
d,,-1.256163,-1.129026


In [11]:
data.unstack().stack()

a  1   -0.214941
   2    2.147522
   3    0.564280
b  1    1.059833
   3   -1.104780
c  1    0.210634
   2    1.423999
d  2   -1.256163
   3   -1.129026
dtype: float64

With a DataFrame, either axis can have a hierarchical index

In [12]:
# 4x3 frame of the integers 0..11 with two levels on BOTH axes:
# rows are (letter, number) pairs, columns are (state, color) pairs.
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index = [['a', 'a', 'b', 'b'],
                              [1, 2, 1, 2]],
                     columns = [['Ohio', 'Ohio', 'Colorado'],
                                ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


The hierarchical levels can have names (as strings or any Python objects).

In [13]:
frame.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [14]:
frame.columns

MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           )

In [15]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [17]:
frame.loc[:, 'Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [18]:
frame.loc['b', :]

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,7,8
2,9,10,11


In [19]:
# NOTE(review): a list passed to .loc on MultiIndex columns is read as a list of
# top-level ('state') labels. 'Red' is a 'color' label, not a state, so the
# output below contains just the two 'Ohio' columns. To pick the single
# Ohio/Red column, the tuple ('Ohio', 'Red') is presumably what was meant;
# newer pandas may raise KeyError for the unmatched label — confirm.
frame.loc[:, ['Ohio', 'Red']]

Unnamed: 0_level_0,state,Ohio,Ohio
Unnamed: 0_level_1,color,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [20]:
# NOTE(review): ['b', 1] is a LIST of top-level ('key1') labels, not the pair
# (key1='b', key2=1). 1 is not a key1 label, so the output below shows every
# 'b' row. For the single row b/1, the tuple ('b', 1) is presumably what was
# meant; newer pandas may raise KeyError for the unmatched label — confirm.
frame.loc[['b', 1], :]

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
b,1,6,7,8
b,2,9,10,11


In [21]:
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
                          names = ['state', 'color'])

MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])

## 8.1.1 Reordering and Sorting Levels

In [23]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [26]:
frame.index.levels

FrozenList([['a', 'b'], [1, 2]])

In [32]:
frame.index.names

FrozenList(['key1', 'key2'])

In [28]:
frame.columns.levels

FrozenList([['Colorado', 'Ohio'], ['Green', 'Red']])

In [33]:
frame.columns.names

FrozenList(['state', 'color'])

In [34]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [38]:
frame.swaplevel(0, 1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [35]:
frame.sort_index(level = 0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [36]:
frame.sort_index(level = 1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [41]:
frame.swaplevel(0, 1).sort_index(level = 0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [42]:
frame.swaplevel('key1', 'key2').sort_index(level = 0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## 8.1.2 Summary Statistics by Level

In [44]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [45]:
# Sum the rows that share the same 'key2' label.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported spelling and yields the same table.
frame.groupby(level = 'key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [46]:
# Same aggregation, addressing the inner row level by position (1) instead of by name.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0,
# so group on the index level explicitly.
frame.groupby(level = 1).sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [47]:
# Sum the rows that share the same 'key1' label.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported spelling and yields the same table.
frame.groupby(level = 'key1').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [48]:
# Same aggregation, addressing the outer row level by position (0) instead of by name.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0,
# so group on the index level explicitly.
frame.groupby(level = 0).sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [49]:
# Sum the columns that share the same 'color' label.
# `DataFrame.sum(level=..., axis=1)` was deprecated in pandas 1.3 and removed in
# 2.0. groupby aggregates along the rows, so transpose, group on the (former)
# column level, and transpose back to keep the original orientation.
frame.T.groupby(level = 'color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [50]:
# Same column-wise aggregation, addressing the inner column level by position (1).
# `DataFrame.sum(level=..., axis=1)` was deprecated in pandas 1.3 and removed in
# 2.0. After the transpose the column levels become row levels, so level 1 is
# still the inner (color) level; transpose back to restore the orientation.
frame.T.groupby(level = 1).sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


Under the hood, this utilizes pandas's groupby machinery

## 8.1.3 Indexing with a DataFrame's columns

In [51]:
# A flat 7-row frame whose 'c' and 'd' columns will be promoted to index levels.
# Note: this rebinds `frame`, replacing the MultiIndex frame used in the
# earlier cells; re-running those cells after this one gives different results.
frame = pd.DataFrame({'a':range(7), 
                      'b':range(7,0,-1),
                      'c':['one','one','one','two','two','two','two'],
                      'd':[0,1,2,0,1,2,3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


The set_index function creates a new DataFrame using one or more of its columns as the index.

In [55]:
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [56]:
frame.set_index(['c', 'd'], drop = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


reset_index, on the other hand, does the opposite of set_index: the hierarchical index levels are moved into the columns.

In [57]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [58]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


# 8.2 Combining and Merging Datasets

1. pandas.merge connects rows in DataFrames based on one or more keys. This will be familiar to users of SQL or other relational databases, as it implements database join operations
2. pandas.concat concatenates or stacks together objects along an axis
3. the combine_first instance method enables splicing together overlapping data to fill in missing values in one object with values from another

## 8.2.1 Database-Style DataFrame Joins

Merge or join operations combine datasets by linking rows using one or more keys. These operations are central to relational databases (e.g. SQL based)

In [59]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [60]:
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


many-to-one join

In [61]:
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [62]:
pd.merge(df1, df2, on = 'key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [63]:
pd.merge(df1, df2, on = 'key', how = 'inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [66]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [67]:
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


By default, merge does an inner join. The keys in the result are the intersection, or the common set found in both tables.

In [68]:
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [72]:
pd.merge(df1, df2, how = 'outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [73]:
pd.merge(df1, df2, how = 'inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [74]:
pd.merge(df1, df2) # default inner join

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [75]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [76]:
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [78]:
pd.merge(left = df1, right = df2, on = 'key', how = 'left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [79]:
pd.merge(df1, df2, how = 'inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


To merge with multiple keys, pass a list of column names

In [80]:
left = pd.DataFrame({'key1':['foo','foo','bar'],
                     'key2':['one','two','one'],
                     'lval':[1, 2, 3]})
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [81]:
right = pd.DataFrame({'key1':['foo','foo','bar','bar'],
                      'key2':['one','one','one','two'],
                      'rval':[4, 5, 6, 7]})
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [82]:
pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


When joining columns-on-columns, the indexes on the passed DataFrame objects are discarded

In [83]:
pd.merge(left, right, on = 'key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [84]:
pd.merge(left, right, on = 'key1', suffixes = ('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


## 8.2.2 Merging on Index

In some cases, the merge key(s) in a DataFrame will be found in its index. In this case, pass left_index = True or right_index = True or both to indicate that the index should be used as the merge key

In [85]:
left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],
                      'value':range(6)})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [86]:
right1 = pd.DataFrame({'group_val':[3.5, 7]}, index = ['a','b'])
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [87]:
pd.merge(left1, right1, left_on = 'key', right_index = True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [88]:
pd.merge(left1, right1, left_on = 'key', right_index = True, how = 'outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [89]:
lefth = pd.DataFrame({'key1':['Ohio','Ohio','Ohio', 'Nevada','Nevada'],
                      'key2':[2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0


In [90]:
righth = pd.DataFrame(np.arange(12).reshape((6,2)),
                      index = [['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
                               [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns = ['event1', 'event2'])
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [91]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4,5
0,Ohio,2000,0.0,6,7
1,Ohio,2001,1.0,8,9
2,Ohio,2002,2.0,10,11
3,Nevada,2001,3.0,0,1


In [92]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True, how = 'outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
4,Nevada,2000,,2.0,3.0


In [93]:
left2 = pd.DataFrame([[1.,2.],[3.,4.],[5.,6.]],
                     index = ['a','c','e'],
                     columns = ['Ohio','Nevada'])
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [94]:
right2 = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[13,14]],
                      index = ['b','c','d','e'],
                      columns = ['Missouri','Alabama'])
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [95]:
pd.merge(left2, right2, how = 'outer', left_index = True, right_index = True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [97]:
left2.merge(right2, how = 'outer', left_index = True, right_index = True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [99]:
# Index-on-index outer join via the DataFrame.join shorthand.
left2.join(right2, how = 'outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [100]:
left1.join(right1, on = 'key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [101]:
another = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17.]],
                       index = ['a','c','e','f'],
                       columns = ['New York', 'Oregon'])
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


For simple index-on-index merges, pass a list of DataFrames to join as an alternative to using the more general concat function

In [107]:
left2.join([right2, another]) # join accepts a list of DataFrames to combine on their indexes; merge does not

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [108]:
# Demonstrates that merge() rejects a list: the call raises TypeError, which is
# caught so the notebook keeps running. The text printed below is a hard-coded
# paraphrase, not the exception's own message.
try: 
    left2.merge([right2, another])
except TypeError:
    print('can only merge Series or DataFrame objects, a list class was passed')

can only merge Series or DataFrame objects, a list class was passed


In [109]:
left2.join([right2, another], how = 'outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


## 8.2.3 Concatenating Along an Axis

In [110]:
# 3x4 grid holding the integers 0..11; input for the np.concatenate examples.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [111]:
np.concatenate([arr,arr],axis = 1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [112]:
np.concatenate([arr, arr], axis = 0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [113]:
# Three small integer Series with non-overlapping labels, used as concat inputs.
s1 = pd.Series([0, 1], index = list('ab'))
s2 = pd.Series([2, 3, 4], index = list('cde'))
s3 = pd.Series([5, 6], index = list('fg'))

In [114]:
s1

a    0
b    1
dtype: int64

In [115]:
s2

c    2
d    3
e    4
dtype: int64

In [116]:
s3

f    5
g    6
dtype: int64

Calling concat with these objects in a list glues together the values and indexes. By default concat works along axis = 0

In [117]:
# Stack the Series end to end along axis 0 (the default); labels are kept as-is.
# NOTE(review): s1 is listed twice and s2 is left out, which is why the output
# repeats a/b and has no c/d/e — presumably [s1, s2, s3] was intended; confirm.
pd.concat([s1, s1, s3])

a    0
b    1
a    0
b    1
f    5
g    6
dtype: int64

In [118]:
pd.concat([s1, s2, s3], axis = 1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [119]:
s4 = pd.concat([s1, s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [120]:
pd.concat([s1, s4], axis = 1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [121]:
pd.concat([s1, s4], axis = 1, join = 'inner')

Unnamed: 0,0,1
a,0,0
b,1,1


A potential issue is that the concatenated pieces are not identifiable in the result. Suppose instead you wanted to create a hierarchical index on the concatenation axis. To do this, use the keys argument

In [124]:
result = pd.concat([s1, s2, s3], keys = ['one', 'two', 'three'])
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [125]:
result.index

MultiIndex([(  'one', 'a'),
            (  'one', 'b'),
            (  'two', 'c'),
            (  'two', 'd'),
            (  'two', 'e'),
            ('three', 'f'),
            ('three', 'g')],
           )

In [126]:
result.unstack()

Unnamed: 0,a,b,c,d,e,f,g
one,0.0,1.0,,,,,
two,,,2.0,3.0,4.0,,
three,,,,,,5.0,6.0


In the case of combining Series along axis = 1, the keys become the DataFrame column headers

In [127]:
pd.concat([s1, s2 ,s3], axis = 1, keys = ['one','two','three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [128]:
df1 = pd.DataFrame(np.arange(6).reshape(3,2), 
                   index = ['a','b','c'],
                   columns = ['one','two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2,2), 
                   index = ['a','c'],
                   columns = ['three','four'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [129]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [131]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [132]:
pd.concat([df1, df2], axis = 0, keys = ['level1', 'level2'])

Unnamed: 0,Unnamed: 1,one,two,three,four
level1,a,0.0,1.0,,
level1,b,2.0,3.0,,
level1,c,4.0,5.0,,
level2,a,,,5.0,6.0
level2,c,,,7.0,8.0


In [133]:
pd.concat({'level1': df1,
           'level2': df2}, 
          axis = 1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [135]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'], names = ['upper', 'lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [136]:
df1 = pd.DataFrame(np.random.randn(3,4), columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.randn(2,3), columns = ['b','d','a'])
df1

Unnamed: 0,a,b,c,d
0,0.548356,-0.929035,0.195107,0.183757
1,-1.245655,-1.187501,0.241387,0.188675
2,0.0933,-0.268711,0.979465,-0.988318


In [137]:
df2

Unnamed: 0,b,d,a
0,1.090719,0.352714,0.815194
1,-1.703791,2.118459,0.99848


In [138]:
pd.concat([df1, df2], ignore_index = True)

Unnamed: 0,a,b,c,d
0,0.548356,-0.929035,0.195107,0.183757
1,-1.245655,-1.187501,0.241387,0.188675
2,0.0933,-0.268711,0.979465,-0.988318
3,0.815194,1.090719,,0.352714
4,0.99848,-1.703791,,2.118459


In [139]:
pd.concat([df1, df2], ignore_index = False)

Unnamed: 0,a,b,c,d
0,0.548356,-0.929035,0.195107,0.183757
1,-1.245655,-1.187501,0.241387,0.188675
2,0.0933,-0.268711,0.979465,-0.988318
0,0.815194,1.090719,,0.352714
1,0.99848,-1.703791,,2.118459


In [140]:
pd.concat([df1, df2], ignore_index = True, axis = 1)

Unnamed: 0,0,1,2,3,4,5,6
0,0.548356,-0.929035,0.195107,0.183757,1.090719,0.352714,0.815194
1,-1.245655,-1.187501,0.241387,0.188675,-1.703791,2.118459,0.99848
2,0.0933,-0.268711,0.979465,-0.988318,,,


In [141]:
pd.concat([df1, df2], ignore_index = False, axis = 1)

Unnamed: 0,a,b,c,d,b.1,d.1,a.1
0,0.548356,-0.929035,0.195107,0.183757,1.090719,0.352714,0.815194
1,-1.245655,-1.187501,0.241387,0.188675,-1.703791,2.118459,0.99848
2,0.0933,-0.268711,0.979465,-0.988318,,,


## 8.2.4 Combining Data with Overlap

In [142]:
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index = ['f','e','d','c','b','a'])
b = pd.Series(np.arange(len(a), dtype = np.float64), 
              index = ['f','e','d','c','b','a'])

In [143]:
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [144]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [145]:
# Blank out the last element by position. Plain `b[-1]` on a string-labelled
# Series relies on the integer-as-position fallback of `[]`, which recent pandas
# deprecates; `.iloc` is explicitly positional and gives the same result.
b.iloc[-1] = np.nan

In [146]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [147]:
np.where(pd.isnull(a), b, a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

In [148]:
# b[:-2] keeps labels f..c and a[2:] keeps labels d..a (both slices are positional).
# combine_first keeps b's values where present, fills the gaps from a, and
# returns the union of the two indexes in sorted order (see output).
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [149]:
df1 = pd.DataFrame({'a':[1., np.nan, 5., np.nan],
                    'b':[np.nan, 2., np.nan, 6.],
                    'c':range(2, 18, 4)})
df2 = pd.DataFrame({'a':[5., 4., np.nan, 3., 7.],
                    'b':[np.nan, 3., 4., 6., 8.]})

In [150]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [151]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


combine_first patches missing data in the calling object with data from the object that is passed.

In [152]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


# 8.3 Reshaping and Pivoting

## 8.3.1 Reshaping with Hierarchical Indexing

In [153]:
# Global display option: cap printed frames at 10 rows. This affects every
# later cell run in the same kernel, not just this section.
pd.options.display.max_rows = 10

stack: rotates or pivots from the columns in the data to the rows

unstack: pivots from the rows into the columns

In [154]:
data = pd.DataFrame(np.arange(6).reshape((2,3)),
                    index = pd.Index(['Ohio','Colorado'], 
                                      name = 'state'),
                    columns = pd.Index(['one','two','three'],
                                        name = 'number'))
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [160]:
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64

In [161]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [162]:
result.unstack(level = -1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [163]:
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [164]:
result.unstack(1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [165]:
result.unstack(level = 'state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [166]:
result.unstack(level = 'number')

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [167]:
s1 = pd.Series([0,1,2,3], index = ['a','b','c','d'])
s2 = pd.Series([4,5,6], index = ['c','d','e'])
data2 = pd.concat([s1,s2], keys = ['one','two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [168]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [170]:
data2.unstack(1)

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [171]:
data2.unstack(0)

Unnamed: 0,one,two
a,0.0,
b,1.0,
c,2.0,4.0
d,3.0,5.0
e,,6.0


In [172]:
data2.unstack().stack() # default dropna = True

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [173]:
# Keep the NaN cells that unstack() introduced; by default stack() drops them.
# NOTE(review): the `dropna` argument is reportedly deprecated in recent pandas
# (2.1+) in favour of the new stack implementation — confirm before upgrading.
data2.unstack().stack(dropna = False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

When you unstack in a DataFrame, the level unstacked becomes the lowest level in the result 

In [175]:
df = pd.DataFrame({'left':result, 
                   'right': result + 5},
                  columns = pd.Index(['left','right'], 
                                     name = 'side'))
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [176]:
df.unstack(level = 'state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [177]:
df.unstack(level = 'state').stack(level = 'side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


## 8.3.2 Pivoting "Long" to "Wide" Format

In [178]:
# Load the quarterly macro dataset and preview the first rows.
# The path is relative so the cell does not depend on one user's home directory.
# NOTE(review): assumes the book's `examples/` folder sits next to this
# notebook — adjust the path if the data lives elsewhere.
data = pd.read_csv('examples/macrodata.csv')
data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [179]:
# Reshape the wide macro table into "long" form (one row per date/item pair).
# Note: this cell is not re-runnable on its own — after the reindex below,
# `data` no longer has the year/quarter columns the first line reads.
# Quarterly periods built from the year and quarter columns.
periods = pd.PeriodIndex(year = data.year, quarter = data.quarter, name = 'date')
# Keep only these three series; the column axis is named 'item'.
columns = pd.Index(['realgdp','infl','unemp'], name = 'item')
data = data.reindex(columns = columns)
# Index by the timestamp at the END of each quarter (hence the 23:59:59.999999999 values).
data.index = periods.to_timestamp('D','end')
# stack() moves the item columns into the rows; the unnamed value column (0) is renamed.
ldata = data.stack().reset_index().rename(columns = {0:'value'})

This is the so-called long format for multiple time series, or other observational data with two or more keys (here the keys are date and item). Each row in the table represents a single observation.

In [180]:
ldata

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,unemp,9.200
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560


Data is frequently stored this way in relational databases like MySQL, as a fixed schema (column names and data types) allows the number of distinct values in the item column to change as data is added to the table. In the previous example, date and item would usually be the primary keys, offering both relational integrity and easier joins. In some cases, the data may be more difficult to work with in this format. You might prefer to have a DataFrame containing one column per distinct item value indexed by timestamps in the date column. DataFrame's pivot method performs exactly this transformation

In [181]:
ldata.item.value_counts()

realgdp    203
infl       203
unemp      203
Name: item, dtype: int64

In [188]:
pivoted = ldata.pivot(index = 'date', columns = 'item', values = 'value')
pivoted

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2
...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2


The first two values passed are the columns to be used respectively as the row and column index, then finally an optional value column to fill the DataFrame

In [189]:
# Add a second, random value column so there are two columns to pivot at once.
# Note: this mutates ldata in place, and the draw is unseeded, so value2
# differs on every run.
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

Unnamed: 0,date,item,value,value2
0,1959-03-31 23:59:59.999999999,realgdp,2710.349,-0.045338
1,1959-03-31 23:59:59.999999999,infl,0.0,0.412427
2,1959-03-31 23:59:59.999999999,unemp,5.8,0.411913
3,1959-06-30 23:59:59.999999999,realgdp,2778.801,-0.284192
4,1959-06-30 23:59:59.999999999,infl,2.34,0.301771
5,1959-06-30 23:59:59.999999999,unemp,5.1,-0.257581
6,1959-09-30 23:59:59.999999999,realgdp,2775.488,0.673183
7,1959-09-30 23:59:59.999999999,infl,2.74,-0.786757
8,1959-09-30 23:59:59.999999999,unemp,5.3,0.035419
9,1959-12-31 23:59:59.999999999,realgdp,2785.204,-0.04231


Suppose you had two value columns that you wanted to reshape simultaneously. By omitting the last argument, you obtain a DataFrame with hierarchical columns.

In [190]:
pivoted = ldata.pivot(index = 'date', columns = 'item')
pivoted

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8,0.412427,-0.045338,0.411913
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,0.301771,-0.284192,-0.257581
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,-0.786757,0.673183,0.035419
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,-0.562575,-0.042310,-1.293912
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,0.164533,0.976746,0.111137
...,...,...,...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0,0.211818,-0.322892,0.160844
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9,-1.080858,-0.497353,1.890131
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1,0.733730,-1.356268,-2.877204
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2,-0.673079,-0.287184,-0.188939


In [192]:
# Same pivot, but naming both value columns explicitly
ldata.pivot(
    index='date',
    columns='item',
    values=['value', 'value2'],
)

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8,0.412427,-0.045338,0.411913
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,0.301771,-0.284192,-0.257581
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,-0.786757,0.673183,0.035419
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,-0.562575,-0.042310,-1.293912
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,0.164533,0.976746,0.111137
...,...,...,...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0,0.211818,-0.322892,0.160844
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9,-1.080858,-0.497353,1.890131
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1,0.733730,-1.356268,-2.877204
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2,-0.673079,-0.287184,-0.188939


In [193]:
# Pick the 'value' group from the outer column level and show its first five rows
pivoted['value'].head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2


**Expressing pivot with set_index and unstack:**

Pivot is equivalent to creating a hierarchical index using set_index followed by a call to unstack

In [195]:
# Long-format frame (date, item, value, value2) that the next cells re-index
ldata

Unnamed: 0,date,item,value,value2
0,1959-03-31 23:59:59.999999999,realgdp,2710.349,-0.045338
1,1959-03-31 23:59:59.999999999,infl,0.000,0.412427
2,1959-03-31 23:59:59.999999999,unemp,5.800,0.411913
3,1959-06-30 23:59:59.999999999,realgdp,2778.801,-0.284192
4,1959-06-30 23:59:59.999999999,infl,2.340,0.301771
...,...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370,-0.673079
605,2009-06-30 23:59:59.999999999,unemp,9.200,-0.188939
606,2009-09-30 23:59:59.999999999,realgdp,12990.341,2.019450
607,2009-09-30 23:59:59.999999999,infl,3.560,0.598945


In [196]:
# Step 1: move date and item out of the columns into a two-level row index
ldata.set_index(keys=['date', 'item'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value2
date,item,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,realgdp,2710.349,-0.045338
1959-03-31 23:59:59.999999999,infl,0.000,0.412427
1959-03-31 23:59:59.999999999,unemp,5.800,0.411913
1959-06-30 23:59:59.999999999,realgdp,2778.801,-0.284192
1959-06-30 23:59:59.999999999,infl,2.340,0.301771
...,...,...,...
2009-06-30 23:59:59.999999999,infl,3.370,-0.673079
2009-06-30 23:59:59.999999999,unemp,9.200,-0.188939
2009-09-30 23:59:59.999999999,realgdp,12990.341,2.019450
2009-09-30 23:59:59.999999999,infl,3.560,0.598945


In [197]:
# Step 2: unstack the 'item' level into the columns, giving the same
# layout as the pivot call
(
    ldata
    .set_index(['date', 'item'])
    .unstack(level='item')
)

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8,0.412427,-0.045338,0.411913
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,0.301771,-0.284192,-0.257581
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,-0.786757,0.673183,0.035419
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,-0.562575,-0.042310,-1.293912
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,0.164533,0.976746,0.111137
...,...,...,...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0,0.211818,-0.322892,0.160844
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9,-1.080858,-0.497353,1.890131
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1,0.733730,-1.356268,-2.877204
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2,-0.673079,-0.287184,-0.188939


## 8.3.3 Pivoting "Wide" to "Long" Format

An inverse operation to pivot for DataFrames is pandas.melt. Rather than transforming one column into many in a new DataFrame, it merges multiple columns into one, producing a DataFrame that is longer than the input

In [198]:
# Small wide-format frame: 'key' labels each row; A, B and C hold the values
df = pd.DataFrame(
    {
        'key': ['foo', 'bar', 'baz'],
        'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9],
    }
)
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


The 'key' column may be a group indicator, and the other columns are data values. When using pandas.melt, we must indicate which columns (if any) are group indicators

In [203]:
# Keep 'key' as the identifier column; A, B and C are stacked into
# (variable, value) pairs
melted = pd.melt(df, id_vars=['key'])
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [204]:
# Pivot the melted frame back to the wide layout, with 'key' as the row index
reshaped = melted.pivot(
    index='key',
    columns='variable',
    values='value',
)
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [205]:
# Turn the 'key' index back into an ordinary column; the columns axis
# keeps its name 'variable'
reshaped.reset_index()

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [206]:
# Specify a subset of columns to use as value columns
pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


In [207]:
# Melt without any group identifiers
pd.melt(df, value_vars=['A', 'B', 'C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [208]:
# 'key' is passed as a value column here, so the resulting 'value' column
# mixes strings (foo/bar/baz) with the integers from A and B
pd.melt(df, value_vars=['key', 'A', 'B'])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6


# 8.4 Conclusion