## The MultiIndex Object

### View as Copy

In [1]:
import pandas as pd

In [5]:
mlo = pd.read_csv('./Pandas Dataset - I/co2-mm-mlo.csv', na_values=-90.99, index_col='Date', parse_dates=True)

In [6]:
mlo.head()

Unnamed: 0_level_0,Decimal Date,Average,Interpolated,Trend,Number of Days
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1958-03-01,1958.208,315.71,315.71,314.62,-1
1958-04-01,1958.292,317.45,317.45,315.29,-1
1958-05-01,1958.375,317.5,317.5,314.71,-1
1958-06-01,1958.458,-99.99,317.1,314.85,-1
1958-07-01,1958.542,315.86,315.86,314.98,-1


In [7]:
s = mlo["Interpolated"]

In [8]:
s

Date
1958-03-01    315.71
1958-04-01    317.45
1958-05-01    317.50
1958-06-01    317.10
1958-07-01    315.86
               ...  
2016-08-01    402.25
2016-09-01    401.03
2016-10-01    401.57
2016-11-01    403.53
2016-12-01    404.48
Name: Interpolated, Length: 706, dtype: float64

In [9]:
mlo.assign(smooth=s.rolling(12).mean()).tail()

Unnamed: 0_level_0,Decimal Date,Average,Interpolated,Trend,Number of Days,smooth
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-08-01,2016.625,402.25,402.25,404.09,23,403.1575
2016-09-01,2016.708,401.03,401.03,404.52,24,403.440833
2016-10-01,2016.792,401.57,401.57,404.93,29,403.714167
2016-11-01,2016.875,403.53,403.53,405.57,27,403.995
2016-12-01,2016.958,404.48,404.48,405.25,29,404.214167


a copy is returned

In [10]:
mlo.head()

Unnamed: 0_level_0,Decimal Date,Average,Interpolated,Trend,Number of Days
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1958-03-01,1958.208,315.71,315.71,314.62,-1
1958-04-01,1958.292,317.45,317.45,315.29,-1
1958-05-01,1958.375,317.5,317.5,314.71,-1
1958-06-01,1958.458,-99.99,317.1,314.85,-1
1958-07-01,1958.542,315.86,315.86,314.98,-1


In [11]:
s2 = mlo.loc[:'1958-05', 'Average']
s2

Date
1958-03-01    315.71
1958-04-01    317.45
1958-05-01    317.50
Name: Average, dtype: float64

A view is returned

In [12]:
s2[:] = 313

In [13]:
s2

Date
1958-03-01    313.0
1958-04-01    313.0
1958-05-01    313.0
Name: Average, dtype: float64

In [14]:
mlo.head()

Unnamed: 0_level_0,Decimal Date,Average,Interpolated,Trend,Number of Days
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1958-03-01,1958.208,313.0,315.71,314.62,-1
1958-04-01,1958.292,313.0,317.45,315.29,-1
1958-05-01,1958.375,313.0,317.5,314.71,-1
1958-06-01,1958.458,-99.99,317.1,314.85,-1
1958-07-01,1958.542,315.86,315.86,314.98,-1


## Chained Indexing

In [15]:
mlo['Average']['1958-03']

Date
1958-03-01    313.0
Name: Average, dtype: float64

In [16]:
mlo['Average']['1958-03'] = 312

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mlo['Average']['1958-03'] = 312
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Generally speaking, chained indexing is not a good practice. To set a new value, use `mlo.loc[row_indexer, col_indexer]` because `mlo.loc` is guaranteed to be `mlo` itself.

In [17]:
mlo.loc['1958-03', 'Average']

Date
1958-03-01    312.0
Name: Average, dtype: float64

## Hierarchical indexing

In [19]:
h_index = pd.MultiIndex.from_product([['first', 'second'], ['A', 'B']])
h_index

MultiIndex([( 'first', 'A'),
            ( 'first', 'B'),
            ('second', 'A'),
            ('second', 'B')],
           )

In [20]:
x = pd.Series(range(4), index=h_index)
x

first   A    0
        B    1
second  A    2
        B    3
dtype: int64

In [21]:
x['first']

A    0
B    1
dtype: int64

In [22]:
x['first']['B']

1

In the above, there are two selection operations.

In [24]:
x.loc[('first', 'B')]

1

In the above, there is a single selection operation.

We can end up with a hierarchical index when stacking records.

In [26]:
gl = pd.read_csv('./Pandas Dataset - I/co2-mm-gl.csv', na_values=-99.99, index_col='Date', parse_dates=True)

gl = gl[['Average']]
gl.columns = ['Average_gl']
gl.head()

Unnamed: 0_level_0,Average_gl
Date,Unnamed: 1_level_1
1980-01-01,338.45
1980-02-01,339.14
1980-03-01,339.46
1980-04-01,339.86
1980-05-01,340.3


In [27]:
ml = mlo[['Average']]
ml.columns = ['Average_mlo']
ml.head()

Unnamed: 0_level_0,Average_mlo
Date,Unnamed: 1_level_1
1958-03-01,312.0
1958-04-01,313.0
1958-05-01,313.0
1958-06-01,-99.99
1958-07-01,315.86


In [28]:
ml = ml[ml.index >= '1980-01']

gl = gl.head()
ml = ml.head()

In [29]:
gl

Unnamed: 0_level_0,Average_gl
Date,Unnamed: 1_level_1
1980-01-01,338.45
1980-02-01,339.14
1980-03-01,339.46
1980-04-01,339.86
1980-05-01,340.3


In [30]:
ml

Unnamed: 0_level_0,Average_mlo
Date,Unnamed: 1_level_1
1980-01-01,337.9
1980-02-01,338.34
1980-03-01,340.01
1980-04-01,340.93
1980-05-01,341.48


In [31]:
multi = pd.concat([ml, gl], axis=1).stack()
multi

Date                   
1980-01-01  Average_mlo    337.90
            Average_gl     338.45
1980-02-01  Average_mlo    338.34
            Average_gl     339.14
1980-03-01  Average_mlo    340.01
            Average_gl     339.46
1980-04-01  Average_mlo    340.93
            Average_gl     339.86
1980-05-01  Average_mlo    341.48
            Average_gl     340.30
dtype: float64

In [32]:
multi.index

MultiIndex([('1980-01-01', 'Average_mlo'),
            ('1980-01-01',  'Average_gl'),
            ('1980-02-01', 'Average_mlo'),
            ('1980-02-01',  'Average_gl'),
            ('1980-03-01', 'Average_mlo'),
            ('1980-03-01',  'Average_gl'),
            ('1980-04-01', 'Average_mlo'),
            ('1980-04-01',  'Average_gl'),
            ('1980-05-01', 'Average_mlo'),
            ('1980-05-01',  'Average_gl')],
           names=['Date', None])

In [33]:
multi.index.get_level_values('Date')

DatetimeIndex(['1980-01-01', '1980-01-01', '1980-02-01', '1980-02-01',
               '1980-03-01', '1980-03-01', '1980-04-01', '1980-04-01',
               '1980-05-01', '1980-05-01'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [34]:
multi.loc[multi.index.get_level_values('Date') < '1980-03']

Date                   
1980-01-01  Average_mlo    337.90
            Average_gl     338.45
1980-02-01  Average_mlo    338.34
            Average_gl     339.14
dtype: float64

## Reshaping

The `stack()` function compressed a level in the DataFrame’s columns to produce a Series (as a reminder, `multi = pd.concat([ml, gl], axis=1).stack()`).

In [36]:
pd.concat([ml, gl], axis=1)

Unnamed: 0_level_0,Average_mlo,Average_gl
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-01-01,337.9,338.45
1980-02-01,338.34,339.14
1980-03-01,340.01,339.46
1980-04-01,340.93,339.86
1980-05-01,341.48,340.3


In [37]:
multi

Date                   
1980-01-01  Average_mlo    337.90
            Average_gl     338.45
1980-02-01  Average_mlo    338.34
            Average_gl     339.14
1980-03-01  Average_mlo    340.01
            Average_gl     339.46
1980-04-01  Average_mlo    340.93
            Average_gl     339.86
1980-05-01  Average_mlo    341.48
            Average_gl     340.30
dtype: float64

In [38]:
multi.unstack()

Unnamed: 0_level_0,Average_mlo,Average_gl
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-01-01,337.9,338.45
1980-02-01,338.34,339.14
1980-03-01,340.01,339.46
1980-04-01,340.93,339.86
1980-05-01,341.48,340.3


## Pivoting

In [39]:
rec = pd.concat([ml, gl], axis=1).stack().reset_index()
rec.columns = ['date', 'variable', 'value']
rec

Unnamed: 0,date,variable,value
0,1980-01-01,Average_mlo,337.9
1,1980-01-01,Average_gl,338.45
2,1980-02-01,Average_mlo,338.34
3,1980-02-01,Average_gl,339.14
4,1980-03-01,Average_mlo,340.01
5,1980-03-01,Average_gl,339.46
6,1980-04-01,Average_mlo,340.93
7,1980-04-01,Average_gl,339.86
8,1980-05-01,Average_mlo,341.48
9,1980-05-01,Average_gl,340.3


The above data is in 'stacked' or 'record' format.

In [40]:
rec

Unnamed: 0,date,variable,value
0,1980-01-01,Average_mlo,337.9
1,1980-01-01,Average_gl,338.45
2,1980-02-01,Average_mlo,338.34
3,1980-02-01,Average_gl,339.14
4,1980-03-01,Average_mlo,340.01
5,1980-03-01,Average_gl,339.46
6,1980-04-01,Average_mlo,340.93
7,1980-04-01,Average_gl,339.86
8,1980-05-01,Average_mlo,341.48
9,1980-05-01,Average_gl,340.3


In [41]:
rec[rec.variable == 'Average_mlo']

Unnamed: 0,date,variable,value
0,1980-01-01,Average_mlo,337.9
2,1980-02-01,Average_mlo,338.34
4,1980-03-01,Average_mlo,340.01
6,1980-04-01,Average_mlo,340.93
8,1980-05-01,Average_mlo,341.48


In [42]:
pivot_table = rec.pivot(index='date', columns='variable', values='value')
pivot_table

variable,Average_gl,Average_mlo
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-01-01,338.45,337.9
1980-02-01,339.14,338.34
1980-03-01,339.46,340.01
1980-04-01,339.86,340.93
1980-05-01,340.3,341.48


The pivoted data is more suitable for timeseries analysis.

In [43]:
pivot_table['Average_gl']

date
1980-01-01    338.45
1980-02-01    339.14
1980-03-01    339.46
1980-04-01    339.86
1980-05-01    340.30
Name: Average_gl, dtype: float64

In [44]:
pivot_table['Average_mlo']

date
1980-01-01    337.90
1980-02-01    338.34
1980-03-01    340.01
1980-04-01    340.93
1980-05-01    341.48
Name: Average_mlo, dtype: float64

In [45]:
pivot_table.index

DatetimeIndex(['1980-01-01', '1980-02-01', '1980-03-01', '1980-04-01',
               '1980-05-01'],
              dtype='datetime64[ns]', name='date', freq=None)