In [1]:
import numpy as np
import pandas as pd

# Hierarchical Indexing

## Multiple Indexed Series

In [2]:
# Using Python Tuples as Keys

index = [('California', 2000), ('California', 2010),
        ('New York', 2000),('New York', 2010),
        ('Texas', 2000), ('Texas', 2010)]

In [3]:
population = [33871648, 37253956,
             18976457, 19378102,
             20851820, 25145561]

In [5]:
pop = pd.Series(population, index=index)
print(pop)

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64


In [6]:
pop[('New York', 2000): ('Texas', 2010)]

(New York, 2000)    18976457
(New York, 2010)    19378102
(Texas, 2000)       20851820
(Texas, 2010)       25145561
dtype: int64

In [7]:
# Converting python tuple multi-index into pandas multi-index

index = pd.MultiIndex.from_tuples(index)
print(index)

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )


In [8]:
pop = pop.reindex(index)
print(pop)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [9]:
pop['New York': 'Texas']

New York  2000    18976457
          2010    19378102
Texas     2000    20851820
          2010    25145561
dtype: int64

In [10]:
pop[:,2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [11]:
# Converting Multi-index series into indexed dataframe using unstack()
pop_df = pop.unstack()
print(pop_df)

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


In [12]:
# What if we want to add another column of demographic data (under 18)

pop_df = pd.DataFrame({'total': pop,
                      'under18': [9267089, 9284094,
                                 4687371, 4318033,
                                 5906301, 6879014]})
print(pop_df)

                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687371
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014


In [13]:
f_u18 = pop_df['under18']/pop_df['total']

In [15]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


# Methods of MultiIndex Creation

- List
- Dictionary
- MultiIndex constructors

## Creating multi-index by List

In [16]:
df = pd.DataFrame(np.random.rand(4,2),
                 index=[['a', 'a', 'b', 'b'], [1,2,1,2]],
                 columns=['data1', 'data2'])

print(df)

        data1     data2
a 1  0.224705  0.665547
  2  0.460511  0.736936
b 1  0.320914  0.298263
  2  0.489936  0.992498


## Creating MultiIndex by Dictionary

In [17]:
data = {('California', 2000): 33871648,
            ('California', 2010): 37253956,
            (  'New York', 2000): 18976457,
            (  'New York', 2010): 19378102,
            (     'Texas', 2000):20851820,
            (     'Texas', 2010):25145561}

data

{('California', 2000): 33871648,
 ('California', 2010): 37253956,
 ('New York', 2000): 18976457,
 ('New York', 2010): 19378102,
 ('Texas', 2000): 20851820,
 ('Texas', 2010): 25145561}

In [19]:
pd.Series(data)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

## Explicit MultiIndex Constructors

In [21]:
# from the list of arrays
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1,2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [22]:
# from the list of tuples
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [23]:
# From a cartesian product of single series
pd.MultiIndex.from_product([['a', 'b'], [1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

## MultiIndex level names

In [24]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [27]:
pop.index.names = ['state', 'year']
print(pop)

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


## MultiIndex for Columns

In [36]:
# Creating medical data
# Hierarchical indices and columns

index = pd.MultiIndex.from_product([[2013,2014], [1,2]],
                                  names = ['year', 'visit'])

columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'],
                                      ['HR', 'Temp']],
                                    names=['patient', 'measurement'])

In [37]:
# Create some data

data = np.round(np.random.randn(4,6), 1) # round of 1
print(data)

[[-0.8  0.6  0.8 -0.3  1.3 -1.3]
 [ 1.2  0.2 -0.8  1.2 -1.6 -0.8]
 [-1.4 -0.9 -0.1  0.3  1.3  0.3]
 [ 0.2 -1.5 -0.2 -0.3  1.1 -0.1]]


In [38]:
data[:,::2] *=10
print(data)

[[ -8.    0.6   8.   -0.3  13.   -1.3]
 [ 12.    0.2  -8.    1.2 -16.   -0.8]
 [-14.   -0.9  -1.    0.3  13.    0.3]
 [  2.   -1.5  -2.   -0.3  11.   -0.1]]


In [39]:
data +=37

In [40]:
print(abs(data))

[[29.  37.6 45.  36.7 50.  35.7]
 [49.  37.2 29.  38.2 21.  36.2]
 [23.  36.1 36.  37.3 50.  37.3]
 [39.  35.5 35.  36.7 48.  36.9]]


In [41]:
# Create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)

patient       Bob       Guido         Sue      
measurement    HR  Temp    HR  Temp    HR  Temp
year visit                                     
2013 1       29.0  37.6  45.0  36.7  50.0  35.7
     2       49.0  37.2  29.0  38.2  21.0  36.2
2014 1       23.0  36.1  36.0  37.3  50.0  37.3
     2       39.0  35.5  35.0  36.7  48.0  36.9


In [42]:
# Access the data of a person
health_data['Guido']

Unnamed: 0_level_0,measurement,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,45.0,36.7
2013,2,29.0,38.2
2014,1,36.0,37.3
2014,2,35.0,36.7


In [43]:
# Access the data of a person from a particular year
health_data.loc[2013, 'Guido']

measurement,HR,Temp
visit,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45.0,36.7
2,29.0,38.2


In [44]:
# Recover patient heart rate
health_data['Guido', 'HR']

year  visit
2013  1        45.0
      2        29.0
2014  1        36.0
      2        35.0
Name: (Guido, HR), dtype: float64

In [46]:
# Access first two rows and first four columns
health_data.iloc[:2,:4]

Unnamed: 0_level_0,patient,Bob,Bob,Guido,Guido
Unnamed: 0_level_1,measurement,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2013,1,29.0,37.6,45.0,36.7
2013,2,49.0,37.2,29.0,38.2


In [48]:
# Acces the heart rate of Guido using slicing
health_data.loc[:,('Guido', 'HR')]

year  visit
2013  1        45.0
      2        29.0
2014  1        36.0
      2        35.0
Name: (Guido, HR), dtype: float64

In [49]:
# Access the heart rate of all patients first visit using tuple of multiple 
# indices

health_data.loc[(:,1), (:, 'HR')]

SyntaxError: invalid syntax (2608783577.py, line 4)

In [50]:
# IndexSlice

idx = pd.IndexSlice
health_data.loc[idx[:,1], idx[:,'HR']]

Unnamed: 0_level_0,patient,Bob,Guido,Sue
Unnamed: 0_level_1,measurement,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,29.0,45.0,50.0
2014,1,23.0,36.0,50.0


In [51]:
# Access the heart rate of Bob's first visit in 2014 using IndexSlice

health_data.loc[idx[2014,1], idx['Bob', 'HR']]

23.0

In [52]:
# Access the heart rate and temperature of Bob and sue's first visit 
# in 2014 using IndexSlice

health_data.loc[idx[2014,1], idx[['Bob', 'Sue'], 'HR':'Temp']]

patient  measurement
Bob      HR             23.0
         Temp           36.1
Sue      HR             50.0
         Temp           37.3
Name: (2014, 1), dtype: float64

# Data Aggregations on Multi-Indices

In [53]:
health_data

Unnamed: 0_level_0,patient,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,measurement,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,29.0,37.6,45.0,36.7,50.0,35.7
2013,2,49.0,37.2,29.0,38.2,21.0,36.2
2014,1,23.0,36.1,36.0,37.3,50.0,37.3
2014,2,39.0,35.5,35.0,36.7,48.0,36.9


In [54]:
# Average out the measurement in the two visits each year
data_mean = health_data.mean(level='year') # row wise aggregation
print(data_mean)

patient       Bob       Guido          Sue       
measurement    HR  Temp    HR   Temp    HR   Temp
year                                             
2013         39.0  37.4  37.0  37.45  35.5  35.95
2014         31.0  35.8  35.5  37.00  49.0  37.10


  data_mean = health_data.mean(level='year')


In [55]:
data_mean.mean(axis=1, level='measurement')

  data_mean.mean(axis=1, level='measurement')


measurement,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,37.166667,36.933333
2014,38.5,36.633333


# Combining Datasets: Concatenate

In [57]:
# one dimensional array

x = [1,2,3]
y = [4,5,6]
z = [7,8,9]
np.concatenate([x,y,z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [59]:
# Two dimensional array
x = [[1,2],
    [3,4]]
np.concatenate([x,x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

# Concatenation in Series and DataFrame
- pd.concat()

In [60]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1,2,3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4,5,6])

pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [61]:
# Create the Data for Dataframe

df1 = pd.DataFrame(np.arange(0,9).reshape(3,3),
                  index=[1,2,3],
                  columns=['a','b','c'])
print(df1)

   a  b  c
1  0  1  2
2  3  4  5
3  6  7  8


In [62]:
df2 = pd.DataFrame(np.arange(9,18).reshape(3,3),
                  index=[4,5,6],
                  columns=['a','b','c'])
print(df2)

    a   b   c
4   9  10  11
5  12  13  14
6  15  16  17


In [63]:
# Combining two dataframe
pd.concat([df1, df2])

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8
4,9,10,11
5,12,13,14
6,15,16,17


In [64]:
df3 = pd.DataFrame(np.arange(18,27).reshape(3,3),
                  index=[1,2,3],
                  columns=['d','e','f'])
print(df3)

    d   e   f
1  18  19  20
2  21  22  23
3  24  25  26


In [65]:
# Combine two dataframe column wise
pd.concat([df1,df3], axis=1)

Unnamed: 0,a,b,c,d,e,f
1,0,1,2,18,19,20
2,3,4,5,21,22,23
3,6,7,8,24,25,26
