# Hierarchical Indexing

In [44]:
import numpy as np
import pandas as pd

data = pd.Series(np.random.randn(9), index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
                                            [1,2,3,1,3,1,2,2,3]])
display(data)


a  1    0.665610
   2    0.703964
   3    0.041599
b  1   -0.186030
   3    0.686565
c  1    0.102203
   2   -0.972260
d  2   -3.321155
   3    0.424970
dtype: float64

In [45]:
print(data["a"])

1    0.665610
2    0.703964
3    0.041599
dtype: float64


In [46]:
print(data["b":"c"])

b  1   -0.186030
   3    0.686565
c  1    0.102203
   2   -0.972260
dtype: float64


In [47]:
print(data.loc[:, 2])

a    0.703964
c   -0.972260
d   -3.321155
dtype: float64


In [48]:
print(data)
df = data.unstack()
display(df)

a  1    0.665610
   2    0.703964
   3    0.041599
b  1   -0.186030
   3    0.686565
c  1    0.102203
   2   -0.972260
d  2   -3.321155
   3    0.424970
dtype: float64


Unnamed: 0,1,2,3
a,0.66561,0.703964,0.041599
b,-0.18603,,0.686565
c,0.102203,-0.97226,
d,,-3.321155,0.42497


In [49]:
df.stack()

a  1    0.665610
   2    0.703964
   3    0.041599
b  1   -0.186030
   3    0.686565
c  1    0.102203
   2   -0.972260
d  2   -3.321155
   3    0.424970
dtype: float64

In [50]:
frame = pd.DataFrame(np.arange(12).reshape(4,3), index=[["a","a","b","b"],[1,2,1,2]], columns=[["Ohio","Ohio","Colarado"],
                                                                                              ["Green","Red", "Green"]])
display(frame)
frame.index.names=["key1","key2"]
frame.columns.names=["State","Color"]
display(frame)
display(frame["Ohio"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colarado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Unnamed: 0_level_0,State,Ohio,Ohio,Colarado
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Unnamed: 0_level_0,Color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [51]:
frame= frame.swaplevel("key1","key2").sort_index()
display(frame)

Unnamed: 0_level_0,State,Ohio,Ohio,Colarado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


# Summary Statistics by Level

In [52]:
display(frame)

Unnamed: 0_level_0,State,Ohio,Ohio,Colarado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [53]:
display(frame.sum(level="key2"))

State,Ohio,Ohio,Colarado
Color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [54]:
display(frame.sum(level="Color", axis=1))

Unnamed: 0_level_0,Color,Green,Red
key2,key1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,a,2,1
1,b,14,7
2,a,8,4
2,b,20,10


In [55]:
display(frame.sum(level="key1"))

State,Ohio,Ohio,Colarado
Color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [56]:
frame = pd.DataFrame({"a": range(7),
                      "b": range(7,0,-1),
                      "c": ["one","one","one","two","two","two","two"],
                      "d": [0,1,2,0,1,2,3]}
                    )
display(frame)

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [57]:
 display(frame.set_index(["c","d"]))


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [58]:
display(frame.set_index(["c","d"], drop=False))


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


# Combining and Merging Datasets

In [59]:
df1 = pd.DataFrame({"key1":["b", "b", "a", "c", "a", "a", "b"],
                    "Data1": range(7)})
df2 = pd.DataFrame({"key1":["a", "b", "d"],
                    "Data1": range(3)})
pd.merge(df1,df2, on="key1")

Unnamed: 0,key1,Data1_x,Data1_y
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [60]:
df3 = pd.DataFrame({"lkey":["b", "b", "a", "c", "a", "a", "b"],
                    "Data1": range(7)})
df4 = pd.DataFrame({"rkey":["a", "b", "d"],
                    "Data1": range(3)})
pd.merge(df3,df4, left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,Data1_x,rkey,Data1_y
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [61]:
left1 = pd.DataFrame({"key": ["a", "b", "a", "a", "b", "c"],
                      "value": range(6)})
right1 = pd.DataFrame({"group_val": [3.5, 7]}, index=["a", "b"])
print(left1)
print(right1)


  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0


In [62]:
pd.merge(left1,right1, left_on="key", right_index=True)            # By Default Inner Join

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [63]:
pd.merge(left1,right1, left_on="key", right_index=True, how="outer")

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [64]:
pd.merge(left1,right1, left_on="key", right_index=True, how="right")

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [65]:
pd.merge(left1,right1, left_on="key", right_index=True, how="left")

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [66]:
df2 = pd.DataFrame({"city": ["new york", "chicago", "orlando"],
                    "temperature": [21,14,35]})
df2 = pd.DataFrame({"city": [ "chicago", "new york", "orlando"],
                    "temperature": [65,68,71]})

In [67]:
pd.merge(df1,df2, on="city", suffixes=["_left", "_right"])

KeyError: 'city'

In [68]:
df3 = pd.DataFrame({"city": ["new york", "chicago", "orlando", "baltimore"],
                    "temperature": [21,14,35,32]})
df4 = pd.DataFrame({"city": [ "chicago", "new york", "san francisco"],
                    "humadity": [65,68,75]})
display(df3)
display(df4)

Unnamed: 0,city,temperature
0,new york,21
1,chicago,14
2,orlando,35
3,baltimore,32


Unnamed: 0,city,humadity
0,chicago,65
1,new york,68
2,san francisco,75


In [69]:
pd.merge(df3,df4, on="city")

Unnamed: 0,city,temperature,humadity
0,new york,21,68
1,chicago,14,65


In [76]:
pd.merge(df3,df4, on="city", how="outer", indicator=True)


Unnamed: 0,city,temperature,humadity,_merge
0,new york,21.0,68.0,both
1,chicago,14.0,65.0,both
2,orlando,35.0,,left_only
3,baltimore,32.0,,left_only
4,san francisco,,75.0,right_only


In [77]:
pd.merge(df3,df4, on="city", how="left")

Unnamed: 0,city,temperature,humadity
0,new york,21,68.0
1,chicago,14,65.0
2,orlando,35,
3,baltimore,32,


In [78]:
pd.merge(df3,df4, on="city", how="right")

Unnamed: 0,city,temperature,humadity
0,new york,21.0,68
1,chicago,14.0,65
2,san francisco,,75


# Concatinating along an axis

In [79]:
s1 = pd.Series([1,2], index=["a", "b"])
s2 = pd.Series([3,4,5], index=["c", "d","e"])
s3 = pd.Series([6,7], index=["f", "g"])

In [80]:
pd.concat([s1,s2,s3])

a    1
b    2
c    3
d    4
e    5
f    6
g    7
dtype: int64

In [81]:
pd.concat([s1,s2,s3], axis=1, sort= True)

Unnamed: 0,0,1,2
a,1.0,,
b,2.0,,
c,,3.0,
d,,4.0,
e,,5.0,
f,,,6.0
g,,,7.0


In [82]:
s4 = pd.concat([s1,s3])
display(s4)

a    1
b    2
f    6
g    7
dtype: int64

In [83]:
pd.concat([s1,s4], axis =1 , join="inner")

Unnamed: 0,0,1
a,1,1
b,2,2


In [84]:
pd.concat([s1,s4], axis =1 , join_axes=[["a","c","b","a"]])

Unnamed: 0,0,1
a,1.0,1.0
c,,
b,2.0,2.0
a,1.0,1.0


In [85]:
df1 = pd.DataFrame(np.random.randn(3,4), columns=["a","b","c","d"])
df2 = pd.DataFrame(np.random.randn(2,3), columns=["b","d","a"])
display(df1)
display(df2)

Unnamed: 0,a,b,c,d
0,-0.051366,-0.213408,0.718665,-0.769792
1,1.764942,0.036201,-1.356844,-0.034638
2,-0.022888,1.644118,-0.659398,-0.89922


Unnamed: 0,b,d,a
0,1.200528,-0.492136,-1.470591
1,0.02554,-1.412815,-0.284147


In [86]:
pd.concat([df1,df2], ignore_index=True, sort=True)

Unnamed: 0,a,b,c,d
0,-0.051366,-0.213408,0.718665,-0.769792
1,1.764942,0.036201,-1.356844,-0.034638
2,-0.022888,1.644118,-0.659398,-0.89922
3,-1.470591,1.200528,,-0.492136
4,-0.284147,0.02554,,-1.412815


# Pivoting Long to Wide Format

In [87]:
df = pd.read_csv("macro.csv")
display(df)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [88]:
periods = pd.PeriodIndex(year = df.year, quarter= df.quarter, name="date")
columns = pd.Index(["realgdp", "infl", "unemp"], name="item")
data = data.reindex(columns=columns)
data.head()
data.index= periods.to_timestamp("D", "end")
ldata = data.stack().reset_index().rename(columns={0: 'value'})


AttributeError: 'DataFrame' object has no attribute 'quarter'

In [89]:
df = pd.DataFrame({"key": ["foo","bar","baz"],
                    "A":[1,2,3],
                    "B":[4,5,6],
                    "C":[7,8,9]})
display(df)


Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [90]:
melted = pd.melt(df, ["key"])
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [91]:
melted.pivot(index="key", columns="variable")

Unnamed: 0_level_0,value,value,value
variable,A,B,C
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,2,5,8
baz,3,6,9
foo,1,4,7
