### 축 연결

In [17]:
import numpy as np
import pandas as pd

### ```np.concatenate( [배열1, 배열2], axis )``` : 배열을 연결
- ```axis``` : 연결 방향 (**1 : 행 방향(좌우로 연결) / 0 : 열 방향(상하로 연결)**)

In [18]:
# Build a 3x4 integer grid holding 0..11 for the concatenation demos below.
arr = np.arange(3 * 4).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [19]:
# Join two copies of `arr` along axis 0: the result has 6 rows and 4 columns.
np.concatenate((arr, arr), axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

### ```pd.concat([ s1, s2, s3 ], axis )``` : 객체를 연결

#### pd.concat() 함수 인자
|  |  |
| -- | -- |
| objs | 연결할 객체 |
| axis | 연결할 축 방향 **0 : 열 방향, 1 : 행 방향** |
| join | join 방식. 'inner', 'outer' (기본값은 'outer') |
| join_axes | 합집합 / 교집합을 수행하는 다른 n-1축으로 사용할 인덱스 지정 (pandas 1.0에서 제거됨. 필요하면 결과에 `.reindex()`를 사용) |
| keys | 연결할 객체나 연결한 축에 대한 계층적 인덱스를 생성하는 데 연관된 값. 리스트나 임의의 값이 들어있는 배열, 튜플의 배열 또는 배열의 리스트(levels 옵션에 다차원 배열이 넘어온 경우)가 될 수 있다.|
| levels | 계층 인덱스 레벨로 사용할 인덱스 지정. keys가 넘어온 경우 여러 개의 인덱스를 지정 |
| names | keys나 levels 혹은 둘 다 있을 경우 생성된 계층 레벨을 위한 이름 |
| verify_integrity | 연결한 객체에 중복되는 축이 있는지 확인. 있다면 예외를 발생. (기본값 : False, 중복 허용) |
| ignore_index | 연결한 축의 인덱스를 유지하지 않고 range(total_length)로 새로운 인덱스를 생성 |

In [20]:
# Three Series whose index labels do not overlap.
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6], index=list('fg'))

# axis=0 (the default) appends the values end to end into one longer Series.
print(pd.concat([s1, s2, s3], axis=0))
# axis=1 lines the Series up as columns; labels missing from a Series give NaN.
print(pd.concat([s1, s2, s3], axis=1))

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0


In [21]:
# s4 holds the labels of s1 followed by those of s3 (a, b, f, g).
s4 = pd.concat([s1, s3], axis=0)
s4

a    0
b    1
f    5
g    6
dtype: int64

In [22]:
# Outer join (the default): keep the union of labels; s1 has no f/g, so NaN there.
print(pd.concat([s1, s4], join='outer', axis=1))

     0  1
a  0.0  0
b  1.0  1
f  NaN  5
g  NaN  6


- s1과 s4의 교집합

In [23]:
# Inner join: keep only the labels present in both Series (a and b).
print(pd.concat([s1, s4], join='inner', axis=1))

   0  1
a  0  0
b  1  1


### ```.unstack()``` : 계층적 인덱스 melting (spread)
### ```.stack()``` : 계층적 인덱스 gather

In [27]:
# `keys` tags each input with an outer index level, giving a two-level index.
result = pd.concat(
    [s1, s1, s3],
    axis=0,
    keys=['one', 'two', 'three'],
)
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [28]:
# Move the innermost index level (a/b/f/g) into columns; absent pairs become NaN.
result.unstack(level=-1)

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [29]:
# With axis=1 the keys become the column names of the resulting DataFrame.
pd.concat([s1, s2, s3], keys=['one', 'two', 'three'], axis=1)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [30]:
# df1 covers rows a, b, c; df2 covers only a and c, so row b has no df2 values.
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=list('ac'),
    columns=['three', 'four'],
)

# The keys become the outer level of a two-level column index.
pd.concat([df1, df2], keys=['level1', 'level2'], axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [31]:
# A dict of objects can be passed instead of a list plus `keys`. This result is
# not displayed because it is not the last expression in the cell.
pd.concat({'level1': df1, 'level2': df2}, axis=1)
# `names` labels the two column levels created by `keys`.
pd.concat(
    [df1, df2],
    axis=1,
    keys=['level1', 'level2'],
    names=['upper', 'lower'],
)

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [32]:
# Two frames of random normals with partly different columns (df2 lacks 'c').
df1 = pd.DataFrame(np.random.randn(3, 4), columns=list('abcd'))
df2 = pd.DataFrame(np.random.randn(2, 3), columns=list('bda'))

# ignore_index=True discards the original row labels and renumbers 0..4.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,1.56612,0.537015,0.622858,-0.186753
1,-1.349183,0.862339,1.020778,-2.028846
2,1.196424,0.242049,-0.580262,0.273209
3,0.180382,-0.983779,,2.336101
4,-1.334482,0.794997,,0.042793


### 
### 겹치는 데이터 병합

In [33]:
# Two float Series sharing the same labels; `a` has gaps (NaN) that `b` can fill.
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])
# Blank out the last element by position. `.iloc` makes that explicit: plain
# `b[-1]` on a label-indexed Series relies on the deprecated fallback that
# treats integer keys as positions.
b.iloc[-1] = np.nan
print(a)
print(b)

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64


### ```np.where(조건문, a, b)``` : ifelse(조건문, a, b)

In [35]:
# Element-wise choice: take b where a is missing, otherwise keep a.
np.where(a.isnull(), b, a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

### ```b.combine_first(a)``` : b가 있다면 b, 없다면 a

In [36]:
# Start from b without its last two entries and fill its gaps from a without
# its first two; the result covers the union of labels, in sorted order.
b.iloc[:-2].combine_first(a.iloc[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [41]:
# df1 has four rows and a 'c' column; df2 has five rows and no 'c' column.
df1 = pd.DataFrame({
    'a': [1., np.nan, 5., np.nan],
    'b': [np.nan, 2., np.nan, 6.],
    'c': range(2, 18, 4),
})
df2 = pd.DataFrame({
    'a': [5., 4., np.nan, 3., 7.],
    'b': [np.nan, 3., 4., 6., 8.],
})

# Keep df1's values and fill only its missing cells (and the extra row) from df2.
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


### 
### 재형성과 피벗(pivot) : 표 형식의 데이터를 재배치
### ```.unstack()``` : 행을 열로 피벗, 계층적 인덱스 melting (spread)
### ```.stack()``` : 데이터의 열을 행으로 피벗(회전), 계층적 인덱스 gather

In [47]:
# 2x3 frame whose row axis is named 'state' and column axis is named 'number'.
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)

data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [48]:
# Pivot the columns into an inner index level, giving a Series indexed by
# (state, number).
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [43]:
# Unstacking the innermost level ('number') restores the original wide layout.
result.unstack(level=-1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [49]:
# Unstack the outer level (position 0, 'state') so the states become columns.
result.unstack(level=0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


- **레벨 숫자나 이름을 전달해서 끄집어낼 단계를 지정 가능**

In [51]:
# Same as unstacking level 0, but the level is selected by its name.
result.unstack(level='state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


#### 해당 레벨에 있는 모든 값이 하위그룹에 속하지 않을 경우, ```.unstack()```을 하게되면 누락된 데이터가 발생가능

In [52]:
# Two Series whose labels only partly overlap (c and d are shared).
s1 = pd.Series([0, 1, 2, 3], index=list('abcd'))
s2 = pd.Series([4, 5, 6], index=list('cde'))
# Tag each Series with an outer key so the result has a two-level index.
data2 = pd.concat([s1, s2], axis=0, keys=['one', 'two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [53]:
# Inner labels become columns; pairs that do not exist (e.g. one/e) show as NaN.
data2.unstack(level=-1)

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


#### ```.stack()```메서드는 누락된 데이터를 자동으로 필터링하기 때문에, 연산을 쉽게 원상 복구 가능

In [54]:
# Round trip: in the recorded output the NaN cells created by unstack() are
# dropped again by stack(), so the original seven entries come back (as floats).
# NOTE(review): newer pandas releases change stack()'s NaN handling
# (`future_stack`) — confirm against the installed version.
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [55]:
# dropna=False keeps the NaN cells, so every (key, label) pair appears.
# NOTE(review): `dropna` is deprecated in newer pandas releases together with
# the `future_stack` change — confirm against the installed version.
data2.unstack().stack(dropna = False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

####  데이터 프레임을 ```.unstack()```할 때, unstack 레벨은 결과에서 가장 낮은 단계가 됨

In [57]:
# Two columns built from the stacked Series; the column axis is named 'side'.
df = pd.DataFrame(
    {'left': result, 'right': result + 5},
    columns=pd.Index(['left', 'right'], name='side'),
)
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [58]:
# The unstacked level ('state') becomes the lowest column level, under 'side'.
df.unstack(level='state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


### 
### 긴 형식에서 넓은 형식으로 피벗

In [72]:
# Load the macro data set from CSV.
# NOTE(review): this is an absolute, machine-specific path — adjust it to
# wherever macrodata.csv lives on your system.
data = pd.read_csv('/Users/이찬솔/Documents/Python_for_Data_Analysis/examples/macrodata.csv')

In [73]:
# (rows, columns) of the loaded table; the recorded run shows (203, 14).
data.shape

(203, 14)

In [74]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [75]:
# Combine the year and quarter columns into a PeriodIndex named 'date'.
# NOTE(review): newer pandas releases deprecate building a PeriodIndex from
# field keywords in favour of PeriodIndex.from_fields — confirm for your version.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
# Keep only the three series of interest and name the column axis 'item'.
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
data.head(n=5)

item,realgdp,infl,unemp
0,2710.349,0.0,5.8
1,2778.801,2.34,5.1
2,2775.488,2.74,5.3
3,2785.204,0.27,5.6
4,2847.699,2.31,5.2


### ldata = 긴 형식

In [92]:
# Use the end-of-period timestamp (daily frequency) of each period as the row index.
data.index = periods.to_timestamp(freq='D', how='end')
# Long format: one row per (date, item) pair; the stacked values land in a
# column labelled 0, which is renamed to 'value'.
ldata = (
    data.stack()
    .reset_index()
    .rename(columns={0: 'value'})
)
ldata.head(n=5)

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.0
2,1959-03-31 23:59:59.999999999,unemp,5.8
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.34


### ```.pivot(index='a', columns='b', values='c')``` : 'a'열을 인덱스로, 'b'열의 값을 열 이름으로, 'c'열의 값을 셀 값으로 spread (R의 spread와 동일)

In [78]:
# Long -> wide: one row per date, one column per item, cells taken from 'value'.
# Keyword arguments are used because newer pandas releases reject positional
# arguments to DataFrame.pivot; the keyword form works on old and new versions.
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head(5)

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2


In [79]:
# Add a second value column of random normals, one draw per row.
n_rows = len(ldata)
ldata['value2'] = np.random.randn(n_rows)
ldata.head(n=5)

Unnamed: 0,date,item,value,value2
0,1959-03-31 23:59:59.999999999,realgdp,2710.349,-0.411411
1,1959-03-31 23:59:59.999999999,infl,0.0,-0.025594
2,1959-03-31 23:59:59.999999999,unemp,5.8,0.924754
3,1959-06-30 23:59:59.999999999,realgdp,2778.801,-0.049331
4,1959-06-30 23:59:59.999999999,infl,2.34,-0.270773


- **마지막 인자를 생략해서 계층적 열을 가지는 데이터 프레임 생성**

In [80]:
# With `values` omitted, every remaining column is pivoted, producing
# hierarchical columns (value / value2 on top, item below).
# Keyword arguments are used because newer pandas releases reject positional
# arguments to DataFrame.pivot; the keyword form works on old and new versions.
pivoted = ldata.pivot(index='date', columns='item')
pivoted.head(5)

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8,-0.025594,-0.411411,0.924754
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,-0.270773,-0.049331,-0.315129
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,1.256172,0.863254,-0.331118
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,0.714944,-0.52277,0.53177
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,-2.026714,-0.628312,-1.228782


In [81]:
# Select the 'value' block from the top column level and preview five rows.
pivoted['value'].head(n=5)

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2


In [82]:
# Same reshaping by hand: index on (date, item), then move 'item' into columns.
unstacked = (
    ldata
    .set_index(['date', 'item'])
    .unstack(level='item')
)
unstacked.head(n=5)

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8,-0.025594,-0.411411,0.924754
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,-0.270773,-0.049331,-0.315129
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,1.256172,0.863254,-0.331118
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,0.714944,-0.52277,0.53177
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,-2.026714,-0.628312,-1.228782


#### 
### 넓은 형식에서 긴 형식으로 피벗

### ```pd.melt( 데이터, '열')``` : 데이터를 melting

In [83]:
# Small wide-format table: an identifier column plus three value columns.
df = pd.DataFrame({
    'key': ['foo', 'bar', 'baz'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
})
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [84]:
# Wide -> long: keep 'key' as the identifier and fold A, B, C into
# (variable, value) pairs.
melted = pd.melt(df, id_vars=['key'])
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


- **'key'별로, 'variable'과 'value'를 spread**

In [85]:
# Long -> wide again: one row per key, one column per variable.
# Keyword arguments are used because newer pandas releases reject positional
# arguments to DataFrame.pivot; the keyword form works on old and new versions.
reshaped = melted.pivot(index='key', columns='variable', values='value')
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


- **데이터를 다시 열로 돌려놓음**

In [86]:
# Turn the 'key' index back into an ordinary column.
reshaped.reset_index(drop=False)

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


- **데이터 값으로 사용할 열들의 집합을 지정**
- **'key'별로, 'A'열과 'B'열의 값을 melt**

In [87]:
# Melt only columns A and B, keeping 'key' as the identifier; C is left out.
pd.melt(df, value_vars=['A', 'B'], id_vars=['key'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


- **'A', 'B', 'C'**를 melt

In [88]:
# No identifier columns: the result has only the (variable, value) pairs.
pd.melt(frame=df, value_vars=['A', 'B', 'C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [89]:
# 'key' is melted as a value column too, so 'value' mixes strings and integers.
pd.melt(frame=df, value_vars=['key', 'A', 'B'])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6
