---
### DataFrame Merging (병합)

In [65]:
import pandas as pd

In [66]:
df1 = pd.DataFrame({
    'key':list('bbacaab'),
    'data1':range(7)
})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [67]:
df2 = pd.DataFrame({
    'key':list('abd'),
    'data2':range(3)
})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [68]:
# default: Inner Join
pd.merge(df1,df2,on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [69]:
# 모두 보이기
pd.merge(df1,df2,on='key',how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [70]:
# df1 기준으로 합치기
pd.merge(df1,df2,on='key',how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [71]:
# df2 기준으로 합치기
pd.merge(df1,df2,on='key',how='right')

Unnamed: 0,key,data1,data2
0,a,2.0,0
1,a,4.0,0
2,a,5.0,0
3,b,0.0,1
4,b,1.0,1
5,b,6.0,1
6,d,,2


### key name이 다를경우

In [72]:
df1 = pd.DataFrame({
    'lkey':list('bbacab'),
    'data1':range(6)
})
df1

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [73]:
df2 = pd.DataFrame({
    'rkey':list('ababd'),
    'data2':range(5)
})
df2

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [74]:
pd.merge(df1,df2, left_on='lkey',right_on='rkey',how='inner')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,0,b,3
2,b,1,b,1
3,b,1,b,3
4,b,5,b,1
5,b,5,b,3
6,a,2,a,0
7,a,2,a,2
8,a,4,a,0
9,a,4,a,2


---
### DataFrame의 Concatenation

In [75]:
import numpy as np

In [76]:
df1 = pd.DataFrame(
    np.arange(6).reshape(3,2),
    index=['a','b','c'],
    columns=['one','two']
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [77]:
df2 = pd.DataFrame(
    5 + np.arange(4).reshape(2,2),
    index=['a','c'],
    columns=['three','four']
)
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [78]:
# df1과 df2를 합치기
pd.concat([df1,df2],axis=1)
# index 기준으로 합쳐짐


Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [79]:
pd.concat([df1,df2],axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [80]:
pd.concat([df1,df2],axis=0)

Unnamed: 0,one,two,three,four
a,0.0,1.0,,
b,2.0,3.0,,
c,4.0,5.0,,
a,,,5.0,6.0
c,,,7.0,8.0


In [81]:
pd.concat([df1,df2],axis=0, ignore_index=True)

Unnamed: 0,one,two,three,four
0,0.0,1.0,,
1,2.0,3.0,,
2,4.0,5.0,,
3,,,5.0,6.0
4,,,7.0,8.0


### 데이터프레임 중복값 제거

In [82]:
df = pd.DataFrame(
    {
        'k1':['one'] * 3 + ['two'] * 4,
        'k2':[1,1,2,3,3,4,4]
    }
)
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [83]:
# 중복값 확인
df.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [84]:
# 중복값 제거
df.drop_duplicates()
# 모든 컬럼값이 동일한 중복row 제거

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [85]:
# 새로운 열 추가
df['v1'] = range(7)
df

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [86]:
df['k1'].drop_duplicates()

0    one
3    two
Name: k1, dtype: object

In [87]:
# k1 값들로 중복값 제거
df.drop_duplicates(['k1'])
# k1 컬럼 내에서 중복값이 있다면 전부 제거됨

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [88]:
# k1 값들로 중복의 마지막 값 출력
df.drop_duplicates(['k1'],keep='last')
# default는 첫번째 값

Unnamed: 0,k1,k2,v1
2,one,2,2
6,two,4,6


---
### Category

In [89]:
df3 = pd.DataFrame(
    {
        'id':[1,2,3,4,5,6],
        'raw_grade':['a','b','b','a','a','e']
    }
)
df3

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [90]:
# category 자료형으로 변환하기
df3['grade'] = df3['raw_grade'].astype('category')
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [91]:
print(df3.raw_grade)
print(df3.grade)

0    a
1    b
2    b
3    a
4    a
5    e
Name: raw_grade, dtype: object
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']


In [93]:
# category를 이용하여 자료 변형
df3.grade.cat.categories = ['very good','good','very bad']
df3

  df3.grade.cat.categories = ['very good','good','very bad']


Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [94]:
df3.sort_values(by='grade')

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bad


In [95]:
df3.grade.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6 entries, 0 to 5
Series name: grade
Non-Null Count  Dtype   
--------------  -----   
6 non-null      category
dtypes: category(1)
memory usage: 266.0 bytes


### 데이터의 범위 정하기

In [96]:
ages=[20,22,25,27,21,23,37,31,61,45,41,32]
bins=[18,25,35,60,100]
# bins: 나이 범위 구분

In [97]:
cats = pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [98]:
# 범주 관련 Code 보기
cats.codes
# 0번 범위, 0번 범위, 0번 범위, 1번 범위, 0번 범위, ........

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [99]:
# 범주 관련 빈도수 보기
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [102]:
# category 이름 정하기
group_names = ['Youth','YoungAdult','MiddleAged','Senior']
cat2 = pd.cut(ages,bins,labels=group_names)
cat2.value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
dtype: int64