# 판다스 특성
* 결측 데이터들을 쉽게 처리
* Dataframe 및 고차원 객체에서 열을 삽입 및 삭제 가능
* 객체를 라벨 집합에 명시적으로 정렬하거나, 사용자가 라벨을 무시하고 Series Dataframe 등의 계산에서 자동으로 데이터 조정 가능
* 데이터 세트에서 집계 및 변환을 위한 split, apply, combine 작업을 수행할 수 있는 group_by합수 제공
* 누락된 데이터 또는 다른 Python, Numpy 데이터 구조에서 서로 다른 인덱싱 데이터를 Dataframe개체로 쉽게 변환
* 대용량 데이터 세트의 지능형 라벨 기반 슬라이싱, 고급 인덱싱 및 부분 집합 구하기 가능
* 직관적인 데이터 세트 병합 및 결합
* 데이터 세트의 유연한 재구성 및 피벗
* 축의 꼐층적 라벨링
* 플랫 파일(CSV 및 구분), Excel 파일, 데이터 베이스 로딩 및초고속 HDF5형식의 데이터 저장/로드에 사용되는 강력한 IO도구
* 시계열 특정 기능: 날짜 범위 생성 및 주파수 변환, 무빙 윈도우(moving window)통계, 날짜 이동 및 지연

In [9]:
import numpy as np
import pandas as pd
pd.__version__

'1.1.3'

 # 판다스 객체
 ## Series 객체

In [37]:
s= pd.Series([0, 0.25,0.5,0.75,1.0])
s

0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [38]:
s.values

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [39]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [42]:
s[1]

0.25

In [41]:
s[1:4]

1    0.25
2    0.50
3    0.75
dtype: float64

In [34]:
s= pd.Series([0, 0.25,0.5,0.75,1.0],
            index=['a','b','c','d','e'])
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [36]:
s['a']

0.0

In [None]:
s[['a','b','c']]

In [None]:
'b' in s

In [27]:
s= pd.Series([0, 0.25,0.5,0.75,1.0],
            index=['a','b','c','d','e'])
s

2     False
4      True
6     False
8      True
10    False
dtype: bool

In [56]:
pop_tuple= {'서울특별시': 9720846,
            '부산광역시': 3804423,
            '인천광역시': 2947217,
            '대구광역시': 2427954,
            '대전광역시': 1471040,
            '광주광역시': 1455048}
population=pd.Series(pop_tuple)
population

서울특별시    9720846
부산광역시    3804423
인천광역시    2947217
대구광역시    2427954
대전광역시    1471040
광주광역시    1455048
dtype: int64

In [46]:
population['서울특별시']

9720846

In [49]:
population['서울특별시':'인천광역시']
population[0:3]

서울특별시    9720846
부산광역시    3804423
인천광역시    2947217
dtype: int64

## DataFrame 객체
* pd.DataDrame({시리즈, 시리즈, ...})

In [53]:
pd.DataFrame([{'a':2,'b':4,'d':3},{'a':4,'b':5,'c':7}])

Unnamed: 0,a,b,d,c
0,2,4,3.0,
1,4,5,,7.0


In [55]:
pd.DataFrame(np.random.rand(5,5),
            columns=['a','b','c','d','e'],
            index=[1,2,3,4,5])

Unnamed: 0,a,b,c,d,e
1,0.467845,0.326307,0.077559,0.564413,0.822174
2,0.273076,0.042126,0.101116,0.181815,0.460698
3,0.701311,0.06049,0.246041,0.811878,0.724621
4,0.414291,0.083142,0.956892,0.544452,0.044622
5,0.601455,0.948671,0.820452,0.648893,0.55549


In [58]:
male_tuple= {'서울특별시': 4732275,
             '부산광역시': 1668618,
             '인천광역시': 1476813,
             '대구광역시': 1198815,
             '대전광역시': 734441,
             '광주광역시': 720060}
male=pd.Series(male_tuple)
male

서울특별시    4732275
부산광역시    1668618
인천광역시    1476813
대구광역시    1198815
대전광역시     734441
광주광역시     720060
dtype: int64

In [59]:
female_tuple= {'서울특별시': 4988571,
             '부산광역시': 1735805,
             '인천광역시': 1470404,
             '대구광역시': 1229139,
             '대전광역시': 736599,
             '광주광역시': 734988}
female=pd.Series(female_tuple)
female

서울특별시    4988571
부산광역시    1735805
인천광역시    1470404
대구광역시    1229139
대전광역시     736599
광주광역시     734988
dtype: int64

In [65]:
korea_df= pd.DataFrame({'인구수':population,
                       '남자인구수':male,
                       '여자인구수':female})
korea_df

Unnamed: 0,인구수,남자인구수,여자인구수
서울특별시,9720846,4732275,4988571
부산광역시,3804423,1668618,1735805
인천광역시,2947217,1476813,1470404
대구광역시,2427954,1198815,1229139
대전광역시,1471040,734441,736599
광주광역시,1455048,720060,734988


In [66]:
korea_df['여자인구수']

서울특별시    4988571
부산광역시    1735805
인천광역시    1470404
대구광역시    1229139
대전광역시     736599
광주광역시     734988
Name: 여자인구수, dtype: int64

In [73]:
korea_df[0:3]
korea_df['서울특별시':'인천광역시']

Unnamed: 0,인구수,남자인구수,여자인구수
서울특별시,9720846,4732275,4988571
부산광역시,3804423,1668618,1735805
인천광역시,2947217,1476813,1470404


## Index 객체
* index: 일반적인 Index객체,Numpy 배열 형식으로 축의 이름 표현
* Int54Index: 정수 값을 위한 Index
* MultiIndex: 단일 축에 여러 단계 색인을 표현하는 계승적 Index객체
* DatetimeIndex: Numpy의 datetime64 타입으로 타임스탬프 저장
* PeriodIndex: 기간 데이터를 위한 Index

In [40]:
idx = pd.Index([2,4,6,8,10])
idx

Int64Index([2, 4, 6, 8, 10], dtype='int64')

In [41]:
idx[1]

4

In [42]:
idx[1:2:2]

Int64Index([4], dtype='int64')

In [43]:
idx[-1::]

Int64Index([10], dtype='int64')

In [82]:
idx[::2]

Int64Index([2, 6, 10], dtype='int64')

In [84]:
print(idx)
print(idx.shape)
print(idx.ndim)
print(idx.dtype)

Int64Index([2, 4, 6, 8, 10], dtype='int64')
(5,)
1
int64


### Index 연산
* append: 색인 추가
* difference: 차집합
* intersectino: 교집합
* union: 합집합
* isin: 색인이 존재하는지 여부
* delete: 해당 색인 삭제
* drop: 해당 값을 삭제
* insert: 색인이 추가된 새로운 색인 반환
* is_monotonic: 색인이 단조성을 가지면 true
* is_unique: 중복되는 색인이 없으면 True
* unique: 중복되는 것을 제거하고 unique한 것만 반환

In [101]:
idx1=pd.Index([1,2,4,6,8])
idx2=pd.Index([2,4,5,6,7])
print(idx1.append(idx2))
print(idx1.difference(idx2))
print(idx1-idx2)
print(idx1.intersection(idx2))
print(idx1 & idx2)
print(idx1.union(idx2))
print(idx1|idx2)
print(idx1.delete(0))
print(idx1.drop(1))
print(idx1^idx2) 

Int64Index([1, 2, 4, 6, 8, 2, 4, 5, 6, 7], dtype='int64')
Int64Index([1, 8], dtype='int64')
Int64Index([-1, -2, -1, 0, 1], dtype='int64')
Int64Index([2, 4, 6], dtype='int64')
Int64Index([2, 4, 6], dtype='int64')
Int64Index([1, 2, 4, 5, 6, 7, 8], dtype='int64')
Int64Index([1, 2, 4, 5, 6, 7, 8], dtype='int64')
Int64Index([2, 4, 6, 8], dtype='int64')
Int64Index([2, 4, 6, 8], dtype='int64')
Int64Index([1, 5, 7, 8], dtype='int64')


 # 인덱싱(indexing)

In [102]:
s= pd.Series([0, 0.25,0.5,0.75,1.0],
            index=['a','b','c','d','e'])
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [103]:
s.keys()

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [105]:
list(s.items())

[('a', 0.0), ('b', 0.25), ('c', 0.5), ('d', 0.75), ('e', 1.0)]

In [106]:
s['f']=1.25

In [110]:
s[(s>0.4)&(s<0.8)]

c    0.50
d    0.75
dtype: float64

## Series 인덱싱

In [111]:
s=pd.Series(['a','b','c','d','e'],index=[1,3,5,7,9])
s

1    a
3    b
5    c
7    d
9    e
dtype: object

In [112]:
s[1]

'a'

In [113]:
s[2:4]

5    c
7    d
dtype: object

In [114]:
s.iloc[1] # iloc은 정수값을 그대로 반환

'b'

In [115]:
s.iloc[2:4]

5    c
7    d
dtype: object

In [119]:
s.reindex(range(10))

0    NaN
1      a
2    NaN
3      b
4    NaN
5      c
6    NaN
7      d
8    NaN
9      e
dtype: object

In [120]:
s.reindex(range(10),method='bfill')

0    a
1    a
2    b
3    b
4    c
5    c
6    d
7    d
8    e
9    e
dtype: object

## DataFrame인덱싱
 * df[val]: 칼럼선택
 * df.loc[val]: 라벨값으로 로우의 부분집합 선택
 * df.loc[:, val]: 라벨값으로 칼럼의 부분집합 선택
 * df.loc[val,val]: 라벨값으로 로우와 칼럼의 부분집합 선택
 * df.loc[val]: 라벨값으로 로우의 부분집합 선택
 * df.iloc[:where]: 정수색인으로 칼럼의 부분집합 선택
 * df.iloc[where,where]: 정수색인으로 로우와 칼럼의 부분집합 선택
 * df.iloc[where]: 정수색인으로 로우의 부분집합 선택
 * df.at[label_i,label_j]:라벨값으로 단일값 선택
 * df.i[i,j]:정수 색인으로 단일값 선택
 * reindex: 하나 이상의 축을 새로운 색인으로 재색인
 * get_value,set_value: 로우와 칼럼의 이름으로 값 선택

In [121]:
korea_df

Unnamed: 0,인구수,남자인구수,여자인구수
서울특별시,9720846,4732275,4988571
부산광역시,3804423,1668618,1735805
인천광역시,2947217,1476813,1470404
대구광역시,2427954,1198815,1229139
대전광역시,1471040,734441,736599
광주광역시,1455048,720060,734988


In [122]:
korea_df['남자인구수']

서울특별시    4732275
부산광역시    1668618
인천광역시    1476813
대구광역시    1198815
대전광역시     734441
광주광역시     720060
Name: 남자인구수, dtype: int64

In [123]:
korea_df.남자인구수


서울특별시    4732275
부산광역시    1668618
인천광역시    1476813
대구광역시    1198815
대전광역시     734441
광주광역시     720060
Name: 남자인구수, dtype: int64

In [128]:
korea_df['남여비율']=(korea_df['남자인구수']*100/korea_df['여자인구수'])
korea_df

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9720846,4732275,4988571,94.862336
부산광역시,3804423,1668618,1735805,96.129346
인천광역시,2947217,1476813,1470404,100.435867
대구광역시,2427954,1198815,1229139,97.532907
대전광역시,1471040,734441,736599,99.707032
광주광역시,1455048,720060,734988,97.968946


In [129]:
korea_df.values

array([[9.72084600e+06, 4.73227500e+06, 4.98857100e+06, 9.48623363e+01],
       [3.80442300e+06, 1.66861800e+06, 1.73580500e+06, 9.61293463e+01],
       [2.94721700e+06, 1.47681300e+06, 1.47040400e+06, 1.00435867e+02],
       [2.42795400e+06, 1.19881500e+06, 1.22913900e+06, 9.75329072e+01],
       [1.47104000e+06, 7.34441000e+05, 7.36599000e+05, 9.97070319e+01],
       [1.45504800e+06, 7.20060000e+05, 7.34988000e+05, 9.79689464e+01]])

In [131]:
#행과 열 Transpose
korea_df.T

Unnamed: 0,서울특별시,부산광역시,인천광역시,대구광역시,대전광역시,광주광역시
인구수,9720846.0,3804423.0,2947217.0,2427954.0,1471040.0,1455048.0
남자인구수,4732275.0,1668618.0,1476813.0,1198815.0,734441.0,720060.0
여자인구수,4988571.0,1735805.0,1470404.0,1229139.0,736599.0,734988.0
남여비율,94.86234,96.12935,100.4359,97.53291,99.70703,97.96895


In [134]:
korea_df.values[0]

array([9.72084600e+06, 4.73227500e+06, 4.98857100e+06, 9.48623363e+01])

In [137]:
korea_df.loc[:"인천광역시",:"남자인구수"]

Unnamed: 0,인구수,남자인구수
서울특별시,9720846,4732275
부산광역시,3804423,1668618
인천광역시,2947217,1476813


In [141]:
korea_df.loc[(korea_df.여자인구수>1000000)]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9720846,4732275,4988571,94.862336
부산광역시,3804423,1668618,1735805,96.129346
인천광역시,2947217,1476813,1470404,100.435867
대구광역시,2427954,1198815,1229139,97.532907


In [143]:
korea_df.loc[(korea_df.인구수 > 2500000)]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9720846,4732275,4988571,94.862336
부산광역시,3804423,1668618,1735805,96.129346
인천광역시,2947217,1476813,1470404,100.435867


In [145]:
 korea_df.loc[(korea_df.남여비율>100)]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
인천광역시,2947217,1476813,1470404,100.435867


In [148]:
korea_df.loc[(korea_df.인구수>2500000)&(korea_df.남여비율<100)]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9720846,4732275,4988571,94.862336
부산광역시,3804423,1668618,1735805,96.129346


In [149]:
korea_df.iloc[:3,:2]

Unnamed: 0,인구수,남자인구수
서울특별시,9720846,4732275
부산광역시,3804423,1668618
인천광역시,2947217,1476813


## 다중 인덱싱(Multi indexing)
### 다중 인덱스 Series

In [151]:
idx_tuples = [('서울특별시',2010),('서울특별시',2020),
            ('부산광역시',2010),('부산광역시',2020),
            ('인천광역시',2010),('인천광역시',2020),
            ('대구광역시',2010),('대구광역시',2020),
            ('대전광역시',2010),('대전광역시',2020),
            ('광주광역시',2010),('광주광역시',2020)]
idx_tuples # multiindex인척

[('서울특별시', 2010),
 ('서울특별시', 2020),
 ('부산광역시', 2010),
 ('부산광역시', 2020),
 ('인천광역시', 2010),
 ('인천광역시', 2020),
 ('대구광역시', 2010),
 ('대구광역시', 2020),
 ('대전광역시', 2010),
 ('대전광역시', 2020),
 ('광주광역시', 2010),
 ('광주광역시', 2020)]

In [154]:
pop_tuples=[10322545,9720846,
           3567910, 3404423,
           2758296, 2947217,
           2511676, 2427954,
           1503664, 1471040,
           1454636, 1455048]
population=pd.Series(pop_tuples,index=idx_tuples)
population

(서울특별시, 2010)    10322545
(서울특별시, 2020)     9720846
(부산광역시, 2010)     3567910
(부산광역시, 2020)     3404423
(인천광역시, 2010)     2758296
(인천광역시, 2020)     2947217
(대구광역시, 2010)     2511676
(대구광역시, 2020)     2427954
(대전광역시, 2010)     1503664
(대전광역시, 2020)     1471040
(광주광역시, 2010)     1454636
(광주광역시, 2020)     1455048
dtype: int64

In [159]:
midx=pd.MultiIndex.from_tuples(idx_tuples)
midx #multiindex

MultiIndex([('서울특별시', 2010),
            ('서울특별시', 2020),
            ('부산광역시', 2010),
            ('부산광역시', 2020),
            ('인천광역시', 2010),
            ('인천광역시', 2020),
            ('대구광역시', 2010),
            ('대구광역시', 2020),
            ('대전광역시', 2010),
            ('대전광역시', 2020),
            ('광주광역시', 2010),
            ('광주광역시', 2020)],
           )

In [168]:
#Multi index Series
population=population.reindex(midx)
population


서울특별시  2010    10322545
       2020     9720846
부산광역시  2010     3567910
       2020     3404423
인천광역시  2010     2758296
       2020     2947217
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
광주광역시  2010     1454636
       2020     1455048
dtype: int64

In [162]:
population[:,2010]

서울특별시    10322545
부산광역시     3567910
인천광역시     2758296
대구광역시     2511676
대전광역시     1503664
광주광역시     1454636
dtype: int64

In [166]:
 population['대전광역시']
 population['대전광역시',:]    

2010    1503664
2020    1471040
dtype: int64

In [173]:
#multiindex로 만들어 놓은 Series를 DataFrame으로 바꾸는 함수
korea_mdf=population.unstack()
korea_mdf

Unnamed: 0,2010,2020
광주광역시,1454636,1455048
대구광역시,2511676,2427954
대전광역시,1503664,1471040
부산광역시,3567910,3404423
서울특별시,10322545,9720846
인천광역시,2758296,2947217


In [175]:
#DataFrame을 MultiIndex Series로 빠꿔 놓는 함수를 stack()
korea_mdf.stack()

광주광역시  2010     1454636
       2020     1455048
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
부산광역시  2010     3567910
       2020     3404423
서울특별시  2010    10322545
       2020     9720846
인천광역시  2010     2758296
       2020     2947217
dtype: int64

In [182]:
male_tuples=[5111259,4732275,
           1773170,16668618,
           1390356,1476813,
           1255245,1198815,
           7533648,734441,
           721780,720060]
male_tuples

[5111259,
 4732275,
 1773170,
 16668618,
 1390356,
 1476813,
 1255245,
 1198815,
 7533648,
 734441,
 721780,
 720060]

In [184]:
koreamdf=pd.DataFrame({'총인구수':population,
                     '남자인구수':male_tuples})
koreamdf

Unnamed: 0,Unnamed: 1,총인구수,남자인구수
서울특별시,2010,10322545,5111259
서울특별시,2020,9720846,4732275
부산광역시,2010,3567910,1773170
부산광역시,2020,3404423,16668618
인천광역시,2010,2758296,1390356
인천광역시,2020,2947217,1476813
대구광역시,2010,2511676,1255245
대구광역시,2020,2427954,1198815
대전광역시,2010,1503664,7533648
대전광역시,2020,1471040,734441


In [186]:
female_tuples = [5201286,7988571,
                1794740,1735805,
                1367940,1470404,
                1256431,1229139,
                750016,736599,
                732856,734988]
female_tuples

[5201286,
 7988571,
 1794740,
 1735805,
 1367940,
 1470404,
 1256431,
 1229139,
 750016,
 736599,
 732856,
 734988]

In [242]:
koreamdf=pd.DataFrame({'총인구수':population,
                     '남자인구수':male_tuples,
                      '여자인구수':female_tuples})
koreamdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수
행정구역,연도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
서울특별시,2010,10322545,5111259,5201286
서울특별시,2020,9720846,4732275,7988571
부산광역시,2010,3567910,1773170,1794740
부산광역시,2020,3404423,16668618,1735805
인천광역시,2010,2758296,1390356,1367940
인천광역시,2020,2947217,1476813,1470404
대구광역시,2010,2511676,1255245,1256431
대구광역시,2020,2427954,1198815,1229139
대전광역시,2010,1503664,7533648,750016
대전광역시,2020,1471040,734441,736599


In [243]:
ratio=koreamdf.남자인구수 *100 / koreamdf.여자인구수
koreamdf=pd.DataFrame({'총인구수':population,
                     '남자인구수':male_tuples,
                      '여자인구수':female_tuples,
                      "남녀비율":ratio})
koreamdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,연도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,10322545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,7988571,59.238066
부산광역시,2010,3567910,1773170,1794740,98.798155
부산광역시,2020,3404423,16668618,1735805,960.281714
인천광역시,2010,2758296,1390356,1367940,101.638668
인천광역시,2020,2947217,1476813,1470404,100.435867
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,7533648,750016,1004.464971
대전광역시,2020,1471040,734441,736599,99.707032


In [244]:
koreamdf.unstack()

Unnamed: 0_level_0,총인구수,총인구수,남자인구수,남자인구수,여자인구수,여자인구수,남녀비율,남녀비율
연도,2010,2020,2010,2020,2010,2020,2010,2020
행정구역,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
광주광역시,1454636,1455048,721780,720060,732856,734988,98.488653,97.968946
대구광역시,2511676,2427954,1255245,1198815,1256431,1229139,99.905606,97.532907
대전광역시,1503664,1471040,7533648,734441,750016,736599,1004.464971,99.707032
부산광역시,3567910,3404423,1773170,16668618,1794740,1735805,98.798155,960.281714
서울특별시,10322545,9720846,5111259,4732275,5201286,7988571,98.26914,59.238066
인천광역시,2758296,2947217,1390356,1476813,1367940,1470404,101.638668,100.435867


### 다중 인덱스 생성

In [198]:
df= pd.DataFrame(np.random.rand(6,3),
                 index=[['a','a','b','b','c','c'],[1,2,1,2,1,2]],
                 columns=['c1','c2','c3'])
df

Unnamed: 0,Unnamed: 1,c1,c2,c3
a,1,0.983625,0.636918,0.561288
a,2,0.796314,0.589304,0.098692
b,1,0.669046,0.88758,0.07005
b,2,0.062521,0.15705,0.353485
c,1,0.83034,0.110598,0.557569
c,2,0.126903,0.103747,0.118485


In [200]:
 pd.MultiIndex.from_arrays([['a','a','b','b','c','c'],[1,2,1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [202]:
pd.MultiIndex.from_tuples([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [204]:
pd.MultiIndex.from_product([['a','b','c'],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [205]:
pd.MultiIndex(levels=[['a','b','c'],[1,2]],
             codes=[[0,0,1,1,2,2],[0,1,0,1,0,1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [206]:
population

서울특별시  2010    10322545
       2020     9720846
부산광역시  2010     3567910
       2020     3404423
인천광역시  2010     2758296
       2020     2947217
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
광주광역시  2010     1454636
       2020     1455048
dtype: int64

In [208]:
population.index.names=["행정구역",'연도']
population

행정구역   연도  
서울특별시  2010    10322545
       2020     9720846
부산광역시  2010     3567910
       2020     3404423
인천광역시  2010     2758296
       2020     2947217
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
광주광역시  2010     1454636
       2020     1455048
dtype: int64

In [216]:
idx = pd.MultiIndex.from_product([['a','b','c'],[1,2]],
                                 names=['name1','name2'])
cols=pd.MultiIndex.from_product([['c1','c2','c3'],[1,2]],
                               names=['colname1','colname2'])
data= np.round(np.random.randn(6,6),2)
mdf=pd.DataFrame(data,index=idx,columns=cols)
mdf   

Unnamed: 0_level_0,colname1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,colname2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,-0.06,-1.56,-1.02,-0.75,0.05,1.12
a,2,1.02,0.53,0.32,-0.41,-0.13,-1.55
b,1,1.39,-0.88,0.53,0.02,-0.15,1.86
b,2,-1.27,0.03,-0.81,1.66,-0.05,0.6
c,1,0.89,-1.34,0.82,-0.95,-0.52,0.81
c,2,0.08,-2.15,-0.29,-0.13,0.85,-0.98


### 인덱싱 및 슬라이싱

In [217]:
population

행정구역   연도  
서울특별시  2010    10322545
       2020     9720846
부산광역시  2010     3567910
       2020     3404423
인천광역시  2010     2758296
       2020     2947217
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
광주광역시  2010     1454636
       2020     1455048
dtype: int64

In [219]:
population[:,2020]

행정구역
서울특별시    9720846
부산광역시    3404423
인천광역시    2947217
대구광역시    2427954
대전광역시    1471040
광주광역시    1455048
dtype: int64

In [220]:
population[population>3000000]

행정구역   연도  
서울특별시  2010    10322545
       2020     9720846
부산광역시  2010     3567910
       2020     3404423
dtype: int64

In [224]:
population[['대구광역시','대전광역시']]

행정구역   연도  
대구광역시  2010    2511676
       2020    2427954
대전광역시  2010    1503664
       2020    1471040
dtype: int64

In [225]:
mdf

Unnamed: 0_level_0,colname1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,colname2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,-0.06,-1.56,-1.02,-0.75,0.05,1.12
a,2,1.02,0.53,0.32,-0.41,-0.13,-1.55
b,1,1.39,-0.88,0.53,0.02,-0.15,1.86
b,2,-1.27,0.03,-0.81,1.66,-0.05,0.6
c,1,0.89,-1.34,0.82,-0.95,-0.52,0.81
c,2,0.08,-2.15,-0.29,-0.13,0.85,-0.98


In [226]:
mdf['c2',1]

name1  name2
a      1       -1.02
       2        0.32
b      1        0.53
       2       -0.81
c      1        0.82
       2       -0.29
Name: (c2, 1), dtype: float64

In [227]:
mdf.iloc[:3,:4]


Unnamed: 0_level_0,colname1,c1,c1,c2,c2
Unnamed: 0_level_1,colname2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,-0.06,-1.56,-1.02,-0.75
a,2,1.02,0.53,0.32,-0.41
b,1,1.39,-0.88,0.53,0.02


In [228]:
mdf.loc[:,('c2',1)]

name1  name2
a      1       -1.02
       2        0.32
b      1        0.53
       2       -0.81
c      1        0.82
       2       -0.29
Name: (c2, 1), dtype: float64

In [264]:
#슬라이스 객체를 사용해서 슬라이스한 것
idx_slice=pd.IndexSlice
mdf.loc[idx_slice[:,2],idx_slice[:,2]]

Unnamed: 0_level_0,colname1,c1,c2,c3
Unnamed: 0_level_1,colname2,2,2,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,2,0.53,-0.41,-1.55
b,2,0.03,1.66,0.6
c,2,-2.15,-0.13,-0.98


### 다중 인덱스 재정렬

In [245]:
# korea_mdf['서울특별시':'인천광역시'] #정렬이 안됐기 때문에 에러가 남
koreamdf=koreamdf.sort_index()
koreamdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,연도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1454636,721780,732856,98.488653
광주광역시,2020,1455048,720060,734988,97.968946
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,7533648,750016,1004.464971
대전광역시,2020,1471040,734441,736599,99.707032
부산광역시,2010,3567910,1773170,1794740,98.798155
부산광역시,2020,3404423,16668618,1735805,960.281714
서울특별시,2010,10322545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,7988571,59.238066


In [246]:
koreamdf['서울특별시':'인천광역시']

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,연도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,10322545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,7988571,59.238066
인천광역시,2010,2758296,1390356,1367940,101.638668
인천광역시,2020,2947217,1476813,1470404,100.435867


In [250]:
koreamdf.unstack(level=0)

Unnamed: 0_level_0,총인구수,총인구수,총인구수,총인구수,총인구수,총인구수,남자인구수,남자인구수,남자인구수,남자인구수,...,여자인구수,여자인구수,여자인구수,여자인구수,남녀비율,남녀비율,남녀비율,남녀비율,남녀비율,남녀비율
행정구역,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,인천광역시,광주광역시,대구광역시,대전광역시,부산광역시,...,대전광역시,부산광역시,서울특별시,인천광역시,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,인천광역시
연도,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010,1454636,2511676,1503664,3567910,10322545,2758296,721780,1255245,7533648,1773170,...,750016,1794740,5201286,1367940,98.488653,99.905606,1004.464971,98.798155,98.26914,101.638668
2020,1455048,2427954,1471040,3404423,9720846,2947217,720060,1198815,734441,16668618,...,736599,1735805,7988571,1470404,97.968946,97.532907,99.707032,960.281714,59.238066,100.435867


In [251]:
koreamdf.unstack(level=1)  

Unnamed: 0_level_0,총인구수,총인구수,남자인구수,남자인구수,여자인구수,여자인구수,남녀비율,남녀비율
연도,2010,2020,2010,2020,2010,2020,2010,2020
행정구역,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
광주광역시,1454636,1455048,721780,720060,732856,734988,98.488653,97.968946
대구광역시,2511676,2427954,1255245,1198815,1256431,1229139,99.905606,97.532907
대전광역시,1503664,1471040,7533648,734441,750016,736599,1004.464971,99.707032
부산광역시,3567910,3404423,1773170,16668618,1794740,1735805,98.798155,960.281714
서울특별시,10322545,9720846,5111259,4732275,5201286,7988571,98.26914,59.238066
인천광역시,2758296,2947217,1390356,1476813,1367940,1470404,101.638668,100.435867


In [252]:
koreamdf.stack()

행정구역   연도         
광주광역시  2010  총인구수     1.454636e+06
             남자인구수    7.217800e+05
             여자인구수    7.328560e+05
             남녀비율     9.848865e+01
       2020  총인구수     1.455048e+06
             남자인구수    7.200600e+05
             여자인구수    7.349880e+05
             남녀비율     9.796895e+01
대구광역시  2010  총인구수     2.511676e+06
             남자인구수    1.255245e+06
             여자인구수    1.256431e+06
             남녀비율     9.990561e+01
       2020  총인구수     2.427954e+06
             남자인구수    1.198815e+06
             여자인구수    1.229139e+06
             남녀비율     9.753291e+01
대전광역시  2010  총인구수     1.503664e+06
             남자인구수    7.533648e+06
             여자인구수    7.500160e+05
             남녀비율     1.004465e+03
       2020  총인구수     1.471040e+06
             남자인구수    7.344410e+05
             여자인구수    7.365990e+05
             남녀비율     9.970703e+01
부산광역시  2010  총인구수     3.567910e+06
             남자인구수    1.773170e+06
             여자인구수    1.794740e+06
             남녀비율     9.879815e+01
 

In [253]:
koreamdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,연도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1454636,721780,732856,98.488653
광주광역시,2020,1455048,720060,734988,97.968946
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,7533648,750016,1004.464971
대전광역시,2020,1471040,734441,736599,99.707032
부산광역시,2010,3567910,1773170,1794740,98.798155
부산광역시,2020,3404423,16668618,1735805,960.281714
서울특별시,2010,10322545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,7988571,59.238066


In [255]:
idx_flat=koreamdf.reset_index(level=0)
idx_flat

Unnamed: 0_level_0,행정구역,총인구수,남자인구수,여자인구수,남녀비율
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010,광주광역시,1454636,721780,732856,98.488653
2020,광주광역시,1455048,720060,734988,97.968946
2010,대구광역시,2511676,1255245,1256431,99.905606
2020,대구광역시,2427954,1198815,1229139,97.532907
2010,대전광역시,1503664,7533648,750016,1004.464971
2020,대전광역시,1471040,734441,736599,99.707032
2010,부산광역시,3567910,1773170,1794740,98.798155
2020,부산광역시,3404423,16668618,1735805,960.281714
2010,서울특별시,10322545,5111259,5201286,98.26914
2020,서울특별시,9720846,4732275,7988571,59.238066


In [258]:
idx_flat=koreamdf.reset_index(level=(0,1))
idx_flat

Unnamed: 0,행정구역,연도,총인구수,남자인구수,여자인구수,남녀비율
0,광주광역시,2010,1454636,721780,732856,98.488653
1,광주광역시,2020,1455048,720060,734988,97.968946
2,대구광역시,2010,2511676,1255245,1256431,99.905606
3,대구광역시,2020,2427954,1198815,1229139,97.532907
4,대전광역시,2010,1503664,7533648,750016,1004.464971
5,대전광역시,2020,1471040,734441,736599,99.707032
6,부산광역시,2010,3567910,1773170,1794740,98.798155
7,부산광역시,2020,3404423,16668618,1735805,960.281714
8,서울특별시,2010,10322545,5111259,5201286,98.26914
9,서울특별시,2020,9720846,4732275,7988571,59.238066


In [263]:
idx_flat.set_index('행정구역','년도')

Unnamed: 0_level_0,연도,총인구수,남자인구수,여자인구수,남녀비율
행정구역,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1454636,721780,732856,98.488653
광주광역시,2020,1455048,720060,734988,97.968946
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,7533648,750016,1004.464971
대전광역시,2020,1471040,734441,736599,99.707032
부산광역시,2010,3567910,1773170,1794740,98.798155
부산광역시,2020,3404423,16668618,1735805,960.281714
서울특별시,2010,10322545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,7988571,59.238066


# 데이터 연산

In [268]:
s=pd.Series(np.random.randint(0,10,5))
s

0    2
1    0
2    2
3    7
4    3
dtype: int32

In [266]:
df= pd.DataFrame(np.random.randint(0,10,(3,3)),
                 columns=['A','B','C'])
df

Unnamed: 0,A,B,C
0,8,5,1
1,1,4,1
2,2,2,9


In [269]:
np.exp(s)

0       7.389056
1       1.000000
2       7.389056
3    1096.633158
4      20.085537
dtype: float64

In [271]:
np.cos(df * np.pi / 4)

Unnamed: 0,A,B,C
0,1.0,-0.7071068,0.707107
1,0.7071068,-1.0,0.707107
2,6.123234000000001e-17,6.123234000000001e-17,0.707107


In [276]:
s1=pd.Series([1,3,5,7,9],index=[0,1,2,3,4])
s2=pd.Series([2,4,6,8,10],index=[1,2,3,4,5])
s1 + s2

0     NaN
1     5.0
2     9.0
3    13.0
4    17.0
5     NaN
dtype: float64

In [278]:
s1.add(s2,fill_value=0)

0     1.0
1     5.0
2     9.0
3    13.0
4    17.0
5    10.0
dtype: float64

In [7]:
df1=pd.DataFrame(np.random.randint(0,20,(3,3)),
                columns=list('ACD'))
df1

Unnamed: 0,A,C,D
0,17,2,8
1,2,14,1
2,19,3,17


In [8]:
df2= pd.DataFrame(np.random.randint(0,20,(5,5)),
                 columns=list('BAECD'))
df2

Unnamed: 0,B,A,E,C,D
0,11,15,8,1,2
1,15,2,19,3,3
2,14,5,13,17,10
3,19,11,5,3,2
4,16,14,17,1,17


In [9]:
df1+df2

Unnamed: 0,A,B,C,D,E
0,32.0,,3.0,10.0,
1,4.0,,17.0,4.0,
2,24.0,,20.0,27.0,
3,,,,,
4,,,,,


In [14]:
fvalue = df1.stack().mean()
print(fvalue)
df1.add(df2,fill_value=fvalue)

9.222222222222221


Unnamed: 0,A,B,C,D,E
0,32.0,20.222222,3.0,10.0,17.222222
1,4.0,24.222222,17.0,4.0,28.222222
2,24.0,23.222222,20.0,27.0,22.222222
3,20.222222,28.222222,12.222222,11.222222,14.222222
4,23.222222,25.222222,10.222222,26.222222,26.222222


## 연산자 범용 함수

In [33]:
#add()
a=np.random.randint(1,10,(3,3))
a

array([[2, 6, 4],
       [4, 4, 7],
       [4, 5, 9]])

In [34]:
a+a[0]

array([[ 4, 12,  8],
       [ 6, 10, 11],
       [ 6, 11, 13]])

In [35]:
df=pd.DataFrame(a,columns=list('abc'))
df

Unnamed: 0,a,b,c
0,2,6,4
1,4,4,7
2,4,5,9


In [36]:
df+df.iloc[0]

Unnamed: 0,a,b,c
0,4,12,8
1,6,10,11
2,6,11,13


In [37]:
df.add(df.iloc[0])

Unnamed: 0,a,b,c
0,4,12,8
1,6,10,11
2,6,11,13


In [38]:
 #sub()/subtract()
a

array([[2, 6, 4],
       [4, 4, 7],
       [4, 5, 9]])

In [39]:
a-a[0]

array([[ 0,  0,  0],
       [ 2, -2,  3],
       [ 2, -1,  5]])

In [41]:
df-df.iloc[0]

Unnamed: 0,a,b,c
0,0,0,0
1,2,-2,3
2,2,-1,5


In [42]:
df.sub(df.iloc[0])

Unnamed: 0,a,b,c
0,0,0,0
1,2,-2,3
2,2,-1,5


In [43]:
df.subtract(df['b'],axis=0)

Unnamed: 0,a,b,c
0,-4,0,-2
1,0,0,3
2,-1,0,4


In [44]:
#mul()/multiply()
a

array([[2, 6, 4],
       [4, 4, 7],
       [4, 5, 9]])

In [46]:
a*a[1]

array([[ 8, 24, 28],
       [16, 16, 49],
       [16, 20, 63]])

In [47]:
df*df.iloc[1]

Unnamed: 0,a,b,c
0,8,24,28
1,16,16,49
2,16,20,63


In [50]:
df.mul(df.iloc[1])
df.multiply(df.iloc[1])

Unnamed: 0,a,b,c
0,8,24,28
1,16,16,49
2,16,20,63


In [51]:
#truediv()/dv()/divide()/floordiv()
a

array([[2, 6, 4],
       [4, 4, 7],
       [4, 5, 9]])

In [52]:
a/a[0]

array([[1.        , 1.        , 1.        ],
       [2.        , 0.66666667, 1.75      ],
       [2.        , 0.83333333, 2.25      ]])

In [53]:
df/df.iloc[0]

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,2.0,0.666667,1.75
2,2.0,0.833333,2.25


In [56]:
df.truediv(df.iloc[0])
df.div(df.iloc[0])
df.divide(df.iloc[0])

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,2.0,0.666667,1.75
2,2.0,0.833333,2.25


In [57]:
a//a[0]

array([[1, 1, 1],
       [2, 0, 1],
       [2, 0, 2]], dtype=int32)

In [59]:
df.floordiv(df.iloc[0])

Unnamed: 0,a,b,c
0,1,1,1
1,2,0,1
2,2,0,2


In [60]:
#mod()
a

array([[2, 6, 4],
       [4, 4, 7],
       [4, 5, 9]])

In [61]:
a%a[0]

array([[0, 0, 0],
       [0, 4, 3],
       [0, 5, 1]], dtype=int32)

In [64]:
df%df.iloc[0]
df.mod(df.iloc[0])

Unnamed: 0,a,b,c
0,0,0,0
1,0,4,3
2,0,5,1


In [67]:
#pow()
a

array([[2, 6, 4],
       [4, 4, 7],
       [4, 5, 9]])

In [68]:
a**a[0]

array([[    4, 46656,   256],
       [   16,  4096,  2401],
       [   16, 15625,  6561]], dtype=int32)

In [69]:
df**df.iloc[0]

Unnamed: 0,a,b,c
0,4,46656,256
1,16,4096,2401
2,16,15625,6561


In [71]:
df.pow(df.iloc[0])

Unnamed: 0,a,b,c
0,4,46656,256
1,16,4096,2401
2,16,15625,6561


In [75]:
row=df.iloc[0,::2]
row

a    2
c    4
Name: 0, dtype: int32

In [80]:
print(df)
df-row

   a  b  c
0  2  6  4
1  4  4  7
2  4  5  9


Unnamed: 0,a,b,c
0,0.0,,0.0
1,2.0,,3.0
2,2.0,,5.0


## 정렬
* (Sort_index(by=[기준칼럼,기준칼럼,]))
* (Sort_values(by=[기준칼럼,기준칼럼,]))

In [84]:
s=pd.Series(range(5),index=['a','d','b','c','e'])
s

a    0
d    1
b    2
c    3
e    4
dtype: int64

In [85]:
s.sort_index()

a    0
b    2
c    3
d    1
e    4
dtype: int64

In [88]:
s.sort_values()

a    0
d    1
b    2
c    3
e    4
dtype: int64

In [95]:
df=pd.DataFrame(np.random.randint(0,10,(4,4)),
               index=[2,4,1,3],
               columns=list('bdac'))
df

Unnamed: 0,b,d,a,c
2,3,7,8,4
4,6,4,6,8
1,8,2,4,4
3,8,9,9,0


In [99]:
df.sort_index()

Unnamed: 0,b,d,a,c
1,8,2,4,4
2,3,7,8,4
3,8,9,9,0
4,6,4,6,8


In [97]:
df.sort_values(by='a')

Unnamed: 0,b,d,a,c
1,8,2,4,4
4,6,4,6,8
2,3,7,8,4
3,8,9,9,0


In [98]:
df.sort_values(by=['a','c'])

Unnamed: 0,b,d,a,c
1,8,2,4,4
4,6,4,6,8
2,3,7,8,4
3,8,9,9,0


In [100]:
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
2,8,3,4,7
4,6,6,8,4
1,4,8,4,2
3,9,8,0,9


## 순위
* rank(method)는 인덱스 값을 반환함 
* average: 같은 값 가지는 항목의 평균
* max: 같은 값을 가지면 더 큰 순위로
* min: 낮은 순위로
* first : 같은 값이더라도 같은 순위 적용하지 않음 선착순 느낌
* dense

In [103]:
s=pd.Series([-2,4,7,3,0,7,5,-4,2,6])
s

0   -2
1    4
2    7
3    3
4    0
5    7
6    5
7   -4
8    2
9    6
dtype: int64

In [104]:
s.rank()

0    2.0
1    6.0
2    9.5
3    5.0
4    3.0
5    9.5
6    7.0
7    1.0
8    4.0
9    8.0
dtype: float64

In [105]:
s.rank(method='first')

0     2.0
1     6.0
2     9.0
3     5.0
4     3.0
5    10.0
6     7.0
7     1.0
8     4.0
9     8.0
dtype: float64

In [106]:
s.rank(method='max')

0     2.0
1     6.0
2    10.0
3     5.0
4     3.0
5    10.0
6     7.0
7     1.0
8     4.0
9     8.0
dtype: float64

## 고성능 연산
* pd.eval('연산')
* df.eval('연산')
* df.eval('x=연산'inplace=True): df에 연산값을 x column에 저장함

In [123]:
nrows, ncols = 10000, 100
df1, df2, df3, df4=(pd.DataFrame(np.random.rand(nrows,ncols))for i in range(4))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.649812,0.766744,0.199989,0.943117,0.490424,0.299733,0.750333,0.564656,0.779665,0.971011,...,0.665266,0.204280,0.203838,0.068076,0.265712,0.462857,0.499776,0.696607,0.477122,0.252438
1,0.342473,0.488939,0.385132,0.385291,0.505423,0.145795,0.156802,0.105122,0.467000,0.024561,...,0.129599,0.337661,0.394625,0.987408,0.011017,0.469926,0.940958,0.379821,0.589382,0.132758
2,0.315136,0.433613,0.237377,0.517319,0.549055,0.523768,0.572985,0.874950,0.956940,0.492165,...,0.613035,0.227818,0.315845,0.445636,0.238960,0.903907,0.965349,0.376359,0.879497,0.650985
3,0.539887,0.519381,0.132407,0.426150,0.490373,0.218723,0.247630,0.519124,0.031347,0.863382,...,0.414224,0.955653,0.537235,0.815059,0.477948,0.129323,0.907087,0.017576,0.630093,0.785345
4,0.005133,0.236678,0.234801,0.296795,0.786472,0.915470,0.880109,0.845770,0.237506,0.551621,...,0.131219,0.844952,0.974848,0.432688,0.665188,0.604447,0.638068,0.079907,0.728874,0.268560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.437824,0.117433,0.380701,0.257050,0.781791,0.005999,0.158560,0.799684,0.985152,0.577687,...,0.182694,0.716150,0.996160,0.234204,0.618322,0.717602,0.487186,0.628347,0.599250,0.047606
9996,0.469677,0.038177,0.055573,0.916015,0.466375,0.202118,0.713833,0.452262,0.517644,0.647468,...,0.649148,0.882677,0.227002,0.850812,0.613663,0.503303,0.576474,0.210671,0.511410,0.674108
9997,0.396649,0.026883,0.068747,0.498410,0.358163,0.935089,0.258717,0.201098,0.007706,0.393487,...,0.406033,0.000257,0.875016,0.586605,0.491582,0.538632,0.063613,0.264928,0.825971,0.681063
9998,0.411809,0.436789,0.870136,0.092924,0.001426,0.894260,0.442785,0.982807,0.216222,0.993971,...,0.976810,0.878163,0.767374,0.376571,0.193417,0.996962,0.225098,0.050275,0.935870,0.534449


In [113]:
%timeit df1+df2+df3+df4

20.7 ms ± 3.84 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [115]:
#eval은 str으로 쓰여진 연산을 수행
%timeit pd.eval('df1+df2+df3+df4')

16.1 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [116]:
%timeit df1 * -df2 / (-df3*df4)

35.5 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [118]:
%timeit pd.eval('df1 * -df2 / (-df3*df4)')

17.6 ms ± 2.38 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [126]:
%timeit (df1<df2) & (df2<=df3) & (df3!=df4)

12 ms ± 789 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [124]:
%timeit pd.eval('(df1<df2) & (df2<=df3) & (df3!=df4)')

20 ms ± 4.14 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [130]:
df= pd.DataFrame(np.random.rand(1000000,5),
                 columns=list('abcde'))
df.head()

Unnamed: 0,a,b,c,d,e
0,0.198111,0.951826,0.531065,0.658998,0.296984
1,0.722597,0.572456,0.744927,0.732162,0.620877
2,0.325977,0.254924,0.569264,0.542011,0.883632
3,0.995046,0.2104,0.958339,0.802261,0.828907
4,0.819888,0.405983,0.998655,0.062184,0.237615


In [132]:
%timeit df['a'] + df['b']/df['c'] -df['d']*df['e']

35 ms ± 3.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [135]:
%timeit pd.eval('df.a + df.b/df.c -df.d*df.e')

11.4 ms ± 879 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [137]:
%timeit df.eval('a + b/c -d*e')

24.6 ms ± 3.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [143]:
#inplace
df.eval('R=a + b/c -d*e',inplace=True)
df.head()

Unnamed: 0,a,b,c,d,e,R
0,0.198111,0.951826,0.531065,0.658998,0.296984,1.794694
1,0.722597,0.572456,0.744927,0.732162,0.620877,1.036486
2,0.325977,0.254924,0.569264,0.542011,0.883632,0.294852
3,0.995046,0.2104,0.958339,0.802261,0.828907,0.549592
4,0.819888,0.405983,0.998655,0.062184,0.237615,1.211642


In [145]:
#값을 바꾸고 싶으면 같게 해서 덮으면 됨
df.eval('R=a - b/c -d*e',inplace=True)
df.head()

Unnamed: 0,a,b,c,d,e,R
0,0.198111,0.951826,0.531065,0.658998,0.296984,-1.789896
1,0.722597,0.572456,0.744927,0.732162,0.620877,-0.500458
2,0.325977,0.254924,0.569264,0.542011,0.883632,-0.600775
3,0.995046,0.2104,0.958339,0.802261,0.828907,0.110499
4,0.819888,0.405983,0.998655,0.062184,0.237615,0.398583


In [146]:
col_mean=df.mean(1)
df['a']+col_mean

0         0.339292
1         1.204690
2         0.655149
3         1.645954
4         1.307040
            ...   
999995    0.284297
999996    0.359278
999997    0.291068
999998    1.101588
999999   -0.015659
Length: 1000000, dtype: float64

In [147]:
#@로 외부의 값을 가져와서 연산할 수 있음
df.eval('a+@col_mean')

0         0.339292
1         1.204690
2         0.655149
3         1.645954
4         1.307040
            ...   
999995    0.284297
999996    0.359278
999997    0.291068
999998    1.101588
999999   -0.015659
Length: 1000000, dtype: float64

In [148]:
pd.eval('df[(df.a<0.5)&(df.b<0.5)&(df.c>0.5)]')

Unnamed: 0,a,b,c,d,e,R
2,0.325977,0.254924,0.569264,0.542011,0.883632,-0.600775
12,0.428479,0.309948,0.555078,0.431846,0.683280,-0.424979
14,0.115806,0.082621,0.805837,0.004698,0.769015,0.009664
24,0.013814,0.388143,0.585780,0.102411,0.564426,-0.706598
28,0.355580,0.345859,0.757724,0.957115,0.657778,-0.730434
...,...,...,...,...,...,...
999982,0.297997,0.201596,0.993361,0.155604,0.536530,0.011568
999986,0.340810,0.081188,0.540641,0.836821,0.260836,-0.027633
999987,0.465632,0.259551,0.795920,0.524096,0.370511,-0.054654
999991,0.171390,0.344302,0.997597,0.901098,0.637178,-0.747902


In [154]:
#df.query:조건에 해당하는 값을 뽑아낼때
df.query('(a<0.5)&(b<0.5)&(c>0.5)')

Unnamed: 0,a,b,c,d,e,R
2,0.325977,0.254924,0.569264,0.542011,0.883632,-0.600775
12,0.428479,0.309948,0.555078,0.431846,0.683280,-0.424979
14,0.115806,0.082621,0.805837,0.004698,0.769015,0.009664
24,0.013814,0.388143,0.585780,0.102411,0.564426,-0.706598
28,0.355580,0.345859,0.757724,0.957115,0.657778,-0.730434
...,...,...,...,...,...,...
999982,0.297997,0.201596,0.993361,0.155604,0.536530,0.011568
999986,0.340810,0.081188,0.540641,0.836821,0.260836,-0.027633
999987,0.465632,0.259551,0.795920,0.524096,0.370511,-0.054654
999991,0.171390,0.344302,0.997597,0.901098,0.637178,-0.747902


In [155]:
col_mean=df['d'].mean()
df[(df.a<col_mean)&(df.b<col_mean)]

Unnamed: 0,a,b,c,d,e,R
2,0.325977,0.254924,0.569264,0.542011,0.883632,-0.600775
9,0.224218,0.443811,0.243018,0.484768,0.675088,-1.929292
12,0.428479,0.309948,0.555078,0.431846,0.683280,-0.424979
14,0.115806,0.082621,0.805837,0.004698,0.769015,0.009664
17,0.469225,0.116377,0.019027,0.077234,0.487385,-5.684940
...,...,...,...,...,...,...
999987,0.465632,0.259551,0.795920,0.524096,0.370511,-0.054654
999991,0.171390,0.344302,0.997597,0.901098,0.637178,-0.747902
999992,0.326189,0.375170,0.721607,0.011025,0.820270,-0.202764
999996,0.292683,0.118584,0.132623,0.433343,0.041993,-0.619659


In [160]:
df.query('a<@col_mean & b<@col_mean')

Unnamed: 0,a,b,c,d,e,R
2,0.325977,0.254924,0.569264,0.542011,0.883632,-0.600775
9,0.224218,0.443811,0.243018,0.484768,0.675088,-1.929292
12,0.428479,0.309948,0.555078,0.431846,0.683280,-0.424979
14,0.115806,0.082621,0.805837,0.004698,0.769015,0.009664
17,0.469225,0.116377,0.019027,0.077234,0.487385,-5.684940
...,...,...,...,...,...,...
999987,0.465632,0.259551,0.795920,0.524096,0.370511,-0.054654
999991,0.171390,0.344302,0.997597,0.901098,0.637178,-0.747902
999992,0.326189,0.375170,0.721607,0.011025,0.820270,-0.202764
999996,0.292683,0.118584,0.132623,0.433343,0.041993,-0.619659


# 데이터 결합
## Concat([])/Append()

In [162]:
s1=pd.Series(['a','b'],index=[1,2])
s2=pd.Series(['c','d'],index=[3,4])
pd.concat([s1,s2])

1    a
2    b
3    c
4    d
dtype: object

In [166]:
def create_df(cols,idx):
    data = {c: [str(c.lower()) + str(i) for i in idx] for c in cols}
    return pd.DataFrame(data,idx)

In [170]:
df1=create_df('AB',[1,2])
df2=create_df('AB',[3,4])


In [171]:
pd.concat([df1,df2])

Unnamed: 0,A,B
1,a1,b1
2,a2,b2
3,a3,b3
4,a4,b4


In [172]:
df3=create_df('AB',[1,2])
df4=create_df('CD',[1,2])


In [186]:
pd.concat([df3,df4])

Unnamed: 0,A,B,C,D
1,a1,b1,,
2,a2,b2,,
1,,,c1,d1
2,,,c2,d2


In [187]:
pd.concat([df3,df4],axis=1)

Unnamed: 0,A,B,C,D
1,a1,b1,c1,d1
2,a2,b2,c2,d2


In [174]:
#pd.concat([df1,df3], verify_intefrity=True)
#오버랩이 되기 때문에 오류가 뜸

TypeError: concat() got an unexpected keyword argument 'verify_intefrity'

In [175]:
pd.concat([df1,df3],ignore_index=True)

Unnamed: 0,A,B
0,a1,b1
1,a2,b2
2,a1,b1
3,a2,b2


In [188]:
pd.concat([df1,df3],keys=['X','Y'])

Unnamed: 0,A,B
1,a1,b1
2,a2,b2
1,a1,b1
2,a2,b2


In [181]:
df5=create_df('ABC',[1,2])
df6=create_df('BCD',[3,4])
pd.concat([df5,df6],join='inner' ) # 둘 다 존재하는 데이터만 

Unnamed: 0,B,C
1,b1,c1
2,b2,c2
3,b3,c3
4,b4,c4


In [184]:
df5.append(df6)

Unnamed: 0,A,B,C,D
1,a1,b1,c1,
2,a2,b2,c2,
3,,b3,c3,d3
4,,b4,c4,d4


## 병합과 조인

In [190]:
df1=pd.DataFrame({'학생':['홍길동','이순신','임꺽정','김유신'],
                '학과':['경영학과','교육학과','컴퓨터학과','통계학과']})
df1

Unnamed: 0,학생,학과
0,홍길동,경영학과
1,이순신,교육학과
2,임꺽정,컴퓨터학과
3,김유신,통계학과


In [191]:
df2=pd.DataFrame({'학생':['홍길동','이순신','임꺽정','김유신'],
                 '입학년도':[2012,2016,2019,2020]})
df2

Unnamed: 0,학생,입학년도
0,홍길동,2012
1,이순신,2016
2,임꺽정,2019
3,김유신,2020


In [193]:
df3=pd.merge(df1,df2)
df3

Unnamed: 0,학생,학과,입학년도
0,홍길동,경영학과,2012
1,이순신,교육학과,2016
2,임꺽정,컴퓨터학과,2019
3,김유신,통계학과,2020


In [196]:
df4=pd.DataFrame({'학과':['경영학과','교육학과','컴퓨터학과','통계학과'],
                 '학과장':['황희','장영실','안창호','정약용']})
df4

Unnamed: 0,학과,학과장
0,경영학과,황희
1,교육학과,장영실
2,컴퓨터학과,안창호
3,통계학과,정약용


In [197]:
pd.merge(df3,df4)

Unnamed: 0,학생,학과,입학년도,학과장
0,홍길동,경영학과,2012,황희
1,이순신,교육학과,2016,장영실
2,임꺽정,컴퓨터학과,2019,안창호
3,김유신,통계학과,2020,정약용


In [199]:
df5=pd.DataFrame({'학과':['경영학과','교육학과','교육학과','컴퓨터학과','컴퓨터학과','통계학과'],
                '과목':['경영개론','기초수학','물리학','프로그래밍','운영체제','확률론']})
df5

Unnamed: 0,학과,과목
0,경영학과,경영개론
1,교육학과,기초수학
2,교육학과,물리학
3,컴퓨터학과,프로그래밍
4,컴퓨터학과,운영체제
5,통계학과,확률론


In [200]:
pd.merge(df1,df5)
# 학과를 기준으로 만들다 보니 학생이 중복이 되어버림

Unnamed: 0,학생,학과,과목
0,홍길동,경영학과,경영개론
1,이순신,교육학과,기초수학
2,이순신,교육학과,물리학
3,임꺽정,컴퓨터학과,프로그래밍
4,임꺽정,컴퓨터학과,운영체제
5,김유신,통계학과,확률론


In [201]:
pd.merge(df1,df2,on='학생') #r기준할 것을 on

Unnamed: 0,학생,학과,입학년도
0,홍길동,경영학과,2012
1,이순신,교육학과,2016
2,임꺽정,컴퓨터학과,2019
3,김유신,통계학과,2020


In [203]:
df6=pd.DataFrame({'이름':['홍길동','이순신','임꺽정','김유신'],
                '성적':['a','a+','b','a+']})
df6

Unnamed: 0,이름,성적
0,홍길동,a
1,이순신,a+
2,임꺽정,b
3,김유신,a+


In [216]:
pd.merge(df1,df6,left_on="학생",right_on="이름")

Unnamed: 0,학생,학과,이름,성적
0,홍길동,경영학과,홍길동,a
1,이순신,교육학과,이순신,a+
2,임꺽정,컴퓨터학과,임꺽정,b
3,김유신,통계학과,김유신,a+


In [207]:
pd.merge(df1,df6,left_on="학생",right_on="이름").drop("이름",axis=1)

Unnamed: 0,학생,학과,성적
0,홍길동,경영학과,a
1,이순신,교육학과,a+
2,임꺽정,컴퓨터학과,b
3,김유신,통계학과,a+


In [209]:
mdf1=df1.set_index('학생')
mdf2=df2.set_index('학생')
mdf1

Unnamed: 0_level_0,학과
학생,Unnamed: 1_level_1
홍길동,경영학과
이순신,교육학과
임꺽정,컴퓨터학과
김유신,통계학과


In [210]:
mdf2

Unnamed: 0_level_0,입학년도
학생,Unnamed: 1_level_1
홍길동,2012
이순신,2016
임꺽정,2019
김유신,2020


In [218]:
#left_index, right_index 인덱스 기준으로 df 합치기
pd.merge(mdf1,mdf2,left_index=True,right_index=True)

Unnamed: 0_level_0,학과,입학년도
학생,Unnamed: 1_level_1,Unnamed: 2_level_1
홍길동,경영학과,2012
이순신,교육학과,2016
임꺽정,컴퓨터학과,2019
김유신,통계학과,2020


In [212]:
mdf1.join(mdf2)

Unnamed: 0_level_0,학과,입학년도
학생,Unnamed: 1_level_1,Unnamed: 2_level_1
홍길동,경영학과,2012
이순신,교육학과,2016
임꺽정,컴퓨터학과,2019
김유신,통계학과,2020


In [213]:
pd.merge(mdf1,df6,left_index=True,right_on='이름')

Unnamed: 0,학과,이름,성적
0,경영학과,홍길동,a
1,교육학과,이순신,a+
2,컴퓨터학과,임꺽정,b
3,통계학과,김유신,a+


In [219]:
df7=pd.DataFrame({'이름': ['홍길동','이순신','임꺽정'],
                 '주문음식': ['햄버거','피자','짜장면']})
df7

Unnamed: 0,이름,주문음식
0,홍길동,햄버거
1,이순신,피자
2,임꺽정,짜장면


In [221]:
df8=pd.DataFrame({'이름': ['홍길동','이순신','김유신'],
                 '주문음료': ['콜라','사이다','커피']})
df8

Unnamed: 0,이름,주문음료
0,홍길동,콜라
1,이순신,사이다
2,김유신,커피


In [222]:
pd.merge(df7,df8)

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다


In [223]:
pd.merge(df7,df8,how='outer')

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다
2,임꺽정,짜장면,
3,김유신,,커피


In [227]:
pd.merge(df7,df8,how='left')

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다
2,임꺽정,짜장면,


In [229]:
pd.merge(df7,df8,how='right')

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다
2,김유신,,커피


In [232]:
df9=pd.DataFrame({'이름':['홍길동','이순신','임꺽정','김유신'],
                 '순위': [3,2,4,1]})
df9

Unnamed: 0,이름,순위
0,홍길동,3
1,이순신,2
2,임꺽정,4
3,김유신,1


In [233]:
df10=pd.DataFrame({'이름':['홍길동','이순신','임꺽정','김유신'],
                 '순위': [4,1,3,2]})
df10

Unnamed: 0,이름,순위
0,홍길동,4
1,이순신,1
2,임꺽정,3
3,김유신,2


In [234]:
pd.merge(df9,df10,on='이름')

Unnamed: 0,이름,순위_x,순위_y
0,홍길동,3,4
1,이순신,2,1
2,임꺽정,4,3
3,김유신,1,2


In [235]:
#suffixes=['col_name1','col_name2']
pd.merge(df9,df10,on='이름',suffixes=["_인기","_성적"])

Unnamed: 0,이름,순위_인기,순위_성적
0,홍길동,3,4
1,이순신,2,1
2,임꺽정,4,3
3,김유신,1,2


# 데이터 집계와 그룹 연산
## 집계 연산(Aggregation)
* count(): 전체 개수
* head, tail()
* describe(): Series, DataFrame에 대한 요약 통계
* min/max
* cumin/cumax: 누적 최대값/최소값
* argmin/argmax(): 최소값과 최대값의 index위치
* mean/median/std/var
* skew: 외도
* kurt: 첨도
* mad: Mean Absolute Deviation (절대 평균 편차)
* sum,cumsum
* prod, cumprod
* quantile: 0부터 1까지의 분위수
* diff: 1차 산술차 계산
* pct_change:퍼센트 변화율 계산
* corr,cov


In [11]:
df= pd.DataFrame([[1,1.2, np.nan],
                 [2.4, 5.5, 4.2],
                 [np.nan,np.nan,np.nan],
                 [0.44,-3.1,-4.1]],
                index=[1,2,3,4],
                columns=list("ABC"))
df

Unnamed: 0,A,B,C
1,1.0,1.2,
2,2.4,5.5,4.2
3,,,
4,0.44,-3.1,-4.1


In [13]:
df.head(2)

Unnamed: 0,A,B,C
1,1.0,1.2,
2,2.4,5.5,4.2


In [14]:
df.tail(2)

Unnamed: 0,A,B,C
3,,,
4,0.44,-3.1,-4.1


In [15]:
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,2.0
mean,1.28,1.2,0.05
std,1.009554,4.3,5.868986
min,0.44,-3.1,-4.1
25%,0.72,-0.95,-2.025
50%,1.0,1.2,0.05
75%,1.7,3.35,2.125
max,2.4,5.5,4.2


In [51]:
print(df)
print(np.argmax(df),np.argmin(df))

      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
2 2


In [22]:
print(df)
print(df.idxmin())
print(df.idxmax())


      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    4
B    4
C    4
dtype: int64
A    2
B    2
C    2
dtype: int64


In [52]:
print(df)
print(df.std())
print(df.var())


      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    1.009554
B    4.300000
C    5.868986
dtype: float64
A     1.0192
B    18.4900
C    34.4450
dtype: float64


In [53]:
print(df)
print(df.skew())
print(df.kurt())


      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    1.15207
B    0.00000
C        NaN
dtype: float64
A   NaN
B   NaN
C   NaN
dtype: float64


In [54]:
print(df)
print(df.sum())
print(df.cumsum())


      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    3.84
B    3.60
C    0.10
dtype: float64
      A    B    C
1  1.00  1.2  NaN
2  3.40  6.7  4.2
3   NaN  NaN  NaN
4  3.84  3.6  0.1


In [55]:
print(df)
print(df.prod())
print(df.cumprod())


      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A     1.056
B   -20.460
C   -17.220
dtype: float64
       A      B      C
1  1.000   1.20    NaN
2  2.400   6.60   4.20
3    NaN    NaN    NaN
4  1.056 -20.46 -17.22


In [56]:
df.diff()

Unnamed: 0,A,B,C
1,,,
2,1.4,4.3,
3,,,
4,,,


In [57]:
df.quantile()

A    1.00
B    1.20
C    0.05
Name: 0.5, dtype: float64

In [58]:
df.pct_change()

Unnamed: 0,A,B,C
1,,,
2,1.4,3.583333,
3,0.0,0.0,0.0
4,-0.816667,-1.563636,-1.97619


In [69]:
df.corrwith(df.B)
df.corr()
df.cov()


Unnamed: 0,A,B,C
A,1.0192,4.214,8.134
B,4.214,18.49,35.69
C,8.134,35.69,34.445


In [70]:
df['B'].unique()

array([ 1.2,  5.5,  nan, -3.1])

In [72]:
df['A'].value_counts()

0.44    1
2.40    1
1.00    1
Name: A, dtype: int64

## GroupBy 연산

In [74]:
df=pd.DataFrame({"c1":['a','a','b','b','c','d','b'],
                 "c2":['A','B','B','A','D','C','C'],
                 "c3":np.random.randint(7),
                 "c4":np.random.random(7)})
df

Unnamed: 0,c1,c2,c3,c4
0,a,A,2,0.330012
1,a,B,2,0.767079
2,b,B,2,0.498327
3,b,A,2,0.980073
4,c,D,2,0.77383
5,d,C,2,0.098504
6,b,C,2,0.956876


In [75]:
df.dtypes

c1     object
c2     object
c3      int64
c4    float64
dtype: object

In [76]:
df['c4'].groupby(df['c2']).std()

c2
A    0.459662
B    0.190036
C    0.606960
D         NaN
Name: c4, dtype: float64

In [79]:
df['c4'].groupby([df['c1'],df['c2']]).mean()

c1  c2
a   A     0.330012
    B     0.767079
b   A     0.980073
    B     0.498327
    C     0.956876
c   D     0.773830
d   C     0.098504
Name: c4, dtype: float64

In [80]:
df['c4'].groupby([df['c1'],df['c2']]).mean().unstack()

c2,A,B,C,D
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,0.330012,0.767079,,
b,0.980073,0.498327,0.956876,
c,,,,0.77383
d,,,0.098504,


In [90]:
df.groupby('c1').mean()

Unnamed: 0_level_0,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,0.548546
b,2,0.811759
c,2,0.77383
d,2,0.098504


In [88]:
df.groupby(['c1','c2']).size()

c1  c2
a   A     1
    B     1
b   A     1
    B     1
    C     1
c   D     1
d   C     1
dtype: int64

In [94]:
#그룹출력
for c1,group in df.groupby('c1'):
    print(c1)
    print(group)

a
  c1 c2  c3        c4
0  a  A   2  0.330012
1  a  B   2  0.767079
b
  c1 c2  c3        c4
2  b  B   2  0.498327
3  b  A   2  0.980073
6  b  C   2  0.956876
c
  c1 c2  c3       c4
4  c  D   2  0.77383
d
  c1 c2  c3        c4
5  d  C   2  0.098504


In [96]:
for (c1,c2),group in df.groupby(['c1','c2']):
    print(c1)
    print(c2)
    print(group)

a
A
  c1 c2  c3        c4
0  a  A   2  0.330012
a
B
  c1 c2  c3        c4
1  a  B   2  0.767079
b
A
  c1 c2  c3        c4
3  b  A   2  0.980073
b
B
  c1 c2  c3        c4
2  b  B   2  0.498327
b
C
  c1 c2  c3        c4
6  b  C   2  0.956876
c
D
  c1 c2  c3       c4
4  c  D   2  0.77383
d
C
  c1 c2  c3        c4
5  d  C   2  0.098504


In [105]:
#c1 c2를 기준으로 c4의 어떤것을 보여주는 것
#뒤에 대상에 대괄호를 한개 치면 Seiries의 형태로 두개 치면 DataFrame으로 나타남
df.groupby(['c1','c2'])[['c4']].mean()

pandas.core.frame.DataFrame

In [109]:
df.groupby('c1')['c3'].count()
df.groupby('c1')['c3'].median()

c1
a    2
b    2
c    2
d    2
Name: c3, dtype: int64

In [113]:
df.groupby(['c1','c2'])['c4'].agg(['mean','min','max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min,max
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,A,0.330012,0.330012,0.330012
a,B,0.767079,0.767079,0.767079
b,A,0.980073,0.980073,0.980073
b,B,0.498327,0.498327,0.498327
b,C,0.956876,0.956876,0.956876
c,D,0.77383,0.77383,0.77383
d,C,0.098504,0.098504,0.098504


In [116]:
#groubby하는 기준들 index를 따로 출력하지 않음
df.groupby(['c1','c2'],as_index=False)['c4'].mean()

Unnamed: 0,c1,c2,c4
0,a,A,0.330012
1,a,B,0.767079
2,b,A,0.980073
3,b,B,0.498327
4,b,C,0.956876
5,c,D,0.77383
6,d,C,0.098504


In [118]:
# 결과 대상의 column을 없앨 수 있음
df.groupby(['c1','c2'],group_keys=False)['c4'].mean()

c1  c2
a   A     0.330012
    B     0.767079
b   A     0.980073
    B     0.498327
    C     0.956876
c   D     0.773830
d   C     0.098504
Name: c4, dtype: float64

In [131]:
def top(df,n=3,column='c1'):
    return df.sort_values(by=column)[0:n]
top(df,n=5)


Unnamed: 0,c1,c2,c3,c4
0,a,A,2,0.330012
1,a,B,2,0.767079
2,b,B,2,0.498327
3,b,A,2,0.980073
6,b,C,2,0.956876


In [132]:
df.groupby('c1').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c2,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,0,a,A,2,0.330012
a,1,a,B,2,0.767079
b,2,b,B,2,0.498327
b,3,b,A,2,0.980073
b,6,b,C,2,0.956876
c,4,c,D,2,0.77383
d,5,d,C,2,0.098504


## 피벗 테이블(Pivot Table)
* values: 집계하려는 칼럼 이름 혹은 이름의 리스트  
* index: 피벗 테이블의 로우를 그룹으로 묶을 이름 혹은 키  
* columns: 피벗 테이블의 칼럼을 그룹으로 묶을 이름 혹은 키 
* aggfunc: 집계 함수나 함수 리스트, 기본 값으로 mean사용  
* fill_value: 결과 테이블에서 누락된 값 대체를 위한 값  
dropna: True인 경우 모든 항목이 Na인 칼롬은 포함하지 않음  
* margins: 부분합이나 총계를 담기 위한 로우/칼럼 추가 여부. 기본값은 False

In [133]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'])

Unnamed: 0_level_0,c3,c3,c3,c3,c4,c4,c4,c4
c2,A,B,C,D,A,B,C,D
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
a,2.0,2.0,,,0.330012,0.767079,,
b,2.0,2.0,2.0,,0.980073,0.498327,0.956876,
c,,,,2.0,,,,0.77383
d,,,2.0,,,,0.098504,


In [134]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'],
              margins=True)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,2.0,2.0,,,2,0.330012,0.767079,,,0.548546
b,2.0,2.0,2.0,,2,0.980073,0.498327,0.956876,,0.811759
c,,,,2.0,2,,,,0.77383,0.77383
d,,,2.0,,2,,,0.098504,,0.098504
All,2.0,2.0,2.0,2.0,2,0.655043,0.632703,0.52769,0.77383,0.629243


In [135]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'],
              margins=True,
              aggfunc=sum)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,2.0,2.0,,,4,0.330012,0.767079,,,1.097092
b,2.0,2.0,2.0,,6,0.980073,0.498327,0.956876,,2.435276
c,,,,2.0,2,,,,0.77383,0.77383
d,,,2.0,,2,,,0.098504,,0.098504
All,4.0,4.0,4.0,2.0,14,1.310085,1.265407,1.05538,0.77383,4.404702


In [137]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'],
              margins=True,
              aggfunc=sum,
              fill_value=0)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,2,2,0,0,4,0.330012,0.767079,0.0,0.0,1.097092
b,2,2,2,0,6,0.980073,0.498327,0.956876,0.0,2.435276
c,0,0,0,2,2,0.0,0.0,0.0,0.77383,0.77383
d,0,0,2,0,2,0.0,0.0,0.098504,0.0,0.098504
All,4,4,4,2,14,1.310085,1.265407,1.05538,0.77383,4.404702


In [140]:
pd.crosstab(df.c1,df.c2)

c2,A,B,C,D
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1,1,0,0
b,1,1,1,0
c,0,0,0,1
d,0,0,1,0


In [142]:
pd.crosstab(df.c1,df.c2,values=df.c3,aggfunc=sum,margins=True)

c2,A,B,C,D,All
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,2.0,2.0,,,4
b,2.0,2.0,2.0,,6
c,,,,2.0,2
d,,,2.0,,2
All,4.0,4.0,4.0,2.0,14


## 범주형 데이터
* add_categories
* as_ordered:카테고리 순서 지정
* as_unordered: 카테고리에 순서 미지정
* remove_categories
* remove_unused_categories: 사용안하는 카테고리 제거
* rename_categories: 카테고리 이름 변경
* reoeder_categories: 새로운 카테고리에 순서 지정
* set_categories: 새로운 카테고리로 변경

In [143]:
s= pd.Series(['c1','c2','c1','c2','c1']*2)
s

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
dtype: object

In [144]:
pd.unique(s)

array(['c1', 'c2'], dtype=object)

In [145]:
pd.value_counts(s)

c1    6
c2    4
dtype: int64

In [146]:
code=pd.Series([0,1,0,1]*2)
code

0    0
1    1
2    0
3    1
4    0
5    1
6    0
7    1
dtype: int64

In [148]:
d=pd.Series(['c1','c2'])
d

0    c1
1    c2
dtype: object

In [151]:
#범주형 변수를 만들어 줄 때 code를 붙여주는 방법 take
d.take(code)

0    c1
1    c2
0    c1
1    c2
0    c1
1    c2
0    c1
1    c2
dtype: object

In [155]:
df=pd.DataFrame({'id':np.arange(len(s)),
               'c':s,
               'v':np.random.randint(1000,5000,len(s))})
df

Unnamed: 0,id,c,v
0,0,c1,2484
1,1,c2,2598
2,2,c1,3603
3,3,c2,3971
4,4,c1,3685
5,5,c1,1083
6,6,c2,1085
7,7,c1,3451
8,8,c2,1160
9,9,c1,3149


In [157]:
c=df['c'].astype('category')
c

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
Name: c, dtype: category
Categories (2, object): ['c1', 'c2']

In [158]:
c.values

['c1', 'c2', 'c1', 'c2', 'c1', 'c1', 'c2', 'c1', 'c2', 'c1']
Categories (2, object): ['c1', 'c2']

In [159]:
c.values.categories

Index(['c1', 'c2'], dtype='object')

In [160]:
c.values.codes

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0], dtype=int8)

In [162]:
df['c']=c
df.c

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
Name: c, dtype: category
Categories (2, object): ['c1', 'c2']

In [165]:
#c=df['c'].astype('category') 같은 기능인데 더 짧은 코드
c=pd.Categorical(['c1','c2','c3','c1','c2'])
c

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1', 'c2', 'c3']

In [170]:
categories=['c1','c2','c3']
codes=[0,1,2,0,1]
c=pd.Categorical.from_codes(codes,categories)
c

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1', 'c2', 'c3']

In [171]:
#순서지정
pd.Categorical.from_codes(codes,categories,ordered=True)
c.as_ordered()

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1' < 'c2' < 'c3']

In [173]:
c.codes

array([0, 1, 2, 0, 1], dtype=int8)

In [174]:
c.categories

Index(['c1', 'c2', 'c3'], dtype='object')

In [175]:
#카테고리 추가
c=c.set_categories(['c1','c2','c3','c4','c5'])
c.categories

Index(['c1', 'c2', 'c3', 'c4', 'c5'], dtype='object')

In [177]:
c.value_counts()

c1    2
c2    2
c3    1
c4    0
c5    0
dtype: int64

In [178]:
c[c.isin(['c1','c3'])]

['c1', 'c3', 'c1']
Categories (5, object): ['c1', 'c2', 'c3', 'c4', 'c5']

In [180]:
#category 제거
c=c.remove_unused_categories()

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1', 'c2', 'c3']

In [181]:
c.categories

Index(['c1', 'c2', 'c3'], dtype='object')

# 문자열 연산
## 문자열 연산자 
* 차이썬 문자열 연산자 거의 모두 반영
* 문자열 접근에는 str.함수() 식으로 접근해야 함

    * captitalize()  
    * casefold(): 대소문자 구분 제거  
    * count(sub, [ , start[, end]]): start end범위에서 sub의 중복되지 않은 수를 반환  
    * find(sub, [ , start[, end]]): start end범위에서 sub의 가장 작은 인덱스 반환, 없으면 -1반환  
    * rfind(sub, [ , start[, end]]):  start end범위에서 sub의 가장 큰 인덱스 반환 , 없으면 -1반환  
    * index(sub, [ , start[, end]]): find와 비슷 하지만 없으면 에러발생    
    * rindex(sub, [ , start[, end]]): rfind와 비슷 하지만 없으면 에러발생
    * isalnum(): 영숫자로 한개 이상 존재할 시 True  
    * isalpha(): 영문자  
    * isdecimal(): 10진수 숫자  
    * isdigit(): 숫자  
    * is numeric(): 수치형  
    * isidentifier(): 식별자  
    * isspace(): 공백  
    * istitle(): 제목  
    * islower()/isupper(): 소문자 대문자  
    * join(iterable): iterable에 있는 문자열에 연결된 문자열 반환  
    * ljust(width[,fillchar]): 너비만큼 문자열에서 왼쪽 정렬된 문자열 반환 rjust도 가능  
    * swapcase(): 대소문자 변환  
    * strip([chars]): lstrip, rstrip 모두 가능/ 문자열에 지정된 공백 제거한 문자열 반환  
    * replace(old,new[,count])): 변환  
    * split(sep=None,maxsplit=1): sep를 구분자 문자열로 사용하여 단어목록 반환   
    * zfill(width): 너비 만큼의 문자열에서 비어있는 부분에 '0'이 채워진 문자열 반환


In [188]:
name_tuple=['San Lee','Steven Jobs',' Larry Page','Elon Musk', None,'Bill Gates','Mark Zukerberg','Jeff Bezos']
names=pd.Series(name_tuple)
names

0           San Lee
1       Steven Jobs
2        Larry Page
3         Elon Musk
4              None
5        Bill Gates
6    Mark Zukerberg
7        Jeff Bezos
dtype: object

In [185]:
names.str.lower()

0           san lee
1       steven jobs
2        larry page
3         elon musk
4              None
5        bill gates
6    mark zukerberg
7        jeff bezos
dtype: object

In [186]:
names.str.len()

0     7.0
1    11.0
2    11.0
3     9.0
4     NaN
5    10.0
6    14.0
7    10.0
dtype: float64

In [187]:
names.str.split()

0           [San, Lee]
1       [Steven, Jobs]
2        [Larry, Page]
3         [Elon, Musk]
4                 None
5        [Bill, Gates]
6    [Mark, Zukerberg]
7        [Jeff, Bezos]
dtype: object

## 기타 연산자
* get(): 각 요소에 인덳스 지정해서 가져오는 것
* slice(): 각 요소에 슬라이스 적용
* slice_replace(): 슬라이스를 특정 값을 대체
* cat(): 문자열 연결
* repeat(): 값 반복
* normalize(): 문자열의 유니코드 형태로 반환
* pad(): 문자열 왼쪽 오른쪽 또는 양쪽 공백 추가
* wrap(): 긴 문자열을 주여진 너비보다 짧은 길이의 여러 줄로 나눔
* join(): Series의 각 요소에 있는 문자열을 전달된 구분자와 결합
* get_dummies(): Dataframe으로 더미변수 추출

In [189]:
names.str[0:4]

0    San 
1    Stev
2     Lar
3    Elon
4    None
5    Bill
6    Mark
7    Jeff
dtype: object

In [196]:
names.str.split().str.get(-1)

0          Lee
1         Jobs
2         Page
3         Musk
4         None
5        Gates
6    Zukerberg
7        Bezos
dtype: object

In [197]:
names.str.repeat(2)

0                  San LeeSan Lee
1          Steven JobsSteven Jobs
2           Larry Page Larry Page
3              Elon MuskElon Musk
4                            None
5            Bill GatesBill Gates
6    Mark ZukerbergMark Zukerberg
7            Jeff BezosJeff Bezos
dtype: object

In [199]:
names.str.join("*")

0                  S*a*n* *L*e*e
1          S*t*e*v*e*n* *J*o*b*s
2           *L*a*r*r*y* *P*a*g*e
3              E*l*o*n* *M*u*s*k
4                           None
5            B*i*l*l* *G*a*t*e*s
6    M*a*r*k* *Z*u*k*e*r*b*e*r*g
7            J*e*f*f* *B*e*z*o*s
dtype: object

## 정규 표현식

In [201]:
names.str.match('([A-Za-z]+)')

0     True
1     True
2    False
3     True
4     None
5     True
6     True
7     True
dtype: object

In [202]:
names.str.findall('([A-Za-z]+)')

0           [San, Lee]
1       [Steven, Jobs]
2        [Larry, Page]
3         [Elon, Musk]
4                 None
5        [Bill, Gates]
6    [Mark, Zukerberg]
7        [Jeff, Bezos]
dtype: object

# 시계열 처리

In [205]:
idx=pd.DatetimeIndex(['2019-01-01','2019-01-01','2020-02-01','2020-02-02','2020-03-01'])
s=pd.Series([0,1,2,3,4],index=idx)
s

2019-01-01    0
2019-01-01    1
2020-02-01    2
2020-02-02    3
2020-03-01    4
dtype: int64

In [206]:
s['2020-01-01':]

2020-02-01    2
2020-02-02    3
2020-03-01    4
dtype: int64

In [207]:
s['2019']

2019-01-01    0
2019-01-01    1
dtype: int64

## 시계열 데이터 구조

* 타임스탬프 / 기간 / 시간 델타 또는 지속기간


In [210]:
from datetime import datetime
dates=pd.to_datetime(['12-12-2019',datetime(2020,1,1),'2nd of Feb, 2020','2020-Mar-4','20200701'])
dates

DatetimeIndex(['2019-12-12', '2020-01-01', '2020-02-02', '2020-03-04',
               '2020-07-01'],
              dtype='datetime64[ns]', freq=None)

In [211]:
dates.to_period('D')

PeriodIndex(['2019-12-12', '2020-01-01', '2020-02-02', '2020-03-04',
             '2020-07-01'],
            dtype='period[D]', freq='D')

In [212]:
#timedelta 형식
dates-dates[0]

TimedeltaIndex(['0 days', '20 days', '52 days', '83 days', '202 days'], dtype='timedelta64[ns]', freq=None)

In [213]:
pd.date_range('2020-01-01','2020-07-01')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10',
               ...
               '2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25',
               '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29',
               '2020-06-30', '2020-07-01'],
              dtype='datetime64[ns]', length=183, freq='D')

In [216]:
pd.date_range('2020-01-01',periods=7)

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07'],
              dtype='datetime64[ns]', freq='D')

In [217]:
pd.date_range('2020-01-01',periods=7,freq='M')

DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31'],
              dtype='datetime64[ns]', freq='M')

In [220]:
pd.date_range('2020-01-01',periods=7,freq='H')

DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 01:00:00',
               '2020-01-01 02:00:00', '2020-01-01 03:00:00',
               '2020-01-01 04:00:00', '2020-01-01 05:00:00',
               '2020-01-01 06:00:00'],
              dtype='datetime64[ns]', freq='H')

In [223]:
# NaT=Not a Time
idx=pd.to_datetime(['2020-01-01 12:00:00','2020-01-02 00:00:00']+[None])
idx

DatetimeIndex(['2020-01-01 12:00:00', '2020-01-02 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [225]:
idx[2]

NaT

In [226]:
pd.isnull(idx)

array([False, False,  True])

## 시계열 기본