### pandas
- R 언어의 data.frame에서 영감을 받은 DataFrame, Series 자료구조를 파이썬에 추가한 라이브러리
- numpy 위에 구현
- numpy 배열에 인덱스(레이블)와 컬럼 개념을 더해 표 형태의 데이터를 좀 더 폭넓게 표현
- 데이터 분석용 라이브러리로 각광


In [3]:
import pandas as pd
import numpy as np

In [9]:
# Series basics: a one-dimensional labelled structure made of values plus an
# index (compared with a DataFrame, it has no column labels).
# np.nan is displayed as NaN ("Not a Number"); the resulting dtype is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 10, 15])
s

0     1.0
1     3.0
2     5.0
3     NaN
4    10.0
5    15.0
dtype: float64

In [94]:
# Build an index out of pandas' datetime type: six consecutive days
# starting at 2018-09-29.
dates = pd.date_range(start='20180929', periods=6)
dates

DatetimeIndex(['2018-09-29', '2018-09-30', '2018-10-01', '2018-10-02',
               '2018-10-03', '2018-10-04'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# DataFrame: index + columns + values.
# Series:    index + values.
# A DataFrame is therefore a Series with column labels added, and indexing
# a DataFrame yields a Series.
cols = ['A', 'B', 'C', 'D']
# The number of data rows must match the number of index entries, and the
# number of data columns must match the number of column labels.
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=cols)
df

Unnamed: 0,A,B,C,D
2018-09-29,0.251464,-0.147523,-0.561578,1.392507
2018-09-30,0.367146,0.445136,-0.241571,-0.434635
2018-10-01,-0.439628,0.04017,-0.76096,0.168268
2018-10-02,0.53052,-1.144927,-1.406527,1.494357
2018-10-03,0.577937,-0.098738,-0.381949,1.138831
2018-10-04,0.62089,1.659343,-0.993242,0.314272


In [24]:
# Show only the top rows (head() with no argument shows the first 5).
df.head()

Unnamed: 0,A,B,C,D
2018-09-29,0.251464,-0.147523,-0.561578,1.392507
2018-09-30,0.367146,0.445136,-0.241571,-0.434635
2018-10-01,-0.439628,0.04017,-0.76096,0.168268
2018-10-02,0.53052,-1.144927,-1.406527,1.494357
2018-10-03,0.577937,-0.098738,-0.381949,1.138831


In [25]:
# Row labels (index) of the DataFrame.
df.index

DatetimeIndex(['2018-09-29', '2018-09-30', '2018-10-01', '2018-10-02',
               '2018-10-03', '2018-10-04'],
              dtype='datetime64[ns]', freq='D')

In [26]:
# Column labels of the DataFrame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [31]:
# Inspect the underlying values: they come back as a numpy ndarray.
print(df.values, type(df.values))

[[ 0.25146369 -0.14752339 -0.56157835  1.39250704]
 [ 0.36714635  0.44513628 -0.24157113 -0.43463453]
 [-0.43962752  0.04017043 -0.76095986  0.16826762]
 [ 0.53052027 -1.14492738 -1.40652686  1.49435697]
 [ 0.57793745 -0.09873776 -0.38194867  1.13883069]
 [ 0.6208897   1.65934287 -0.99324219  0.31427206]] <class 'numpy.ndarray'>


In [30]:
# Row labels: a DatetimeIndex with daily frequency.
df.index

DatetimeIndex(['2018-09-29', '2018-09-30', '2018-10-01', '2018-10-02',
               '2018-10-03', '2018-10-04'],
              dtype='datetime64[ns]', freq='D')

In [36]:
# Concise summary of df: index range, per-column non-null counts, dtypes
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2018-09-29 to 2018-10-04
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [37]:
# Summary statistics per column: count, mean, standard deviation, min,
# 25% / 50% / 75% quantiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.318055,0.125577,-0.724305,0.678933
std,0.396312,0.916617,0.427976,0.777097
min,-0.439628,-1.144927,-1.406527,-0.434635
25%,0.280384,-0.135327,-0.935172,0.204769
50%,0.448833,-0.029284,-0.661269,0.726551
75%,0.566083,0.343895,-0.426856,1.329088
max,0.62089,1.659343,-0.241571,1.494357


In [38]:
# Sort the rows by column C in descending order (the sorted frame is returned
# as a new object).
df.sort_values(by = 'C',ascending = False)

Unnamed: 0,A,B,C,D
2018-09-30,0.367146,0.445136,-0.241571,-0.434635
2018-10-03,0.577937,-0.098738,-0.381949,1.138831
2018-09-29,0.251464,-0.147523,-0.561578,1.392507
2018-10-01,-0.439628,0.04017,-0.76096,0.168268
2018-10-04,0.62089,1.659343,-0.993242,0.314272
2018-10-02,0.53052,-1.144927,-1.406527,1.494357


In [39]:
# The original df is left as it was: sort_values did not sort it in place.
df

Unnamed: 0,A,B,C,D
2018-09-29,0.251464,-0.147523,-0.561578,1.392507
2018-09-30,0.367146,0.445136,-0.241571,-0.434635
2018-10-01,-0.439628,0.04017,-0.76096,0.168268
2018-10-02,0.53052,-1.144927,-1.406527,1.494357
2018-10-03,0.577937,-0.098738,-0.381949,1.138831
2018-10-04,0.62089,1.659343,-0.993242,0.314272


In [41]:
# Fetch column B: indexing a single column reduces the dimension, so the
# result is a Series.
# Attribute access and bracket access select the same column; only the last
# expression of the cell is displayed.
df.B
df['B']

2018-09-29   -0.147523
2018-09-30    0.445136
2018-10-01    0.040170
2018-10-02   -1.144927
2018-10-03   -0.098738
2018-10-04    1.659343
Freq: D, Name: B, dtype: float64

In [45]:
# Slicing with [] cuts along one dimension only: the row index.
df[1:3] # row slice by position: rows 1 and 2, end position excluded

Unnamed: 0,A,B,C,D
2018-09-30,0.367146,0.445136,-0.241571,-0.434635
2018-10-01,-0.439628,0.04017,-0.76096,0.168268


In [46]:
df['2018-09-30':'2018-10-02'] # slicing by label also includes the end date

Unnamed: 0,A,B,C,D
2018-09-30,0.367146,0.445136,-0.241571,-0.434635
2018-10-01,-0.439628,0.04017,-0.76096,0.168268
2018-10-02,0.53052,-1.144927,-1.406527,1.494357


In [96]:
# Look up data through the index labels.
# loc: label-based indexer; it is used with [] rather than called like a function.
df.loc[dates[2]] # the column names become the index of the result
# Selecting a single row lists that row's values with the column names applied
# as the index; the result is a Series because indexing drops one dimension.


A   -0.439628
B    0.040170
C   -0.760960
D    0.168268
Name: 2018-10-01 00:00:00, dtype: float64

In [49]:
# All rows, columns C and D only.
df.loc[:,['C','D']]

Unnamed: 0,C,D
2018-09-29,-0.561578,1.392507
2018-09-30,-0.241571,-0.434635
2018-10-01,-0.76096,0.168268
2018-10-02,-1.406527,1.494357
2018-10-03,-0.381949,1.138831
2018-10-04,-0.993242,0.314272


In [54]:
# Rows from 2018-10-01 through 2018-10-02 (end label included), columns C and D.
df.loc['2018-10-01':'2018-10-02',['C','D']]

Unnamed: 0,C,D
2018-10-01,-0.76096,0.168268
2018-10-02,-1.406527,1.494357


In [50]:
# One row label with a list of columns -> Series indexed by column name.
df.loc['2018-10-01',['C','D']]

C   -0.760960
D    0.168268
Name: 2018-10-01 00:00:00, dtype: float64

In [52]:
# One row label and one column label -> a single scalar value.
df.loc['2018-10-01','C']

-0.7609598563061841

In [None]:
# iloc: integer-position-based indexer (used with [], not called like a function)

In [56]:
df.iloc[1] # loc selects by label, iloc selects by integer position

A    0.367146
B    0.445136
C   -0.241571
D   -0.434635
Name: 2018-09-30 00:00:00, dtype: float64

In [57]:
# Rows at positions 1-2 and columns at positions 1-2 (end positions excluded).
df.iloc[1:3,1:3]

Unnamed: 0,B,C
2018-09-30,0.445136,-0.241571
2018-10-01,0.04017,-0.76096


In [62]:
# Pick non-contiguous rows and columns, in whatever order is wanted.
df.iloc[[1,4,2],[0,2]]

Unnamed: 0,A,C
2018-09-30,0.367146,-0.241571
2018-10-03,0.577937,-0.381949
2018-10-01,-0.439628,-0.76096


In [63]:
# Rows at positions 1-2 of the single column at position 1 -> Series.
df.iloc[1:3,1]

2018-09-30    0.445136
2018-10-01    0.040170
Freq: D, Name: B, dtype: float64

In [64]:
# All rows, columns at positions 1-2.
df.iloc[:,1:3]

Unnamed: 0,B,C
2018-09-29,-0.147523,-0.561578
2018-09-30,0.445136,-0.241571
2018-10-01,0.04017,-0.76096
2018-10-02,-1.144927,-1.406527
2018-10-03,-0.098738,-0.381949
2018-10-04,1.659343,-0.993242


In [74]:
# Keep only the rows whose value in column C is positive (boolean mask).
# Both lines build the same mask; in the recorded run every C value is
# negative, so the result is an empty frame.
df[df.C > 0]
df[df.loc[:,'C']>0]


Unnamed: 0,A,B,C,D


In [76]:
# Keep only the positive values.
# Negative entries make the mask False and are filled with NaN;
# positive entries make the mask True and keep their original value.
df[df>0]

Unnamed: 0,A,B,C,D
2018-09-29,0.251464,,,1.392507
2018-09-30,0.367146,0.445136,,
2018-10-01,,0.04017,,0.168268
2018-10-02,0.53052,,,1.494357
2018-10-03,0.577937,,,1.138831
2018-10-04,0.62089,1.659343,,0.314272


In [77]:
# Copying.
# NOTE(review): df[:] is a full-range slice rather than an explicit copy;
# prefer df.copy() when an independent copy is required.
print(df[:])

                   A         B         C         D
2018-09-29  0.251464 -0.147523 -0.561578  1.392507
2018-09-30  0.367146  0.445136 -0.241571 -0.434635
2018-10-01 -0.439628  0.040170 -0.760960  0.168268
2018-10-02  0.530520 -1.144927 -1.406527  1.494357
2018-10-03  0.577937 -0.098738 -0.381949  1.138831
2018-10-04  0.620890  1.659343 -0.993242  0.314272                    A         B         C         D
2018-09-29  0.251464 -0.147523 -0.561578  1.392507
2018-09-30  0.367146  0.445136 -0.241571 -0.434635
2018-10-01 -0.439628  0.040170 -0.760960  0.168268
2018-10-02  0.530520 -1.144927 -1.406527  1.494357
2018-10-03  0.577937 -0.098738 -0.381949  1.138831
2018-10-04  0.620890  1.659343 -0.993242  0.314272


In [78]:
# Explicit copy of the DataFrame.
print(df.copy())

                   A         B         C         D
2018-09-29  0.251464 -0.147523 -0.561578  1.392507
2018-09-30  0.367146  0.445136 -0.241571 -0.434635
2018-10-01 -0.439628  0.040170 -0.760960  0.168268
2018-10-02  0.530520 -1.144927 -1.406527  1.494357
2018-10-03  0.577937 -0.098738 -0.381949  1.138831
2018-10-04  0.620890  1.659343 -0.993242  0.314272


In [84]:
# Adding a new column (a very common operation).
# It needs a column name plus values: as many values as the parent frame
# has index entries.
# The dtype of the new values may differ from the existing columns.
new_data = ['one', 'two', 'three', 'four', 'five', 'six']
# assign() works on a copy of df, so df itself is left untouched.
df2 = df.assign(E=new_data)
df2


Unnamed: 0,A,B,C,D,E
2018-09-29,0.251464,-0.147523,-0.561578,1.392507,one
2018-09-30,0.367146,0.445136,-0.241571,-0.434635,two
2018-10-01,-0.439628,0.04017,-0.76096,0.168268,three
2018-10-02,0.53052,-1.144927,-1.406527,1.494357,four
2018-10-03,0.577937,-0.098738,-0.381949,1.138831,five
2018-10-04,0.62089,1.659343,-0.993242,0.314272,six


In [88]:
# When there are not enough values, pad with NaN so the length still matches
# the number of rows.
new_data2 = ['one2', 'two2', 'three2'] + [np.nan] * 3
df2['F'] = new_data2
df2

Unnamed: 0,A,B,C,D,E,F
2018-09-29,0.251464,-0.147523,-0.561578,1.392507,one,one2
2018-09-30,0.367146,0.445136,-0.241571,-0.434635,two,two2
2018-10-01,-0.439628,0.04017,-0.76096,0.168268,three,three2
2018-10-02,0.53052,-1.144927,-1.406527,1.494357,four,
2018-10-03,0.577937,-0.098738,-0.381949,1.138831,five,
2018-10-04,0.62089,1.659343,-0.993242,0.314272,six,


In [89]:
# Check, row by row, whether column E holds one of the given values
# (the result is a boolean Series).
df2['E'].isin(['two','four'])

2018-09-29    False
2018-09-30     True
2018-10-01    False
2018-10-02     True
2018-10-03    False
2018-10-04    False
Freq: D, Name: E, dtype: bool

In [90]:
# Cumulative sum: display the original df first for comparison.
df


Unnamed: 0,A,B,C,D
2018-09-29,0.251464,-0.147523,-0.561578,1.392507
2018-09-30,0.367146,0.445136,-0.241571,-0.434635
2018-10-01,-0.439628,0.04017,-0.76096,0.168268
2018-10-02,0.53052,-1.144927,-1.406527,1.494357
2018-10-03,0.577937,-0.098738,-0.381949,1.138831
2018-10-04,0.62089,1.659343,-0.993242,0.314272


In [91]:
# Cumulative sum down each column. The built-in DataFrame.cumsum() gives the
# same result as df.apply(np.cumsum) without routing a NumPy function
# through apply.
df.cumsum()

Unnamed: 0,A,B,C,D
2018-09-29,0.251464,-0.147523,-0.561578,1.392507
2018-09-30,0.61861,0.297613,-0.803149,0.957873
2018-10-01,0.178983,0.337783,-1.564109,1.12614
2018-10-02,0.709503,-0.807144,-2.970636,2.620497
2018-10-03,1.28744,-0.905882,-3.352585,3.759328
2018-10-04,1.90833,0.753461,-4.345827,4.0736


In [92]:
# Range of each column: distance between its maximum and minimum.
# df.apply(func)
# func is evaluated once per column and the results are returned as a Series.
df.apply(lambda x:x.max() - x.min()) # x is one column, passed as a pandas Series (not a bare numpy array)

A    1.060517
B    2.804270
C    1.164956
D    1.928991
dtype: float64