In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pandas -> Series 사용.

In [74]:
# Build a Series from a plain list; the np.nan entry stands for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [75]:
# Show the Series; the values display as float64, with the np.nan entry at position 3.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [76]:
# Replace NaN values with 0.
# fillna() returns a new Series, so rebind `s` rather than mutating it with
# inplace=True; re-running the cell then gives the same result.
s = s.fillna(0)

In [77]:
# Show the Series again: the former NaN at position 3 is now 0.0.
s

0    1.0
1    3.0
2    5.0
3    0.0
4    6.0
5    8.0
dtype: float64

# Date 관련 Pandas 사용하기.

In [78]:
# Six consecutive calendar days starting on 2013-01-01 (daily frequency).
dates = pd.date_range(start='2013-01-01', periods=6, freq='D')

In [79]:
# Show the generated DatetimeIndex.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [80]:
# New DataFrame indexed by `dates`: a 6x4 block of standard-normal samples
# in columns A-D. No random seed is set, so the numbers change on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [81]:
# Show the whole frame (only 6 rows, so displaying it in full is fine).
df

Unnamed: 0,A,B,C,D
2013-01-01,1.008509,0.712807,1.885236,-1.477224
2013-01-02,0.474715,1.285606,0.103423,-1.150489
2013-01-03,0.690394,0.191583,3.525021,0.859442
2013-01-04,-0.114306,-0.851259,1.510567,-0.29753
2013-01-05,-0.813176,-0.539804,-1.067253,0.541624
2013-01-06,-1.033251,0.164307,0.350616,-1.743714


In [82]:
# Per-column dtypes: all four columns hold float64 values.
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

# Pandas 데이터 보는 방법.

In [83]:
# First rows of the frame: head() defaults to n=5, not 10.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,1.008509,0.712807,1.885236,-1.477224
2013-01-02,0.474715,1.285606,0.103423,-1.150489
2013-01-03,0.690394,0.191583,3.525021,0.859442
2013-01-04,-0.114306,-0.851259,1.510567,-0.29753
2013-01-05,-0.813176,-0.539804,-1.067253,0.541624


In [84]:
# Last rows of the frame: tail() defaults to n=5, not 10.
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,0.474715,1.285606,0.103423,-1.150489
2013-01-03,0.690394,0.191583,3.525021,0.859442
2013-01-04,-0.114306,-0.851259,1.510567,-0.29753
2013-01-05,-0.813176,-0.539804,-1.067253,0.541624
2013-01-06,-1.033251,0.164307,0.350616,-1.743714


In [85]:
# Row index (the DatetimeIndex the frame was built with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [86]:
# Column labels.
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [87]:
# Underlying values as a NumPy array (row and column labels are dropped).
df.values

array([[ 1.00850888,  0.71280712,  1.88523632, -1.47722449],
       [ 0.47471543,  1.28560614,  0.10342307, -1.15048933],
       [ 0.69039406,  0.19158294,  3.52502059,  0.85944198],
       [-0.11430572, -0.85125853,  1.51056724, -0.29753031],
       [-0.81317615, -0.53980394, -1.06725321,  0.54162379],
       [-1.03325065,  0.16430653,  0.3506157 , -1.74371387]])

In [88]:
# DataFrame summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [89]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.035481,0.16054,1.051268,-0.544649
std,0.831079,0.785617,1.606447,1.085083
min,-1.033251,-0.851259,-1.067253,-1.743714
25%,-0.638459,-0.363776,0.165221,-1.395541
50%,0.180205,0.177945,0.930591,-0.72401
75%,0.636474,0.582501,1.791569,0.331835
max,1.008509,1.285606,3.525021,0.859442


In [90]:
# Transpose the data: rows become columns and columns become rows.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.008509,0.474715,0.690394,-0.114306,-0.813176,-1.033251
B,0.712807,1.285606,0.191583,-0.851259,-0.539804,0.164307
C,1.885236,0.103423,3.525021,1.510567,-1.067253,0.350616
D,-1.477224,-1.150489,0.859442,-0.29753,0.541624,-1.743714


In [91]:
# transpose() is the method form of the .T attribute and gives the same result.
df.transpose()

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.008509,0.474715,0.690394,-0.114306,-0.813176,-1.033251
B,0.712807,1.285606,0.191583,-0.851259,-0.539804,0.164307
C,1.885236,0.103423,3.525021,1.510567,-1.067253,0.350616
D,-1.477224,-1.150489,0.859442,-0.29753,0.541624,-1.743714


# Index 를 이용한 Sort

In [92]:
# axis=1 sorts by column label rather than by row index; descending order gives D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.477224,1.885236,0.712807,1.008509
2013-01-02,-1.150489,0.103423,1.285606,0.474715
2013-01-03,0.859442,3.525021,0.191583,0.690394
2013-01-04,-0.29753,1.510567,-0.851259,-0.114306
2013-01-05,0.541624,-1.067253,-0.539804,-0.813176
2013-01-06,-1.743714,0.350616,0.164307,-1.033251


# 값을 통한 Sort

In [93]:
# Order the rows by the values in column B (ascending by default).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,-0.114306,-0.851259,1.510567,-0.29753
2013-01-05,-0.813176,-0.539804,-1.067253,0.541624
2013-01-06,-1.033251,0.164307,0.350616,-1.743714
2013-01-03,0.690394,0.191583,3.525021,0.859442
2013-01-01,1.008509,0.712807,1.885236,-1.477224
2013-01-02,0.474715,1.285606,0.103423,-1.150489


In [94]:
# SQL analogy: select * from df where dates == '2013-01-02'
# A single row label returns that row as a Series indexed by column name.
df.loc['20130102']

A    0.474715
B    1.285606
C    0.103423
D   -1.150489
Name: 2013-01-02 00:00:00, dtype: float64

In [95]:
# SQL analogy: select 'B', 'C' from df where dates == '2013-01-02'
# A row label plus a list of column labels returns a Series holding just B and C.
df.loc['20130102', ['B', 'C']]

B    1.285606
C    0.103423
Name: 2013-01-02 00:00:00, dtype: float64

In [96]:
# Position-based counterpart of the label lookup above: row position 1 is 2013-01-02.
# It is not an exact match, though: column positions [1, 3] pick B and D
# (C is position 2), and passing the row as a list ([1]) returns a one-row
# DataFrame rather than a Series.
df.iloc[[1], [1, 3]]

Unnamed: 0,B,D
2013-01-02,1.285606,-1.150489


In [97]:
# Keep the non-negative entries of df and replace every other entry with 0.
# (`>= 0` also keeps exact zeros, so this is "non-negative", not strictly positive.)
# df[df >= 0] masks the rejected cells with NaN; chaining fillna(0) builds the
# result in one expression instead of mutating the masked frame in place.
x = df[df >= 0].fillna(0)
x

Unnamed: 0,A,B,C,D
2013-01-01,1.008509,0.712807,1.885236,0.0
2013-01-02,0.474715,1.285606,0.103423,0.0
2013-01-03,0.690394,0.191583,3.525021,0.859442
2013-01-04,0.0,0.0,1.510567,0.0
2013-01-05,0.0,0.0,0.0,0.541624
2013-01-06,0.0,0.164307,0.350616,0.0


In [98]:
# Apply a function to every column: here the spread (max minus min) of each one.
# The lambda parameter is named `col` so it is not confused with the notebook
# variable `x`.
df.apply(lambda col: col.max() - col.min())

A    2.041760
B    2.136865
C    4.592274
D    2.603156
dtype: float64

In [121]:
# Select column A. Despite the name, df2 is a Series at this point, not a DataFrame.
df2 = df['A']

In [122]:
# The selected column carries the same DatetimeIndex as df.
df2.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [123]:
# On a Series, apply() calls the function once per element: square every value.
# The result is only displayed; df2 itself is unchanged.
df2.apply(lambda value: value ** 2)

2013-01-01    1.017090
2013-01-02    0.225355
2013-01-03    0.476644
2013-01-04    0.013066
2013-01-05    0.661255
2013-01-06    1.067607
Freq: D, Name: A, dtype: float64

In [124]:
# Rebuild df2 as a one-column DataFrame: same values, indexed by `dates`,
# with the single column labelled 'C'.
df2 = pd.DataFrame(data=df2.values, index=dates, columns=['C'])

In [137]:
# df2 is now a one-column DataFrame (column C) on the date index; show its first 5 rows.
df2.head()

Unnamed: 0,C
2013-01-01,1.008509
2013-01-02,0.474715
2013-01-03,0.690394
2013-01-04,-0.114306
2013-01-05,-0.813176


In [142]:
# Column assignment. Despite the "Append" label, 'B' already exists in df, so
# this overwrites column B with df2's only column ('C'), aligned on the date
# index. Assigning to a column name df does not have yet would add a new column.
# NOTE(review): the recorded output also shows a column 'E' that no visible
# cell creates - presumably left over from earlier runs of this cell with a
# different column name; confirm with Restart & Run All.
df['B'] = df2['C']

In [143]:
df

Unnamed: 0,A,B,C,D,E
2013-01-01,1.008509,1.008509,1.008509,-1.477224,1.008509
2013-01-02,0.474715,0.474715,0.474715,-1.150489,0.474715
2013-01-03,0.690394,0.690394,0.690394,0.859442,0.690394
2013-01-04,-0.114306,-0.114306,-0.114306,-0.29753,-0.114306
2013-01-05,-0.813176,-0.813176,-0.813176,0.541624,-0.813176
2013-01-06,-1.033251,-1.033251,-1.033251,-1.743714,-1.033251


In [144]:
# Append (2): stack the rows of df2 underneath the rows of df.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Columns that df2 lacks are filled with NaN,
# and ignore_index=True renumbers the rows from 0. The result is only
# displayed; df itself is not modified.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,A,B,C,D,E
0,1.008509,1.008509,1.008509,-1.477224,1.008509
1,0.474715,0.474715,0.474715,-1.150489,0.474715
2,0.690394,0.690394,0.690394,0.859442,0.690394
3,-0.114306,-0.114306,-0.114306,-0.29753,-0.114306
4,-0.813176,-0.813176,-0.813176,0.541624,-0.813176
5,-1.033251,-1.033251,-1.033251,-1.743714,-1.033251
6,,,1.008509,,
7,,,0.474715,,
8,,,0.690394,,
9,,,-0.114306,,


In [145]:
# df still has its date index: the row-append step above returned a new frame
# instead of modifying df.
df.head()

Unnamed: 0,A,B,C,D,E
2013-01-01,1.008509,1.008509,1.008509,-1.477224,1.008509
2013-01-02,0.474715,0.474715,0.474715,-1.150489,0.474715
2013-01-03,0.690394,0.690394,0.690394,0.859442,0.690394
2013-01-04,-0.114306,-0.114306,-0.114306,-0.29753,-0.114306
2013-01-05,-0.813176,-0.813176,-0.813176,0.541624,-0.813176
