In [3]:
# Import pandas, the data-analysis library, under the conventional alias pd
import pandas as pd
# Import numpy, the array-computation library, under the alias np
import numpy as np
# Import pyplot, the plotting interface of matplotlib, under the alias plt
import matplotlib.pyplot as plt

In [4]:
# Try out a pandas Series: a one-dimensional labelled array built from a list.
# The np.nan entry stands for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [5]:
# Display the Series: index labels on the left, values on the right
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# Working with a pandas DataFrame: first create a daily DatetimeIndex
# of six consecutive dates to use as the row labels.
dates = pd.date_range(start='20300101', periods=6, freq='D')
dates

DatetimeIndex(['2030-01-01', '2030-01-02', '2030-01-03', '2030-01-04',
               '2030-01-05', '2030-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Build a 6x4 DataFrame of standard-normal samples, indexed by `dates`
# with columns A-D.
# The generator is seeded so that Restart & Run All yields the same numbers
# on every run; outputs recorded before the seed was introduced will differ
# until the notebook is re-executed.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2030-01-01,0.036827,1.570352,-0.266189,-2.268367
2030-01-02,0.836598,0.051323,-0.212467,-0.070784
2030-01-03,-0.672708,-1.083557,0.065164,-0.459747
2030-01-04,0.208172,1.986025,-0.817182,0.717744
2030-01-05,-0.704882,-0.422741,-0.319663,0.247813
2030-01-06,-0.931347,0.733605,1.398552,-0.694737


In [8]:
# Build a DataFrame from a dict of columns. Scalar entries are repeated for
# every row, and each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20300102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(['test', 'train'] * 2),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2030-01-02,1.0,3,test,foo
1,1.0,2030-01-02,1.0,3,train,foo
2,1.0,2030-01-02,1.0,3,test,foo
3,1.0,2030-01-02,1.0,3,train,foo


In [9]:
# Inspect the dtype of each column of df2
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [12]:
# List the names of all attributes and methods available on df2 (long output)
dir(df2)

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce_

In [13]:
# Show the first 5 rows of the data
df.head()

Unnamed: 0,A,B,C,D
2030-01-01,0.036827,1.570352,-0.266189,-2.268367
2030-01-02,0.836598,0.051323,-0.212467,-0.070784
2030-01-03,-0.672708,-1.083557,0.065164,-0.459747
2030-01-04,0.208172,1.986025,-0.817182,0.717744
2030-01-05,-0.704882,-0.422741,-0.319663,0.247813


In [14]:
# Show the last 3 rows of the data
df.tail(3)

Unnamed: 0,A,B,C,D
2030-01-04,0.208172,1.986025,-0.817182,0.717744
2030-01-05,-0.704882,-0.422741,-0.319663,0.247813
2030-01-06,-0.931347,0.733605,1.398552,-0.694737


In [15]:
# View the DataFrame's index (row labels)
df.index

DatetimeIndex(['2030-01-01', '2030-01-02', '2030-01-03', '2030-01-04',
               '2030-01-05', '2030-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
# View the column labels
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
# Look at the underlying data as a NumPy array.
# to_numpy() is the accessor the pandas documentation recommends over .values;
# for this all-float frame both return the same 2-D array.
df.to_numpy()

array([[ 0.03682652,  1.5703522 , -0.26618897, -2.26836709],
       [ 0.83659795,  0.05132321, -0.21246703, -0.07078364],
       [-0.67270826, -1.08355689,  0.06516378, -0.45974714],
       [ 0.20817247,  1.98602486, -0.81718226,  0.71774449],
       [-0.70488189, -0.42274132, -0.31966329,  0.2478132 ],
       [-0.93134719,  0.73360452,  1.39855244, -0.69473671]])

In [18]:
# Quick summary statistics of the DataFrame (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.204557,0.472501,-0.025298,-0.421346
std,0.679754,1.180201,0.753966,1.034978
min,-0.931347,-1.083557,-0.817182,-2.268367
25%,-0.696838,-0.304225,-0.306295,-0.635989
50%,-0.317941,0.392464,-0.239328,-0.265265
75%,0.165336,1.361165,-0.004244,0.168164
max,0.836598,1.986025,1.398552,0.717744


In [19]:
# Transposed DataFrame: rows and columns swapped
df.T

Unnamed: 0,2030-01-01,2030-01-02,2030-01-03,2030-01-04,2030-01-05,2030-01-06
A,0.036827,0.836598,-0.672708,0.208172,-0.704882,-0.931347
B,1.570352,0.051323,-1.083557,1.986025,-0.422741,0.733605
C,-0.266189,-0.212467,0.065164,-0.817182,-0.319663,1.398552
D,-2.268367,-0.070784,-0.459747,0.717744,0.247813,-0.694737


In [20]:
# Sort by row or column labels
# axis=0 sorts by the index, axis=1 sorts by the column labels
# ascending=True gives ascending order, False gives descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2030-01-01,-2.268367,-0.266189,1.570352,0.036827
2030-01-02,-0.070784,-0.212467,0.051323,0.836598
2030-01-03,-0.459747,0.065164,-1.083557,-0.672708
2030-01-04,0.717744,-0.817182,1.986025,0.208172
2030-01-05,0.247813,-0.319663,-0.422741,-0.704882
2030-01-06,-0.694737,1.398552,0.733605,-0.931347


In [23]:
# Sort the rows by the values stored in column B.
# ascending=False puts the largest B first, which is the order shown in the
# recorded output; drop the argument for the default ascending sort.
df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2030-01-04,0.208172,1.986025,-0.817182,0.717744
2030-01-01,0.036827,1.570352,-0.266189,-2.268367
2030-01-06,-0.931347,0.733605,1.398552,-0.694737
2030-01-02,0.836598,0.051323,-0.212467,-0.070784
2030-01-05,-0.704882,-0.422741,-0.319663,0.247813
2030-01-03,-0.672708,-1.083557,0.065164,-0.459747
