# Pandas Basics

In [1]:
import pandas as pd
import numpy as np

In [4]:
# A Series is pandas' one-dimensional labelled array.
# np.nan (NaN, "Not a Number") marks a missing value; as the output shows, the dtype is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# date_range builds a DatetimeIndex; `periods` is the number of consecutive
# dates to generate, counted from the start date.
dates = pd.date_range(start='20210101', periods=12)
dates

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# 12 x 4 frame of standard-normal samples, indexed by `dates`, columns A-D.
# The generator is seeded so that Restart & Run All reproduces the same
# numbers; the outputs recorded in this notebook came from an unseeded run
# and will change the next time the notebook is executed.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((12, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,-0.389174
2021-01-02,-1.211479,-1.268212,0.233408,-1.119129
2021-01-03,0.020261,-0.410821,0.158969,-0.960599
2021-01-04,1.182762,-0.245173,0.237493,-0.014211
2021-01-05,1.019881,0.72874,-1.278958,0.979596
2021-01-06,-1.221939,-1.53038,-1.026768,-1.199167
2021-01-07,0.513636,-0.288173,1.196648,0.690339
2021-01-08,-0.293654,0.48216,-0.025896,1.218975
2021-01-09,-1.75557,-0.650302,1.748272,-0.65621
2021-01-10,0.527413,-0.107739,0.914567,0.773691


In [7]:
# Show the first 3 rows of df; head() returns 5 rows when n is omitted.
df.head(n=3)

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,-0.389174
2021-01-02,-1.211479,-1.268212,0.233408,-1.119129
2021-01-03,0.020261,-0.410821,0.158969,-0.960599


In [8]:
# Show the row labels of df (here a DatetimeIndex).
df.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Show the column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# Underlying data as a plain NumPy array: the row/column labels are dropped,
# so the result is not a DataFrame. to_numpy() is the accessor pandas
# recommends over the older .values attribute.
df.to_numpy()

array([[ 1.39977948,  1.1504426 ,  0.38865303, -0.38917357],
       [-1.21147869, -1.26821242,  0.23340767, -1.11912884],
       [ 0.02026086, -0.41082056,  0.15896907, -0.96059897],
       [ 1.18276227, -0.24517345,  0.23749345, -0.01421143],
       [ 1.01988107,  0.72874029, -1.27895849,  0.97959605],
       [-1.22193868, -1.5303798 , -1.02676772, -1.19916735],
       [ 0.51363625, -0.28817314,  1.19664847,  0.69033947],
       [-0.29365353,  0.48215986, -0.02589582,  1.21897533],
       [-1.75557014, -0.65030193,  1.74827206, -0.65620962],
       [ 0.52741323, -0.10773895,  0.91456724,  0.77369139],
       [-0.74533124,  0.99054088, -1.29908944,  0.47981558],
       [-0.809278  ,  0.53844912,  1.03115426, -0.43342663]])

In [11]:
# Structural summary of df: index range, column names, non-null counts, dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12 entries, 2021-01-01 to 2021-01-12
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       12 non-null     float64
 1   B       12 non-null     float64
 2   C       12 non-null     float64
 3   D       12 non-null     float64
dtypes: float64(4)
memory usage: 480.0 bytes


In [12]:
# Descriptive statistics per column: count, mean, standard deviation, min, quartiles, max.
df.describe()
# For string-valued data, describe() reports a summary suited to that type instead.

Unnamed: 0,A,B,C,D
count,12.0,12.0,12.0,12.0
mean,-0.11446,-0.050872,0.189871,-0.052458
std,1.049813,0.854079,0.982623,0.85818
min,-1.75557,-1.53038,-1.299089,-1.199167
25%,-0.909828,-0.470691,-0.276114,-0.732307
50%,-0.136696,-0.176456,0.235451,-0.201692
75%,0.65053,0.586022,0.943714,0.711177
max,1.399779,1.150443,1.748272,1.218975


In [13]:
# Sort the rows by the values of the column named in `by`.
df.sort_values(by='B', ascending=False)
# ascending=False sorts in descending order; ascending=True sorts in ascending order.

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,-0.389174
2021-01-11,-0.745331,0.990541,-1.299089,0.479816
2021-01-05,1.019881,0.72874,-1.278958,0.979596
2021-01-12,-0.809278,0.538449,1.031154,-0.433427
2021-01-08,-0.293654,0.48216,-0.025896,1.218975
2021-01-10,0.527413,-0.107739,0.914567,0.773691
2021-01-04,1.182762,-0.245173,0.237493,-0.014211
2021-01-07,0.513636,-0.288173,1.196648,0.690339
2021-01-03,0.020261,-0.410821,0.158969,-0.960599
2021-01-09,-1.75557,-0.650302,1.748272,-0.65621


In [14]:
# Selecting a single column by name returns a Series.
df['A']

2021-01-01    1.399779
2021-01-02   -1.211479
2021-01-03    0.020261
2021-01-04    1.182762
2021-01-05    1.019881
2021-01-06   -1.221939
2021-01-07    0.513636
2021-01-08   -0.293654
2021-01-09   -1.755570
2021-01-10    0.527413
2021-01-11   -0.745331
2021-01-12   -0.809278
Freq: D, Name: A, dtype: float64

In [15]:
# A slice inside [] selects rows: here the first three rows (positions 0, 1, 2).
df[0:3]

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,-0.389174
2021-01-02,-1.211479,-1.268212,0.233408,-1.119129
2021-01-03,0.020261,-0.410821,0.158969,-0.960599


In [16]:
# Rows can also be sliced by index label; as the output shows, both end labels are included.
df['2021-01-05':'2021-01-08']

Unnamed: 0,A,B,C,D
2021-01-05,1.019881,0.72874,-1.278958,0.979596
2021-01-06,-1.221939,-1.53038,-1.026768,-1.199167
2021-01-07,0.513636,-0.288173,1.196648,0.690339
2021-01-08,-0.293654,0.48216,-0.025896,1.218975


In [18]:
# loc selects by label (index value), not by integer position.
df.loc[dates[0]] # dates[0] is 2021-01-01: the value of every column in that row

A    1.399779
B    1.150443
C    0.388653
D   -0.389174
Name: 2021-01-01 00:00:00, dtype: float64

In [32]:
# Row and column selection in a single step.
# .loc[row_labels, column_label] replaces the chained form df[rows]['A'], which
# indexes twice through an intermediate frame; the result is the same Series.
df.loc['2021-01-03':'2021-01-07', 'A'] # a single column label returns a Series

2021-01-03    0.020261
2021-01-04    1.182762
2021-01-05    1.019881
2021-01-06   -1.221939
2021-01-07    0.513636
Freq: D, Name: A, dtype: float64

In [31]:
# One .loc[rows, columns] call instead of chained df[rows][[cols]] indexing.
df.loc['2021-01-03':'2021-01-07', ['A']] # a list of column labels returns a DataFrame

Unnamed: 0,A
2021-01-03,0.020261
2021-01-04,1.182762
2021-01-05,1.019881
2021-01-06,-1.221939
2021-01-07,0.513636


In [33]:
# Label-based row range and two columns, selected with one .loc call
# rather than chained df[rows][[cols]] indexing.
df.loc['2021-01-03':'2021-01-07', ['A', 'C']]

Unnamed: 0,A,C
2021-01-03,0.020261,0.158969
2021-01-04,1.182762,0.237493
2021-01-05,1.019881,-1.278958
2021-01-06,-1.221939,-1.026768
2021-01-07,0.513636,1.196648


In [34]:
# Columns A and B of the 2021-01-02 row (dates[1]).
df.loc[dates[1],['A','B']]

A   -1.211479
B   -1.268212
Name: 2021-01-02 00:00:00, dtype: float64

In [35]:
# A row label plus a single column label returns one scalar value.
df.loc[dates[0], 'A']

1.3997794774888306

In [36]:
# iloc selects by integer position along the rows and columns.
# Ranges can be given for rows or columns; a bare ':' means "all".
df.iloc[1] # the row at position 1 (2021-01-02), one value per column

A   -1.211479
B   -1.268212
C    0.233408
D   -1.119129
Name: 2021-01-02 00:00:00, dtype: float64

In [37]:
# Row slicing by position: positions 3 and 4.
df.iloc[3:5]

Unnamed: 0,A,B,C,D
2021-01-04,1.182762,-0.245173,0.237493,-0.014211
2021-01-05,1.019881,0.72874,-1.278958,0.979596


In [38]:
# Row and column slicing by position.
df.iloc[3:5, 0:2]
# As with ordinary Python slices, a:b covers positions a through b-1.

Unnamed: 0,A,B
2021-01-04,1.182762,-0.245173
2021-01-05,1.019881,0.72874


In [40]:
# Column slicing: every row, columns at positions 1 and 2 (B and C).
df.iloc[:,1:3]

Unnamed: 0,B,C
2021-01-01,1.150443,0.388653
2021-01-02,-1.268212,0.233408
2021-01-03,-0.410821,0.158969
2021-01-04,-0.245173,0.237493
2021-01-05,0.72874,-1.278958
2021-01-06,-1.53038,-1.026768
2021-01-07,-0.288173,1.196648
2021-01-08,0.48216,-0.025896
2021-01-09,-0.650302,1.748272
2021-01-10,-0.107739,0.914567


In [42]:
# Select specific rows and columns by listing their positions.
df.iloc[[1,2,4],[0,2]] # 2nd, 3rd and 5th rows; 1st and 3rd columns

Unnamed: 0,A,C
2021-01-02,-1.211479,0.233408
2021-01-03,0.020261,0.158969
2021-01-05,1.019881,-1.278958


In [43]:
# Keep only the rows that satisfy a condition.
df[df.A > 0] # rows where column A is strictly greater than 0 (a value of exactly 0 is excluded)

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,-0.389174
2021-01-03,0.020261,-0.410821,0.158969,-0.960599
2021-01-04,1.182762,-0.245173,0.237493,-0.014211
2021-01-05,1.019881,0.72874,-1.278958,0.979596
2021-01-07,0.513636,-0.288173,1.196648,0.690339
2021-01-10,0.527413,-0.107739,0.914567,0.773691


In [44]:
df[df > 0]
# The condition is applied to every cell; cells that do not satisfy it are shown as NaN.

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,
2021-01-02,,,0.233408,
2021-01-03,0.020261,,0.158969,
2021-01-04,1.182762,,0.237493,
2021-01-05,1.019881,0.72874,,0.979596
2021-01-06,,,,
2021-01-07,0.513636,,1.196648,0.690339
2021-01-08,,0.48216,,1.218975
2021-01-09,,,1.748272,
2021-01-10,0.527413,,0.914567,0.773691


## DataFrame 복사

In [49]:
# Plain assignment does not copy anything: df2 becomes a second name for the
# same DataFrame object. `is` is the direct identity test (equivalent to
# comparing id() values, without the extra calls).
df2 = df
df2 is df

True

In [50]:
# copy() creates a new DataFrame object holding its own copy of the data,
# so df2 and df are no longer the same object and the identity test is False.
df2 = df.copy()
df2 is df

False

In [51]:
# Add a new column E to df2: one string per row, 12 entries in total.
df2['E'] = [
    'one', 'two', 'one', 'four',
    '5', '6', 'six', 'eight',
    'nine', 'ten', '11', '12',
]
df2

Unnamed: 0,A,B,C,D,E
2021-01-01,1.399779,1.150443,0.388653,-0.389174,one
2021-01-02,-1.211479,-1.268212,0.233408,-1.119129,two
2021-01-03,0.020261,-0.410821,0.158969,-0.960599,one
2021-01-04,1.182762,-0.245173,0.237493,-0.014211,four
2021-01-05,1.019881,0.72874,-1.278958,0.979596,5
2021-01-06,-1.221939,-1.53038,-1.026768,-1.199167,6
2021-01-07,0.513636,-0.288173,1.196648,0.690339,six
2021-01-08,-0.293654,0.48216,-0.025896,1.218975,eight
2021-01-09,-1.75557,-0.650302,1.748272,-0.65621,nine
2021-01-10,0.527413,-0.107739,0.914567,0.773691,ten


In [57]:
# Membership test: isin([...]) returns a boolean Series, True where the value is in the list.
df2['E'].isin(['two', 'four', '6'])

2021-01-01    False
2021-01-02     True
2021-01-03    False
2021-01-04     True
2021-01-05    False
2021-01-06     True
2021-01-07    False
2021-01-08    False
2021-01-09    False
2021-01-10    False
2021-01-11    False
2021-01-12    False
Freq: D, Name: E, dtype: bool

In [63]:
df2[df2['E'].isin(['two', 'four', '6'])]
# Use the isin([...]) mask to keep only the rows that satisfy the condition.

Unnamed: 0,A,B,C,D,E
2021-01-02,-1.211479,-1.268212,0.233408,-1.119129,two
2021-01-04,1.182762,-0.245173,0.237493,-0.014211,four
2021-01-06,-1.221939,-1.53038,-1.026768,-1.199167,6


In [64]:
# df still has only columns A-D: column E was added to the copy df2, not to df.
df

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,-0.389174
2021-01-02,-1.211479,-1.268212,0.233408,-1.119129
2021-01-03,0.020261,-0.410821,0.158969,-0.960599
2021-01-04,1.182762,-0.245173,0.237493,-0.014211
2021-01-05,1.019881,0.72874,-1.278958,0.979596
2021-01-06,-1.221939,-1.53038,-1.026768,-1.199167
2021-01-07,0.513636,-0.288173,1.196648,0.690339
2021-01-08,-0.293654,0.48216,-0.025896,1.218975
2021-01-09,-1.75557,-0.650302,1.748272,-0.65621
2021-01-10,0.527413,-0.107739,0.914567,0.773691


In [65]:
# apply() runs a given function over the DataFrame.
# (pandas also offers this particular operation directly as df.cumsum().)
df.apply(np.cumsum) # cumsum = cumulative sum, applied to each column

Unnamed: 0,A,B,C,D
2021-01-01,1.399779,1.150443,0.388653,-0.389174
2021-01-02,0.188301,-0.11777,0.622061,-1.508302
2021-01-03,0.208562,-0.52859,0.78103,-2.468901
2021-01-04,1.391324,-0.773764,1.018523,-2.483113
2021-01-05,2.411205,-0.045024,-0.260435,-1.503517
2021-01-06,1.189266,-1.575403,-1.287203,-2.702684
2021-01-07,1.702903,-1.863576,-0.090555,-2.012345
2021-01-08,1.409249,-1.381417,-0.11645,-0.793369
2021-01-09,-0.346321,-2.031719,1.631822,-1.449579
2021-01-10,0.181092,-2.139457,2.546389,-0.675888


In [66]:
# A lambda (anonymous one-line function) can be passed to apply() as well.
df.apply(lambda x: x.max() - x.min()) # per column: maximum minus minimum

A    3.155350
B    2.680822
C    3.047362
D    2.418143
dtype: float64