In [1]:
import pandas as pd

## 데이터프레임 생성 방법

### 리스트 이용

In [4]:
# Build a 3x3 frame from a list of row lists; no labels are given, so both
# axes get the default integer labels 0..2.
rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
frame = pd.DataFrame(rows)
frame

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


### 딕셔너리 이용

In [5]:
# Column-oriented input: every key becomes a column name and every list holds
# that column's values, one entry per row.
data = dict(
    age=[20, 23, 48],
    height=[183, 192, 175],
    weight=[77, 83, 65],
)
# Row labels, in the same order as the values above.
indexName = ['슈퍼맨', '스파이더맨', '배트맨']

frame = pd.DataFrame(data=data, index=indexName)
frame

Unnamed: 0,age,height,weight
슈퍼맨,20,183,77
스파이더맨,23,192,83
배트맨,48,175,65


## 데이터프레임 조회 방법

### 열(column) 조회

In [7]:
# Select the 'age' column two ways: bracket lookup and attribute access.
# Both print the same Series.
print(frame['age'])
print(frame.age)

슈퍼맨      20
스파이더맨    23
배트맨      48
Name: age, dtype: int64
슈퍼맨      20
스파이더맨    23
배트맨      48
Name: age, dtype: int64


In [8]:
# Pick the second value (position 1) of the 'age' column.
# The column's index holds string labels, so a bare [1] only works through the
# deprecated "integer key means position" fallback; .iloc states the positional
# lookup explicitly and prints the same value.
print(frame['age'].iloc[1])
print(frame.age.iloc[1])

23
23


### 행(row) 조회

In [10]:
# Look up one row by its index label; the result is a Series keyed by column name.
print(frame.loc['배트맨'])

age        48
height    175
weight     65
Name: 배트맨, dtype: int64


In [11]:
# Look up one row by integer position (0 is the first row).
print(frame.iloc[0])

age        20
height    183
weight     77
Name: 슈퍼맨, dtype: int64


### loc 인덱싱

In [29]:
import numpy as np

In [30]:
# 3x4 frame holding the integers 10..21, with letter labels on both axes.
values = np.arange(10, 22).reshape(3, 4)
df = pd.DataFrame(values, index=list('abc'), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [31]:
# A single row label returns that row as a Series.
df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [32]:
# A label slice with .loc includes both endpoints, so rows 'b' and 'c' are returned.
df.loc['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [33]:
# Slicing with plain [] selects rows here, giving the same result as df.loc['b':'c'].
df['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [34]:
# A list of row labels returns those rows as a DataFrame.
df.loc[['b','c']]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [35]:
# Element-wise comparison on column A yields a boolean Series with df's row labels.
df.A > 15

a    False
b    False
c     True
Name: A, dtype: bool

In [36]:
# A boolean Series passed to .loc keeps only the rows where it is True.
df.loc[df.A > 15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [37]:
# 4x4 frame holding 10..25; only column names are given, so the rows keep the
# default integer labels.
grid = np.arange(10, 26).reshape(4, 4)
df2 = pd.DataFrame(grid, columns=list("ABCD"))
df2

Unnamed: 0,A,B,C,D
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [38]:
# The row labels are integers and .loc treats 1:2 as labels, so row 2 is included.
df2.loc[1:2]

Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


### 행과 열을 모두 인덱싱

In [40]:
# Show df again for reference.
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [39]:
# A row label plus a column label returns a single scalar value.
df.loc["a", "A"]

10

In [41]:
# Open-ended row slice from 'b' to the end, restricted to column 'A'.
df.loc["b":,"A"]

b    14
c    18
Name: A, dtype: int32

In [42]:
# ':' in the column position means every column; the result matches df.loc['a'].
df.loc['a',:]

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [43]:
# Lists on both axes select exactly the named rows and columns.
df.loc[['a','b'], ['B', 'D']]

Unnamed: 0,B,D
a,11,13
b,15,17


In [44]:
# Boolean row mask combined with a column list: rows where A > 10, columns C and D only.
df.loc[df.A > 10, ["C", "D"]]

Unnamed: 0,C,D
b,16,17
c,20,21


### iloc 인덱싱

In [45]:
# Show df again for reference.
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [46]:
# Integer positions on both axes: row 0, column 1.
df.iloc[0,1]

11

In [47]:
# Rows at positions 0 and 1 (the slice stop is exclusive), column at position 2.
df.iloc[:2,2]

a    12
b    16
Name: C, dtype: int32

In [48]:
# Row 0, last two columns selected with a negative slice.
df.iloc[0, -2:]

C    12
D    13
Name: a, dtype: int32

In [49]:
# Slices on both axes keep the result a DataFrame, even though only one row is selected.
df.iloc[2:3, 1:3]

Unnamed: 0,B,C
c,19,20


In [50]:
# Negative positions count from the end: -1 is the last row.
df.iloc[-1]

A    18
B    19
C    20
D    21
Name: c, dtype: int32

In [51]:
# Overwrite the last row with twice its current values.
# NOTE(review): this changes df in place, so running the cell again doubles
# the row once more.
df.iloc[-1] = df.iloc[-1] * 2
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,36,38,40,42


## 데이터프레임 수정 방법

### 열(column) 수정하기

In [12]:
# Rebuild the frame with an extra 'blood_type' column name. The source frame has
# no such column, so the new column is empty until values are assigned.
frame_add_col = pd.DataFrame(frame, columns=['age','height','weight','blood_type'])
frame_add_col

Unnamed: 0,age,height,weight,blood_type
슈퍼맨,20,183,77,
스파이더맨,23,192,83,
배트맨,48,175,65,


In [13]:
# Fill the 'blood_type' column with one value per row, in row order.
frame_add_col['blood_type'] = ['A','B','O']
frame_add_col

Unnamed: 0,age,height,weight,blood_type
슈퍼맨,20,183,77,A
스파이더맨,23,192,83,B
배트맨,48,175,65,O


### 행(row) 추가하기

In [14]:
# Work on a copy so frame_add_col is left unchanged.
frame_add_index = frame_add_col.copy()
# NOTE(review): '배트맨' is already a row label, so this assignment replaces that
# row's values rather than appending a new row; a label that is not yet in the
# index would be needed to actually add one.
frame_add_index.loc['배트맨'] = [25,170,50,'AB']
frame_add_index

Unnamed: 0,age,height,weight,blood_type
슈퍼맨,20,183,77,A
스파이더맨,23,192,83,B
배트맨,25,170,50,AB


### 행, 열 삭제하기

In [16]:
# drop() hands back a new frame without the named column; the result is only
# displayed here, not assigned.
print('remove age column:\n')
frame_add_col.drop(columns='age')

remove age column:



Unnamed: 0,height,weight,blood_type
슈퍼맨,183,77,A
스파이더맨,192,83,B
배트맨,175,65,O


In [17]:
# Remove the '배트맨' row by rebinding the name to the frame that drop() returns,
# rather than mutating the existing object with inplace=True.
frame_add_index = frame_add_index.drop(index='배트맨')

In [18]:
# Show the frame after the '배트맨' row has been removed.
frame_add_index

Unnamed: 0,age,height,weight,blood_type
슈퍼맨,20,183,77,A
스파이더맨,23,192,83,B


## 데이터 입출력

### 인덱스 지정

In [22]:
# Show the frame with its current row and column labels.
frame

Unnamed: 0,age,height,weight
슈퍼맨,20,183,77
스파이더맨,23,192,83
배트맨,48,175,65


In [25]:
# Replace the row labels by assigning a new list to .index; the data is untouched.
frame.index = ['a','b','c']
frame

Unnamed: 0,age,height,weight
a,20,183,77
b,23,192,83
c,48,175,65


### 컬럼명 지정

In [26]:
# Replace the column labels by assigning a new list to .columns; the data is untouched.
frame.columns = ['a','b','c']
frame

Unnamed: 0,a,b,c
a,20,183,77
b,23,192,83
c,48,175,65


## 데이터 개수 세기

In [53]:
# Create the Series as float from the start: NaN cannot be stored in an integer
# Series, and relying on the assignment below to silently convert the dtype is
# deprecated behaviour.
s = pd.Series(range(10), dtype=float)
# Mark the entry labelled 3 as missing.
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [54]:
# count() skips NaN, so 9 of the 10 entries are counted.
s.count()

9

In [55]:
# Fixed seed so the random values are the same on every run.
np.random.seed(2)
# 4x4 random integers below 5, stored as floats so that one cell can hold NaN.
draws = np.random.randint(5, size=(4, 4))
df = pd.DataFrame(draws, dtype=float)
# Blank out the cell at row position 2, column position 3.
df.iloc[2, 3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [56]:
# On a DataFrame, count() reports the number of non-NaN values in each column.
df.count()

0    4
1    4
2    4
3    3
dtype: int64

## 정렬

In [60]:
# Tally each distinct value (the NaN entry is left out), then order the tally
# by value rather than by frequency.
s.value_counts().sort_index()

0.0    1
1.0    1
2.0    1
4.0    1
5.0    1
6.0    1
7.0    1
8.0    1
9.0    1
dtype: int64

## 행/열 합계

In [61]:
# Fixed seed so the random values are the same on every run.
np.random.seed(1)
# 4 rows x 8 columns of random single digits (0-9).
digits = np.random.randint(10, size=(4, 8))
df2 = pd.DataFrame(digits)
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,5,8,9,5,0,0,1,7
1,6,9,2,4,5,2,4,2
2,4,7,7,9,1,7,0,6
3,9,9,7,6,9,1,0,1


In [62]:
# axis=1 adds across the columns, giving one total per row.
df2.sum(axis=1)

0    35
1    34
2    41
3    42
dtype: int64

In [63]:
# Store each row's total in a new 'RowSum' column.
# Any existing 'RowSum' column is left out of the sum (errors='ignore' makes the
# drop a no-op on the first run), so re-running the cell does not add the
# previous total into the new one.
df2['RowSum'] = df2.drop(columns='RowSum', errors='ignore').sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,5,8,9,5,0,0,1,7,35
1,6,9,2,4,5,2,4,2,34
2,4,7,7,9,1,7,0,6,41
3,9,9,7,6,9,1,0,1,42


In [64]:
# With no axis argument the sum runs down each column, 'RowSum' included.
df2.sum()

0          24
1          33
2          25
3          24
4          15
5          10
6           5
7          16
RowSum    152
dtype: int64

In [65]:
# Append a 'ColTotal' row holding each column's total.
# Any existing 'ColTotal' row is left out of the sum (errors='ignore' makes the
# drop a no-op on the first run), so re-running the cell does not count the
# previous totals again.
df2.loc['ColTotal', :] = df2.drop(index='ColTotal', errors='ignore').sum()
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,5.0,8.0,9.0,5.0,0.0,0.0,1.0,7.0,35.0
1,6.0,9.0,2.0,4.0,5.0,2.0,4.0,2.0,34.0
2,4.0,7.0,7.0,9.0,1.0,7.0,0.0,6.0,41.0
3,9.0,9.0,7.0,6.0,9.0,1.0,0.0,1.0,42.0
ColTotal,24.0,33.0,25.0,24.0,15.0,10.0,5.0,16.0,152.0


## apply 변환

In [66]:
# Small integer frame used by the apply() examples: three columns, five rows.
df3 = pd.DataFrame(
    dict(
        A=[1, 3, 4, 3, 4],
        B=[2, 3, 1, 2, 3],
        C=[1, 5, 2, 4, 4],
    )
)
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


### 각 열의 최댓값과 최솟값의 차이

In [67]:
# apply() calls the function once per column by default; here it returns each
# column's max minus min.
df3.apply(lambda x: x.max() - x.min())

A    3
B    2
C    4
dtype: int64

In [69]:
# axis=1 passes each row to the function instead, giving max minus min per row.
df3.apply(lambda x: x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [70]:
# Count how often each value occurs in every column; values a column does not
# contain show up as NaN. The Series method is used because the top-level
# pd.value_counts function is deprecated.
df3.apply(pd.Series.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


## 데이터프레임 인덱스 조작

In [71]:
np.random.seed(0)
df1 = pd.DataFrame(np.vstack([list('ABCDE'),
                              np.round(np.random.rand(3, 5), 2)]).T,
                   columns=["C1", "C2", "C3", "C4"])
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


### 특정 열을 인덱스로 설정

In [72]:
# set_index returns a new frame in which 'C1' has moved from the columns to the index.
df2 = df1.set_index("C1")
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


In [73]:
# Setting a different column as the index replaces the current one; the 'C1'
# labels do not appear in the result. The result is only displayed, not assigned.
df2.set_index("C2")

Unnamed: 0_level_0,C3,C4
C2,Unnamed: 1_level_1,Unnamed: 2_level_1
0.55,0.65,0.79
0.72,0.44,0.53
0.6,0.89,0.57
0.54,0.96,0.93
0.42,0.38,0.07


In [74]:
# reset_index turns the index back into an ordinary column and restores the
# default integer row labels.
df2.reset_index()

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [75]:
df2.reset_index(drop=True)  # discard the old index instead of keeping it as a column

Unnamed: 0,C2,C3,C4
0,0.55,0.65,0.79
1,0.72,0.44,0.53
2,0.6,0.89,0.57
3,0.54,0.96,0.93
4,0.42,0.38,0.07
