# Pandas Library
## 1. Pandas 소개
### 소개
- 파이썬에서 사용하는 데이터분석 라이브러리
- 행과 열로 이루어진 데이터 객체를 만들어 다룰 수 있게 됨
- 보다 안정적으로 대용량의 데이터를 처리할 수 있음
- Pandas library는 다양한 형태의 데이터를 다루기에 용이하다.
- NumPy library가 numerical value를 다루는 데에 최적화되어있었다면, Pandas library는 보다 다양한 형태의 여러 데이터를 함께 다루기 용이함
- Panel data: 여러 개체들을 복수의 시간에 걸쳐 추적하여 얻는 데이터
- Big data analysis에 사용될 수 있다.
- Flexible indexing: non-integer index를 사용할 수 있다.

In [1]:
import pandas as pd

## 2. Pandas 자료구조
### 1) Series 생성

In [2]:
grades = pd.Series(range(70,100,2))
grades

0     70
1     72
2     74
3     76
4     78
5     80
6     82
7     84
8     86
9     88
10    90
11    92
12    94
13    96
14    98
dtype: int64

#### describe(): Series data의 descriptive statistics 확인

In [3]:
grades.describe()

count    15.000000
mean     84.000000
std       8.944272
min      70.000000
25%      77.000000
50%      84.000000
75%      91.000000
max      98.000000
dtype: float64

In [4]:
print(len(grades))

15


In [7]:
type(grades)

pandas.core.series.Series

#### Index를 지정한 Series 생성

In [14]:
height = pd.Series([160,170,180], index=["b","c","d"])
height

b    160
c    170
d    180
dtype: int64

In [13]:
height = pd.Series([160,170,180], index=("d","b","c"))
height

d    160
b    170
c    180
dtype: int64

In [15]:
height.b

160

In [16]:
height["b"]

160

In [19]:
# Select the first element by position explicitly. Passing an integer to []
# on a Series with a non-integer index is a deprecated positional lookup in
# recent pandas; .iloc states the intent and returns the same value.
height.iloc[0]

160

#### Dictionary를 사용한 Series 생성

In [20]:
nations = pd.Series({'Korea':82, 'USA':1,'China':'cn'})
nations

Korea    82
USA       1
China    cn
dtype: object

In [21]:
# Select the third element by position explicitly. Integer keys passed to []
# on a string-labelled Series are a deprecated positional lookup in recent
# pandas; .iloc returns the same value without relying on that fallback.
nations.iloc[2]

'cn'

### 2) DataFrame 생성
#### pd.DataFrame(data,index=~)
- Dictionary를 사용하여 DataFrame을 생성할 수 있다.
- Dictionary의 key는 column이 된다.
- Dictionary의 value는 모두 같은 길이의 리스트여야 한다.

In [22]:
scores = {'A':[10,20,30]
         ,'B':[50,40,30]
         ,'C':[15,16,15]}
print(scores)
print(type(scores))

{'A': [10, 20, 30], 'B': [50, 40, 30], 'C': [15, 16, 15]}
<class 'dict'>


In [23]:
scores_df = pd.DataFrame(scores)
print(scores_df)

    A   B   C
0  10  50  15
1  20  40  16
2  30  30  15


In [24]:
scores_df

Unnamed: 0,A,B,C
0,10,50,15
1,20,40,16
2,30,30,15


#### Index를 지정하여 DataFrame을 생성하거나 생성한 DataFrame의 index를 바꿀 수 있다.

In [25]:
# Change the index labels of the existing DataFrame in place.
scores_df.index = ['M',"S",'A']
scores_df

Unnamed: 0,A,B,C
M,10,50,15
S,20,40,16
A,30,30,15


In [27]:
scores_df.index

Index(['M', 'S', 'A'], dtype='object')

In [28]:
# Rename the columns through the `columns` attribute. The misspelled
# `colums` only attached an unrelated attribute to the DataFrame and left
# the column labels unchanged.
scores_df.columns=['index','index1','index3']

  """Entry point for launching an IPython kernel.


In [29]:
scores_df_with_index = pd.DataFrame(scores,index=['M','S','A'])
scores_df_with_index

Unnamed: 0,A,B,C
M,10,50,15
S,20,40,16
A,30,30,15


In [30]:
scores_df_with_index_over = pd.DataFrame(scores,index=['M','S','A','T'])
scores_df_with_index_over

ValueError: Shape of passed values is (3, 3), indices imply (4, 3)

#### Column을 지정하여 DataFrame을 생성할 수 있다.
Index와 달리 Dictionary에 없는 column도 포함하여 생성할 수 있다.
Dictionary에 없는 column은 NaN으로 초기화된다.

In [31]:
scores_df_with_index_columns = pd.DataFrame(scores
                                            ,index=['M','S','Y']
                                           ,columns=['A','B','C','D'])
scores_df_with_index_columns

Unnamed: 0,A,B,C,D
M,10,50,15,
S,20,40,16,
Y,30,30,15,


In [32]:
# Rename the columns; the new list must have one label per existing column.
scores_df_with_index_columns.columns = ['a','b','c','d']
scores_df_with_index_columns

Unnamed: 0,a,b,c,d
M,10,50,15,
S,20,40,16,
Y,30,30,15,


In [34]:

scores_df_with_index_columns.columns = ['a','b','c','d','e']
scores_df_with_index_columns

ValueError: Length mismatch: Expected axis has 4 elements, new values have 5 elements

* 생성한 DataFrame의 index와 column, 값을 확인할 수 있다.
#### df
#### df.index
#### df.columns
#### df.values

In [35]:
scores_df_with_index_columns

Unnamed: 0,a,b,c,d
M,10,50,15,
S,20,40,16,
Y,30,30,15,


In [36]:
scores_df_with_index_columns.values

array([[10, 50, 15, nan],
       [20, 40, 16, nan],
       [30, 30, 15, nan]], dtype=object)

In [37]:
scores_df_with_index_columns.index

Index(['M', 'S', 'Y'], dtype='object')

In [38]:
scores_df_with_index_columns.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

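* 각 index와 column의 이름을 정의할 수 있다. 단, DataFrame 자체에는 기본 `name` 속성이 없어서 설정 전에 읽으면 AttributeError가 발생한다. `df.name = ...`은 객체에 일반 Python 속성을 붙이는 것일 뿐이므로, 복사나 연산으로 만들어진 새 DataFrame에는 유지되지 않을 수 있다.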

In [40]:
print(scores_df_with_index_columns.index.name)
print(scores_df_with_index_columns.columns.name)
print(scores_df_with_index_columns.name)

None
None


AttributeError: 'DataFrame' object has no attribute 'name'

In [41]:
scores_df_with_index_columns.index.name = "Subjects"
scores_df_with_index_columns.columns.name = "Name"
scores_df_with_index_columns.name = "Scores"

In [43]:
scores_df_with_index_columns.name

'Scores'

* describe() :  DataFrame의 계산 가능한 값들을 확인할 수 있다.

In [55]:
desc=scores_df_with_index_columns.describe()
desc

Name,a,b,c
count,3.0,3.0,3.0
mean,20.0,40.0,15.333333
std,10.0,10.0,0.57735
min,10.0,30.0,15.0
25%,15.0,35.0,15.0
50%,20.0,40.0,15.0
75%,25.0,45.0,15.5
max,30.0,50.0,16.0


#### 데이터프레임 데이터타입
- `df.dtypes` df 의 데이터 타입 확인
- 모든 열의 데이터 타입을 변경 `df.astype(dtype)`
- 특정 칼럼의 데이터 타입을 변경 `df.astype({"컬럼명":dtype})`

In [56]:
desc.dtypes

Name
a    float64
b    float64
c    float64
dtype: object

In [57]:
desc.astype('int')

Name,a,b,c
count,3,3,3
mean,20,40,15
std,10,10,0
min,10,30,15
25%,15,35,15
50%,20,40,15
75%,25,45,15
max,30,50,16


In [61]:
desc_changed=desc.astype({'a':'int','b':'int'})

In [62]:
desc_changed.dtypes

Name
a      int32
b      int32
c    float64
dtype: object

## 3. DataFrame Indexing
* DataFrame을 만들 때 index, columns를 설정하지 않으면 기본값으로 0부터 시작하는 정수형 숫자로 index와 column이 초기화된다

In [63]:
import numpy as np

In [92]:
df = pd.DataFrame(np.random.randn(6,4))
df

Unnamed: 0,0,1,2,3
0,0.711003,0.029406,-0.296875,2.529935
1,-0.537456,0.246846,-1.815685,-0.012548
2,0.023277,0.113713,1.381328,0.367186
3,-0.058526,1.069196,-0.147599,-0.837621
4,-0.333083,0.291673,-0.644198,1.172412
5,1.91023,-0.665238,-1.369032,0.072684


In [93]:
df = pd.DataFrame(np.random.randn(6,4))
df.columns = ['A','B','C','D']
df.index=pd.date_range('20201129',periods=6)
df

Unnamed: 0,A,B,C,D
2020-11-29,0.711003,0.029406,-0.296875,2.529935
2020-11-30,-0.537456,0.246846,-1.815685,-0.012548
2020-12-01,0.023277,0.113713,1.381328,0.367186
2020-12-02,-0.058526,1.069196,-0.147599,-0.837621
2020-12-03,-0.333083,0.291673,-0.644198,1.172412
2020-12-04,1.91023,-0.665238,-1.369032,0.072684


- `pd.date_range('20201129', periods=6)`: 시작 날짜부터 하루 간격으로 6개의 날짜로 이루어진 DatetimeIndex를 생성한다. 아래처럼 시작 날짜와 종료 날짜를 함께 지정해도 같은 결과를 얻을 수 있다.

In [68]:
pd.date_range('20201129','20201204')

DatetimeIndex(['2020-11-29', '2020-11-30', '2020-12-01', '2020-12-02',
               '2020-12-03', '2020-12-04'],
              dtype='datetime64[ns]', freq='D')

### 1) Column을 선택하고 조작하기
- Column 이름을 사용하여 DataFrame의 column에 접근한다.

In [74]:
df

Unnamed: 0,A,B,C,D
2020-11-29,-0.535361,0.42169,0.470385,0.762099
2020-11-30,-0.374484,-0.683701,-0.036078,1.012179
2020-12-01,0.201985,-0.559024,-2.365288,-1.225365
2020-12-02,-0.35856,0.58227,-0.530562,-1.516931
2020-12-03,-0.379146,-1.695595,0.463151,-2.356435
2020-12-04,-0.591298,0.145629,0.016083,1.17899


In [75]:
df.C

2020-11-29    0.470385
2020-11-30   -0.036078
2020-12-01   -2.365288
2020-12-02   -0.530562
2020-12-03    0.463151
2020-12-04    0.016083
Freq: D, Name: C, dtype: float64

In [76]:
df."C"

SyntaxError: invalid syntax (<ipython-input-76-d39c0f4a81fb>, line 1)

In [77]:
df[C]

NameError: name 'C' is not defined

In [78]:
df["C"]

2020-11-29    0.470385
2020-11-30   -0.036078
2020-12-01   -2.365288
2020-12-02   -0.530562
2020-12-03    0.463151
2020-12-04    0.016083
Freq: D, Name: C, dtype: float64

In [79]:
df[2]

KeyError: 2

In [80]:
df[['A','B']]

Unnamed: 0,A,B
2020-11-29,-0.535361,0.42169
2020-11-30,-0.374484,-0.683701
2020-12-01,0.201985,-0.559024
2020-12-02,-0.35856,0.58227
2020-12-03,-0.379146,-1.695595
2020-12-04,-0.591298,0.145629


In [81]:
df['A','B']

KeyError: ('A', 'B')

In [82]:
df[('A','B')]

KeyError: ('A', 'B')

In [85]:
df_A=df[['A']]
df_A

Unnamed: 0,A
2020-11-29,-0.535361
2020-11-30,-0.374484
2020-12-01,0.201985
2020-12-02,-0.35856
2020-12-03,-0.379146
2020-12-04,-0.591298


In [86]:
df_AA=df['A']
df_AA

2020-11-29   -0.535361
2020-11-30   -0.374484
2020-12-01    0.201985
2020-12-02   -0.358560
2020-12-03   -0.379146
2020-12-04   -0.591298
Freq: D, Name: A, dtype: float64

In [87]:
print(type(df_A))
print(type(df_AA))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


* 여러 column을 함께 선택할 수 있다

* Column의 값을 수정할 수 있다

In [88]:
df

Unnamed: 0,A,B,C,D
2020-11-29,-0.535361,0.42169,0.470385,0.762099
2020-11-30,-0.374484,-0.683701,-0.036078,1.012179
2020-12-01,0.201985,-0.559024,-2.365288,-1.225365
2020-12-02,-0.35856,0.58227,-0.530562,-1.516931
2020-12-03,-0.379146,-1.695595,0.463151,-2.356435
2020-12-04,-0.591298,0.145629,0.016083,1.17899


In [90]:
df['D'] = 0.5
df['C'] = [1,2,3,4,5,6]
df

Unnamed: 0,A,B,C,D
2020-11-29,-0.535361,0.42169,1,0.5
2020-11-30,-0.374484,-0.683701,2,0.5
2020-12-01,0.201985,-0.559024,3,0.5
2020-12-02,-0.35856,0.58227,4,0.5
2020-12-03,-0.379146,-1.695595,5,0.5
2020-12-04,-0.591298,0.145629,6,0.5


In [91]:
df = 0.5
df

0.5

In [94]:
# df was re-created by re-running the creation cell above before continuing
# here (the previous cell had rebound df to the scalar 0.5).
df

Unnamed: 0,A,B,C,D
2020-11-29,0.711003,0.029406,-0.296875,2.529935
2020-11-30,-0.537456,0.246846,-1.815685,-0.012548
2020-12-01,0.023277,0.113713,1.381328,0.367186
2020-12-02,-0.058526,1.069196,-0.147599,-0.837621
2020-12-03,-0.333083,0.291673,-0.644198,1.172412
2020-12-04,1.91023,-0.665238,-1.369032,0.072684


In [95]:
df[['B','C']]=0.5
df

Unnamed: 0,A,B,C,D
2020-11-29,0.711003,0.5,0.5,2.529935
2020-11-30,-0.537456,0.5,0.5,-0.012548
2020-12-01,0.023277,0.5,0.5,0.367186
2020-12-02,-0.058526,0.5,0.5,-0.837621
2020-12-03,-0.333083,0.5,0.5,1.172412
2020-12-04,1.91023,0.5,0.5,0.072684


* 새로운 column을 추가할 수 있다

In [98]:
x = np.arange(6)
print(x)
print(type(x))

[0 1 2 3 4 5]
<class 'numpy.ndarray'>


In [99]:
df['E'] = np.arange(6)
df

Unnamed: 0,A,B,C,D,E
2020-11-29,0.711003,0.5,0.5,2.529935,0
2020-11-30,-0.537456,0.5,0.5,-0.012548,1
2020-12-01,0.023277,0.5,0.5,0.367186,2
2020-12-02,-0.058526,0.5,0.5,-0.837621,3
2020-12-03,-0.333083,0.5,0.5,1.172412,4
2020-12-04,1.91023,0.5,0.5,0.072684,5


* Series를 새로운 column으로 추가할 수 있다.

In [113]:
ss = df.index

In [106]:
data = pd.Series([1,2,3,4,5,6],index=ss)
print(data)

2020-11-29    1
2020-11-30    2
2020-12-01    3
2020-12-02    4
2020-12-03    5
2020-12-04    6
Freq: D, dtype: int64


In [111]:
# Drop the first and last dates so the Series keeps only 2020-11-30 through
# 2020-12-03. This matches the displayed result and the NaN values that
# appear for 2020-11-29 and 2020-12-04 when the Series is added as column F.
del data['2020-11-29']
del data['2020-12-04']
data

2020-11-30    2
2020-12-01    3
2020-12-02    4
2020-12-03    5
Freq: D, dtype: int64

In [112]:
df['F']=data
df

Unnamed: 0,A,B,C,D,E,F
2020-11-29,0.711003,0.5,0.5,2.529935,0,
2020-11-30,-0.537456,0.5,0.5,-0.012548,1,2.0
2020-12-01,0.023277,0.5,0.5,0.367186,2,3.0
2020-12-02,-0.058526,0.5,0.5,-0.837621,3,4.0
2020-12-03,-0.333083,0.5,0.5,1.172412,4,5.0
2020-12-04,1.91023,0.5,0.5,0.072684,5,


* 산술 연산을 이용하여 새로운 column을 추가할 수 있다.

* column을 삭제할 수 있다