In [5]:
import numpy as np
import pandas as pd

# Data Structure

## Series

In [6]:
# Small labelled Series: the values 1-4 stored under the labels 'a'-'d'.
s = pd.Series(
    data=[1, 2, 3, 4],
    index=list('abcd'),
)
s

a    1
b    2
c    3
d    4
dtype: int64

## DataFrame

In [8]:
# Column-oriented records: each key is a column name, each list holds that
# column's values in row order.
data = dict(
    Country=['Korea', 'china', 'japan'],
    Capital=['seoul', 'beijing', 'tokyo'],
    Population=[1111, 2222, 3333],
)
data

{'Country': ['Korea', 'china', 'japan'],
 'Capital': ['seoul', 'beijing', 'tokyo'],
 'Population': [1111, 2222, 3333]}

In [11]:
# Build the frame from `data`, stating the column order explicitly.
df = pd.DataFrame(
    data,
    columns=['Country', 'Capital', 'Population'],
)
df

Unnamed: 0,Country,Capital,Population
0,Korea,seoul,1111
1,china,beijing,2222
2,japan,tokyo,3333


In [13]:
df

Unnamed: 0,Country,Capital,Population
0,Korea,seoul,1111
1,china,beijing,2222
2,japan,tokyo,3333


In [14]:
s

a    1
b    2
c    3
d    4
dtype: int64

In [15]:
# Label-based lookup on a Series: returns the scalar stored under 'c'.
s['c']

3

In [17]:
# A slice inside [] selects rows of a DataFrame; here, every row after the first.
df[1:]

Unnamed: 0,Country,Capital,Population
1,china,beijing,2222
2,japan,tokyo,3333


## 선택, 논리 인덱싱, 설정

In [19]:
# Positional selection with list indexers on both axes: the result stays a
# DataFrame (one row, one column) rather than collapsing to a scalar.
df.iloc[[0], [0]]

Unnamed: 0,Country
0,Korea


In [20]:
# Scalar access by integer position (row 0, column 0).
df.iat[0, 0]

'Korea'

In [21]:
# Scalar access by label: row label 0, column label 'Country'.
df.loc[0, 'Country']

'Korea'

In [22]:
# A single row label returns that row as a Series whose index is the column names.
df.loc[2]

Country       japan
Capital       tokyo
Population     3333
Name: 2, dtype: object

In [23]:
s

a    1
b    2
c    3
d    4
dtype: int64

In [26]:
# Boolean indexing: keep only the entries where the mask `s > 1` is True.
s[s > 1]

b    2
c    3
d    4
dtype: int64

1보다 크거나(OR, `|`) 3보다 큰 값을 선택한 Series — `s > 3`은 `s > 1`에 포함되므로 결과는 `s[s > 1]`과 같다

In [28]:
# Combine two masks element-wise with `|` (OR). Each comparison needs its own
# parentheses because `|` binds more tightly than `>` in Python.
s[(s>1) | (s>3)]

b    2
c    3
d    4
dtype: int64

In [29]:
df

Unnamed: 0,Country,Capital,Population
0,Korea,seoul,1111
1,china,beijing,2222
2,japan,tokyo,3333


In [31]:
# Row filter: keep the rows whose Population exceeds 2000.
df[df['Population'] > 2000]

Unnamed: 0,Country,Capital,Population
1,china,beijing,2222
2,japan,tokyo,3333


값 설정: 인덱스 라벨 'd'에 해당하는 값을 0으로 변경

In [32]:
s

a    1
b    2
c    3
d    4
dtype: int64

In [33]:
# Assignment through a label overwrites the value stored under 'd' in place,
# so every later cell that reads `s` sees d == 0 instead of 4.
s['d'] = 0
s

a    1
b    2
c    3
d    0
dtype: int64

## 제거

In [34]:
s

a    1
b    2
c    3
d    0
dtype: int64

행 삭제(axis=0)

In [35]:
# drop() returns a new Series without the label 'a'; `s` itself is not modified
# (later cells still show 'a' in `s`).
s.drop('a', axis=0)

b    2
c    3
d    0
dtype: int64

열 삭제(axis=1)

In [36]:
# axis=1 makes drop() look up 'Country' among the column labels. The result is a
# new frame; `df` keeps all three columns.
df.drop('Country', axis=1)

Unnamed: 0,Capital,Population
0,seoul,1111
1,beijing,2222
2,tokyo,3333


## 정렬, 순위

In [37]:
# Sort rows by the 'Country' strings (ascending).
# NOTE(review): string comparison is case-sensitive, so capitalised 'Korea'
# sorts ahead of the lowercase names — confirm the mixed casing is intended.
df.sort_values(by='Country')

Unnamed: 0,Country,Capital,Population
0,Korea,seoul,1111
1,china,beijing,2222
2,japan,tokyo,3333


In [39]:
df

Unnamed: 0,Country,Capital,Population
0,Korea,seoul,1111
1,china,beijing,2222
2,japan,tokyo,3333


In [40]:
# Rank the values within each column (1.0 = smallest); the string columns are
# ranked by their sort order, e.g. 'beijing' gets rank 1.0 in Capital.
df.rank()

Unnamed: 0,Country,Capital,Population
0,1.0,2.0,1.0
1,2.0,1.0,2.0
2,3.0,3.0,3.0


In [41]:
# Sort rows by 'Capital'. The original index labels travel with their rows
# (1, 0, 2), and `df` itself is left in its original order.
df.sort_values(by='Capital')

Unnamed: 0,Country,Capital,Population
1,china,beijing,2222
0,Korea,seoul,1111
2,japan,tokyo,3333


In [42]:
# Ranking again after the sort above gives the same table: sort_values()
# returned a new frame and did not reorder `df`.
df.rank()

Unnamed: 0,Country,Capital,Population
0,1.0,2.0,1.0
1,2.0,1.0,2.0
2,3.0,3.0,3.0


## Series와 DataFrame의 정보 조회

행과 열의 개수 (행 수, 열 수)

In [44]:
df.shape

(3, 3)

인덱스 정보

In [46]:
df.index

RangeIndex(start=0, stop=3, step=1)

데이터프레임의 컬럼 정보

In [48]:
df.columns

Index(['Country', 'Capital', 'Population'], dtype='object')

데이터프레임에 대한 정보

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Country       3 non-null object
Capital       3 non-null object
Population    3 non-null int64
dtypes: int64(1), object(2)
memory usage: 152.0+ bytes


결측치(NA)를 제외한 값의 수

In [50]:
df.count()

Country       3
Capital       3
Population    3
dtype: int64

값의 합

In [51]:
# Column-wise sum. For the string columns "sum" means concatenation, which is
# why Country and Capital come back as one joined string each.
df.sum()

Country         Koreachinajapan
Capital       seoulbeijingtokyo
Population                 6666
dtype: object

값의 누적합

In [52]:
# Running total down each column; string columns accumulate by concatenation.
df.cumsum()

Unnamed: 0,Country,Capital,Population
0,Korea,seoul,1111
1,Koreachina,seoulbeijing,3333
2,Koreachinajapan,seoulbeijingtokyo,6666


값의 최솟값

In [53]:
# Column-wise minimum. Strings are compared lexicographically and uppercase
# letters order before lowercase, so 'Korea' is the minimum Country.
df.min()

Country         Korea
Capital       beijing
Population       1111
dtype: object

값의 최댓값

In [55]:
# Column-wise maximum; string columns use lexicographic comparison.
df.max()

Country       japan
Capital       tokyo
Population     3333
dtype: object

기술통계 요약

In [57]:
# Summary statistics. Only the numeric column (Population) appears in the
# result; the string columns are left out.
df.describe()

Unnamed: 0,Population
count,3.0
mean,2222.0
std,1111.0
min,1111.0
25%,1666.5
50%,2222.0
75%,2777.5
max,3333.0


값들의 평균

In [58]:
# Average of the numeric columns only. Country and Capital hold strings, and
# pandas >= 2.0 raises TypeError on them instead of silently skipping them,
# so the restriction is stated explicitly.
df.mean(numeric_only=True)

Population    2222.0
dtype: float64

값들의 중위수

In [59]:
# Median of the numeric columns only. Country and Capital hold strings, and
# pandas >= 2.0 raises TypeError on them instead of silently skipping them,
# so the restriction is stated explicitly.
df.median(numeric_only=True)

Population    2222.0
dtype: float64

## 데이터 정렬(Alignment)

내부 데이터 정렬: 연산 시 위치가 아니라 인덱스 라벨을 기준으로 값을 맞추며, 한쪽에만 있는 라벨의 결과는 NaN이 된다

In [61]:
# Second Series whose labels come in a different order ('a', 'c', 'b') and
# which has no 'd' entry.
s3 = pd.Series(
    data=[1, 2, 3],
    index=list('acb'),
)
s3

a    1
c    2
b    3
dtype: int64

In [62]:
s

a    1
b    2
c    3
d    0
dtype: int64

In [63]:
# Addition pairs values by index label, not by position. 'd' exists only in
# `s`, so its result is NaN and the whole result is float64.
s + s3

a    2.0
b    5.0
c    5.0
d    NaN
dtype: float64