## Table

In [89]:
import numpy as np
import pandas as pd

In [90]:
# Build a Series from the first five perfect squares; no index is passed, so
# pandas assigns the default integer labels.
s = pd.Series([n ** 2 for n in range(1, 6)])
s

0     1
1     4
2     9
3    16
4    25
dtype: int64

In [91]:
# Build a Series from a dict: the keys become the index labels and the values
# become the data.
# NOTE(review): the key ' four' carries a leading space, which looks like a
# typo; it is kept as-is so the labels still match the recorded outputs.
t = pd.Series(
    {
        'one': 1,
        'two': 2,
        'three': 3,
        ' four': 4,
        'five': 5,
    }
)
t

one      1
two      2
three    3
 four    4
five     5
dtype: int64

### Series는 NumPy ndarray와 유사하다

In [92]:
# On the default integer index, s[1] returns the element whose index label is 1.
s[1]

4

In [93]:
# Positional access: take the element at position 1 regardless of the string
# labels. Plain t[1] relies on the deprecated integer-as-position fallback of
# Series.__getitem__ on a non-integer index, so .iloc is used instead.
t.iloc[1]

2

In [94]:
# An integer slice is positional here: it returns positions 1 and 2 (the end
# of the slice is exclusive).
t[1:3]

two      2
three    3
dtype: int64

In [95]:
# Boolean-mask selection: keep only the values greater than the Series' own median.
s[s > s.median()]

3    16
4    25
dtype: int64

In [96]:
# A list of index labels selects several elements, returned in the order given.
s[[3,1,4]]

3    16
1     4
4    25
dtype: int64

In [97]:
# NumPy functions accept a Series directly: np.exp is applied element-wise and
# the result is a new float Series that keeps the original index.
# numpy is imported as `np` in the import cell at the top of the notebook.
np.exp(s)

0    2.718282e+00
1    5.459815e+01
2    8.103084e+03
3    8.886111e+06
4    7.200490e+10
dtype: float64

In [98]:
# dtype of the values stored in the Series.
s.dtype


dtype('int64')

### Series는 dict와 유사하다

In [99]:
# Show t again before using it like a dict.
t

one      1
two      2
three    3
 four    4
five     5
dtype: int64

In [100]:
# Look up a value by its index label, the same way a dict is read by key.
t['one']

1

In [101]:
# Add a value to the Series: assigning to a label that does not exist yet
# appends a new entry (this mutates t in place).

t['six'] = 6

t

one      1
two      2
three    3
 four    4
five     5
six      6
dtype: int64

In [102]:
# `in` tests membership against the index labels, like `in` on a dict's keys.
'six' in t

True

In [103]:
# A label that was never added is not found.
'seven' in t

False

In [104]:
# t['seven']  # left commented out: 'seven' is not in the index, so this lookup would raise KeyError

In [105]:
# .get() returns None instead of raising when the label is missing, so this
# cell shows no output.
t.get('seven')

In [106]:
# The second argument is the fallback value returned for a missing label.
t.get('seven', 0)

0

### Series에 이름 붙이기

- `name` 속성을 가지고 있다.
- 처음 Series를 만들 때 이름을 붙일 수 있다.

In [107]:
# Create a Series of five draws from np.random.randn and attach a name at
# construction time.
# NOTE(review): no random seed is set, so the values differ on every run and
# will not match the recorded output.
s = pd.Series(
    data=np.random.randn(5),
    name="random_nums",
)
s

0   -1.149532
1   -1.796879
2    0.433547
3    0.623636
4   -1.784287
Name: random_nums, dtype: float64

In [108]:
# The name can also be changed afterwards by assigning to the .name attribute;
# the values themselves are unchanged.
s.name = "임의의 난수"

s

0   -1.149532
1   -1.796879
2    0.433547
3    0.623636
4   -1.784287
Name: 임의의 난수, dtype: float64

## Pandas로 2차원 데이터 다루기 - dataframe

In [109]:
# Column-oriented input: each dict key becomes a column name and each list
# supplies that column's values.
d = {
    "height": [1, 2, 3, 4],
    "weight": [30, 40, 50, 60],
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,height,weight
0,1,30
1,2,40
2,3,50
3,4,60


In [110]:
# Check the dtype of every column.

df.dtypes  # note the trailing "s": a DataFrame exposes `dtypes` (one dtype per column)

height    int64
weight    int64
dtype: object

### From CSV to DataFrame

- CSV(Comma-Separated Values) 파일을 읽어 DataFrame으로 생성할 수 있다.
- `.read_csv()` 를 이용

In [111]:
# Assumes country_wise_latest.csv exists in the same directory as this notebook:

covid = pd.read_csv("./country_wise_latest.csv")
covid

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.50,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.00,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.60,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,West Bank and Gaza,10621,78,3752,6791,152,2,0,0.73,35.33,2.08,8916,1705,19.12,Eastern Mediterranean
183,Western Sahara,10,1,8,1,0,0,0,10.00,80.00,12.50,10,0,0.00,Africa
184,Yemen,1691,483,833,375,10,4,36,28.56,49.26,57.98,1619,72,4.45,Eastern Mediterranean
185,Zambia,4552,140,2815,1597,71,1,465,3.08,61.84,4.97,3326,1226,36.86,Africa


### Pandas 활용 1. 일부분만 관찰하기

`head(n)` : 처음 n개의 데이터 참조

In [112]:
# Inspect the first 5 rows, counting from the top.

covid.head(5)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


`tail(n)` : 마지막 n개의 데이터를 참조

In [113]:
# Inspect the last 5 rows, counting from the bottom.

covid.tail(5)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
182,West Bank and Gaza,10621,78,3752,6791,152,2,0,0.73,35.33,2.08,8916,1705,19.12,Eastern Mediterranean
183,Western Sahara,10,1,8,1,0,0,0,10.0,80.0,12.5,10,0,0.0,Africa
184,Yemen,1691,483,833,375,10,4,36,28.56,49.26,57.98,1619,72,4.45,Eastern Mediterranean
185,Zambia,4552,140,2815,1597,71,1,465,3.08,61.84,4.97,3326,1226,36.86,Africa
186,Zimbabwe,2704,36,542,2126,192,2,24,1.33,20.04,6.64,1713,991,57.85,Africa


### Pandas 활용 2. 데이터 접근하기

- `df['column_name']` or `df.column_name`

In [114]:
# Select a single column by its name.
covid['Confirmed']

0      36263
1       4880
2      27973
3        907
4        950
       ...  
182    10621
183       10
184     1691
185     4552
186     2704
Name: Confirmed, Length: 187, dtype: int64

In [115]:
# covid.WHO Region  # left commented out: attribute access cannot be used for a column name containing a space; use covid['WHO Region'] instead

### Honey Tip! DataFrame의 각 column은 "Series"다!

In [116]:
# Confirm that a single column of the DataFrame is a pandas Series.
type(covid['Confirmed'])

pandas.core.series.Series

In [117]:
# Because the column is a Series, a second [] indexes into it: here the
# element whose index label is 0.
covid['Confirmed'][0]

36263

In [118]:
# Slice the column the same way: rows 1 through 4 (the slice end is exclusive).
covid['Confirmed'][1:5]

1     4880
2    27973
3      907
4      950
Name: Confirmed, dtype: int64

### Pandas 활용 3. "조건"을 이용해서 데이터 접근하기

In [119]:
# Find the countries with more than 100 new confirmed cases.

covid[covid['New cases'] > 100].head(5)  # the boolean result of the comparison is used again as the key into covid

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
6,Argentina,167416,3059,72575,91782,4890,120,2057,1.83,43.35,4.21,130774,36642,28.02,Americas
8,Australia,15303,167,9311,5825,368,6,137,1.09,60.84,1.79,12428,2875,23.13,Western Pacific


In [120]:
# Goal: find the countries whose WHO Region is South-East Asia.

# First list the distinct values that occur in the column (duplicates removed).
covid['WHO Region'].unique()

array(['Eastern Mediterranean', 'Europe', 'Africa', 'Americas',
       'Western Pacific', 'South-East Asia'], dtype=object)

In [121]:
# Keep only the rows whose 'WHO Region' equals 'South-East Asia'.
covid[covid['WHO Region'] == 'South-East Asia']

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
13,Bangladesh,226225,2965,125683,97577,2772,37,1801,1.31,55.56,2.36,207453,18772,9.05,South-East Asia
19,Bhutan,99,0,86,13,4,0,1,0.0,86.87,0.0,90,9,10.0,South-East Asia
27,Burma,350,6,292,52,0,0,2,1.71,83.43,2.05,341,9,2.64,South-East Asia
79,India,1480073,33408,951166,495499,44457,637,33598,2.26,64.26,3.51,1155338,324735,28.11,South-East Asia
80,Indonesia,100303,4838,58173,37292,1525,57,1518,4.82,58.0,8.32,88214,12089,13.7,South-East Asia
106,Maldives,3369,15,2547,807,67,0,19,0.45,75.6,0.59,2999,370,12.34,South-East Asia
119,Nepal,18752,48,13754,4950,139,3,626,0.26,73.35,0.35,17844,908,5.09,South-East Asia
158,Sri Lanka,2805,11,2121,673,23,0,15,0.39,75.61,0.52,2730,75,2.75,South-East Asia
167,Thailand,3297,58,3111,128,6,0,2,1.76,94.36,1.86,3250,47,1.45,South-East Asia
168,Timor-Leste,24,0,0,24,0,0,0,0.0,0.0,0.0,24,0,0.0,South-East Asia


### Pandas 활용 4. 행을 기준으로 데이터 접근하기

In [122]:
# Sample data: library book records, one column per attribute.
books_dict = {
    "Available": [True, True, False],
    "Location": [102, 215, 323],
    "Genre": ["Programming", "Physics", "Math"],
}

# The book titles are used as the row labels instead of the default integers.
books_df = pd.DataFrame(
    data=books_dict,
    index=[
        "버그란 무엇인가",
        "두근두근 물리학",
        "미분해줘 홈즈",
    ],
)

books_df

Unnamed: 0,Available,Location,Genre
버그란 무엇인가,True,102,Programming
두근두근 물리학,True,215,Physics
미분해줘 홈즈,False,323,Math


### 인덱스를 이용해서 가져오기 : `.loc[row, col]`

In [123]:
# Selecting a single row by its label with .loc returns a Series.
type(books_df.loc["버그란 무엇인가"])

pandas.core.series.Series

In [124]:
#"미분해줘 홈즈 책이 대출가능한지?"

# .loc[row_label, column_label] reads one cell: is this book available to borrow?
books_df.loc["미분해줘 홈즈",'Available']

False

### 숫자 인덱스를 이용해서 가져오기 : `.iloc[rowidx,colidx]`

In [125]:
# Get the value at row position 0, column position 1.

books_df.iloc[0,1]

102

In [126]:
# Get row position 1, column positions 0 and 1 (the slice end 2 is exclusive).

books_df.iloc[1,0:2]

Available    True
Location      215
Name: 두근두근 물리학, dtype: object

### Pandas 활용 5. groupby

- Split : 특정한 "기준"을 바탕으로 DataFrame을 분할
- Apply : 통계함수 -sum(), mean(), median()- 을 적용해서 각 데이터를 압축
- Combine : Apply된 결과를 바탕으로 새로운 Series를 생성 (group_key : applied_value)

`.groupby()`

In [127]:
# Preview the data again before grouping.
covid.head(5)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


In [128]:
# Confirmed cases per WHO Region.
#
# Split step of split-apply-combine:
#   1. take only the "Confirmed" column from covid,
#   2. group it using covid's "WHO Region" column as the grouping key.
# Nothing is computed yet; the result is a lazy SeriesGroupBy object.
covid_by_region = covid["Confirmed"].groupby(
    covid["WHO Region"]
)

covid_by_region

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001C2CDC32910>

In [129]:
# Apply step: sum the confirmed cases within each region.
covid_by_region.sum()

WHO Region
Africa                    723207
Americas                 8839286
Eastern Mediterranean    1490744
Europe                   3299523
South-East Asia          1835297
Western Pacific           292428
Name: Confirmed, dtype: int64

In [130]:
# Average number of confirmed cases per country within each region.

covid_by_region.mean()  # i.e. sum() / number of countries in the region

WHO Region
Africa                    15066.812500
Americas                 252551.028571
Eastern Mediterranean     67761.090909
Europe                    58920.053571
South-East Asia          183529.700000
Western Pacific           18276.750000
Name: Confirmed, dtype: float64

## Mission:

### 1. covid 데이터에서 100 case 대비 사망률(Deaths / 100 Cases)이 가장 높은 국가는?

In [142]:
# Mission 1: the country with the highest 'Deaths / 100 Cases'.
# .max() returns the largest *value* of the column, which is not a row label
# and therefore cannot be used as the row key of .loc. .idxmax() returns the
# index label of the row holding the maximum, which .loc then uses to read
# that row's 'Country/Region'.
covid.loc[covid['Deaths / 100 Cases'].idxmax(), 'Country/Region']

### 2. covid 데이터에서 신규 확진자가 없는 나라 중 WHO Region이 'Europe'인 나라를 모두 출력하면?

Hint : 한 줄에 동시에 두 가지 조건을 적용하는 경우 Warning이 발생할 수 있음. 각 조건을 괄호로 감싸고 `&`로 연결할 것 (예: `df[(조건1) & (조건2)]`)

In [None]:
# TODO: Mission 2 is not implemented yet. Filter covid to the rows where
# 'New cases' is 0 and 'WHO Region' is 'Europe'; build each comparison as its
# own boolean mask, wrap each in parentheses, and combine them with `&`.
# covid.

### 3. 다음 데이터를 이용해 각 Region별로 아보카도가 가장 비싼 평균가격(AveragePrice.csv)을 출력하면?

kaggle 참고