In [3]:
import pandas as pd
from pandas import Series, DataFrame

## 1. Covid-19 데이터 가져오기
- https://github.com/owid/covid-19-data/tree/master/public/data : 매일 업데이트된 파일을 제공함

In [4]:
covid = pd.read_excel('data/owid-covid-data.xlsx')

## 2. 데이터 탐색 및 전처리

##### 데이터 크기, 컬럼들의 개수와 타입 등 확인

In [5]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70851 entries, 0 to 70850
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   iso_code                               70851 non-null  object 
 1   continent                              67325 non-null  object 
 2   location                               70851 non-null  object 
 3   date                                   70851 non-null  object 
 4   total_cases                            69938 non-null  float64
 5   new_cases                              69936 non-null  float64
 6   new_cases_smoothed                     68935 non-null  float64
 7   total_deaths                           60914 non-null  float64
 8   new_deaths                             61072 non-null  float64
 9   new_deaths_smoothed                    68935 non-null  float64
 10  total_cases_per_million                69555 non-null  float64
 11  ne

##### date 컬럼을 datetime으로 변경하기

In [6]:
covid['date'] =  pd.to_datetime(covid['date'], format='%Y-%m-%d')

In [7]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70851 entries, 0 to 70850
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   iso_code                               70851 non-null  object        
 1   continent                              67325 non-null  object        
 2   location                               70851 non-null  object        
 3   date                                   70851 non-null  datetime64[ns]
 4   total_cases                            69938 non-null  float64       
 5   new_cases                              69936 non-null  float64       
 6   new_cases_smoothed                     68935 non-null  float64       
 7   total_deaths                           60914 non-null  float64       
 8   new_deaths                             61072 non-null  float64       
 9   new_deaths_smoothed                    68935 non-null  float6

##### 년도(year), 달(month), 일(day), 주차(WeekNumber, %U)과 요일(weekDay, %a) 컬럼 추가하기

In [8]:
covid['year'] = covid['date'].apply(lambda x: x.strftime('%Y'))
covid['month'] = covid['date'].apply(lambda x: x.strftime('%m'))
covid['day'] = covid['date'].apply(lambda x: x.strftime('%d'))
covid['weekNumber'] = covid['date'].apply(lambda x: x.strftime('%U'))
covid['weekDay'] = covid['date'].apply(lambda x: x.strftime('%a'))

In [9]:
covid.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,year,month,day,weekNumber,weekDay
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,,37.746,0.5,64.83,0.511,2020,2,24,8,Mon
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,2020,2,25,8,Tue
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,2020,2,26,8,Wed
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,2020,2,27,8,Thu
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,2020,2,28,8,Fri


##### date를 row index로 설정하기

In [12]:
covid2 = covid.set_index(['date']).sort_index()

In [14]:
type(covid2.index)

pandas.core.indexes.datetimes.DatetimeIndex

##### 데이터 분석에 활용할 데이터의 범위 정하기 (나라, 기간)

1) 수집되고 있는 나라의 개수 확인

In [15]:
covid2.location.unique().size

214

In [16]:
covid2[covid2.location == 'Asia']

Unnamed: 0_level_0,iso_code,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,year,month,day,weekNumber,weekDay
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-22,OWID_ASI,,Asia,556.0,0.0,,17.0,0.0,,0.120,...,,,,,,2020,01,22,03,Wed
2020-01-23,OWID_ASI,,Asia,654.0,98.0,,18.0,1.0,,0.141,...,,,,,,2020,01,23,03,Thu
2020-01-24,OWID_ASI,,Asia,937.0,283.0,,26.0,8.0,,0.202,...,,,,,,2020,01,24,03,Fri
2020-01-25,OWID_ASI,,Asia,1428.0,491.0,,42.0,16.0,,0.308,...,,,,,,2020,01,25,03,Sat
2020-01-26,OWID_ASI,,Asia,2105.0,677.0,,56.0,14.0,,0.454,...,,,,,,2020,01,26,04,Sun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-19,OWID_ASI,,Asia,24385712.0,67228.0,64754.857,390453.0,894.0,838.429,5255.714,...,,,,,,2021,02,19,07,Fri
2021-02-20,OWID_ASI,,Asia,24448415.0,62703.0,64704.571,391362.0,909.0,845.857,5269.228,...,,,,,,2021,02,20,07,Sat
2021-02-21,OWID_ASI,,Asia,24515896.0,67481.0,65962.571,392079.0,717.0,842.143,5283.772,...,,,,,,2021,02,21,08,Sun
2021-02-22,OWID_ASI,,Asia,24583145.0,67249.0,67088.429,392833.0,754.0,849.714,5298.266,...,,,,,,2021,02,22,08,Mon


In [17]:
# 더 정확히 알아보기
# 204개의 나라에서 데이터 수집 중
covid2[covid2.continent.notnull()].location.unique().size

204

2) 데이터 수집 기간 확인

In [18]:
print(covid2.index.min(), covid2.index.max())

2020-01-01 00:00:00 2021-02-23 00:00:00


3) 날짜별 데이터 수집 개수 확인

In [19]:
# 결과에 표현되는 최대 row 갯수
pd.set_option('display.max_rows', 500)

# 결과에 표현되는 최대 columns 갯수
#pd.set_option('display.max_columns', 100)

In [20]:
covid2.index.value_counts().sort_index()

2020-01-01      2
2020-01-02      2
2020-01-03      2
2020-01-04      3
2020-01-05      3
2020-01-06      3
2020-01-07      3
2020-01-08      3
2020-01-09      3
2020-01-10      3
2020-01-11      3
2020-01-12      3
2020-01-13      3
2020-01-14      3
2020-01-15      3
2020-01-16      4
2020-01-17      4
2020-01-18      4
2020-01-19      4
2020-01-20      4
2020-01-21      5
2020-01-22     11
2020-01-23     15
2020-01-24     17
2020-01-25     20
2020-01-26     22
2020-01-27     25
2020-01-28     25
2020-01-29     27
2020-01-30     30
2020-01-31     34
2020-02-01     36
2020-02-02     42
2020-02-03     38
2020-02-04     39
2020-02-05     39
2020-02-06     40
2020-02-07     43
2020-02-08     43
2020-02-09     49
2020-02-10     43
2020-02-11     43
2020-02-12     43
2020-02-13     44
2020-02-14     45
2020-02-15     45
2020-02-16     50
2020-02-17     46
2020-02-18     46
2020-02-19     47
2020-02-20     48
2020-02-21     49
2020-02-22     50
2020-02-23     58
2020-02-24     58
2020-02-25

4) 코로나 바이러스가 가장 많이 걸린 상위 100개 나라만 선택

In [21]:
covid2[covid2.location == 'South Korea'] # total_cases가 누적 값인 것을 확인

Unnamed: 0_level_0,iso_code,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,year,month,day,weekNumber,weekDay
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-21,KOR,Asia,South Korea,,,,,,,,...,40.9,,12.27,83.03,0.916,2020,1,21,3,Tue
2020-01-22,KOR,Asia,South Korea,1.0,,,,,,0.02,...,40.9,,12.27,83.03,0.916,2020,1,22,3,Wed
2020-01-23,KOR,Asia,South Korea,1.0,0.0,,,,,0.02,...,40.9,,12.27,83.03,0.916,2020,1,23,3,Thu
2020-01-24,KOR,Asia,South Korea,2.0,1.0,,,,,0.039,...,40.9,,12.27,83.03,0.916,2020,1,24,3,Fri
2020-01-25,KOR,Asia,South Korea,2.0,0.0,,,,,0.039,...,40.9,,12.27,83.03,0.916,2020,1,25,3,Sat
2020-01-26,KOR,Asia,South Korea,3.0,1.0,,,,,0.059,...,40.9,,12.27,83.03,0.916,2020,1,26,4,Sun
2020-01-27,KOR,Asia,South Korea,4.0,1.0,,,,,0.078,...,40.9,,12.27,83.03,0.916,2020,1,27,4,Mon
2020-01-28,KOR,Asia,South Korea,4.0,0.0,0.429,,,0.0,0.078,...,40.9,,12.27,83.03,0.916,2020,1,28,4,Tue
2020-01-29,KOR,Asia,South Korea,4.0,0.0,0.429,,,0.0,0.078,...,40.9,,12.27,83.03,0.916,2020,1,29,4,Wed
2020-01-30,KOR,Asia,South Korea,4.0,0.0,0.429,,,0.0,0.078,...,40.9,,12.27,83.03,0.916,2020,1,30,4,Thu


In [22]:
# 1. covid2.index.max() 가장 최근 날짜 구하기
covid2.index.max()

Timestamp('2021-02-23 00:00:00')

In [23]:
# 2. 가장 최신 날짜의 데이터만 선택
recent = covid2[covid2.index == covid2.index.max()]

In [24]:
# 3. 값이 가장 큰 100개의 나라(location) 선택
top100 = recent.nlargest(100, 'total_cases').location.values
top100

array(['World', 'Europe', 'North America', 'United States', 'Asia',
       'European Union', 'South America', 'India', 'Brazil',
       'United Kingdom', 'Russia', 'Africa', 'France', 'Spain', 'Italy',
       'Turkey', 'Germany', 'Colombia', 'Argentina', 'Mexico', 'Poland',
       'Iran', 'South Africa', 'Ukraine', 'Indonesia', 'Peru', 'Czechia',
       'Netherlands', 'Canada', 'Chile', 'Portugal', 'Romania', 'Israel',
       'Belgium', 'Iraq', 'Sweden', 'Pakistan', 'Philippines',
       'Switzerland', 'Bangladesh', 'Morocco', 'Austria', 'Serbia',
       'Japan', 'Hungary', 'Saudi Arabia', 'United Arab Emirates',
       'Jordan', 'Lebanon', 'Panama', 'Slovakia', 'Malaysia', 'Belarus',
       'Ecuador', 'Nepal', 'Georgia', 'Kazakhstan', 'Bolivia', 'Bulgaria',
       'Croatia', 'Dominican Republic', 'Azerbaijan', 'Tunisia',
       'Ireland', 'Denmark', 'Costa Rica', 'Lithuania', 'Kuwait',
       'Slovenia', 'Greece', 'Egypt', 'Moldova', 'Palestine', 'Guatemala',
       'Armenia', 'Hondur

In [25]:
# 4. covid2에서 top100 나라 데이터만 선택하여, covid2에 다시 저장하기
covid2 = covid2[covid2.location.isin(top100)]

5) 분석에 활용할 데이터의 수집 기간 정하기

In [26]:
covid2.index.value_counts().sort_index()
# 2020-03-27~2021-02-23까지는 빠짐 없이 수집됨을 확인하였으므로, 해당 기간의 데이터만 활용하기로 함

2020-01-01      2
2020-01-02      2
2020-01-03      2
2020-01-04      2
2020-01-05      2
2020-01-06      2
2020-01-07      2
2020-01-08      2
2020-01-09      2
2020-01-10      2
2020-01-11      2
2020-01-12      2
2020-01-13      2
2020-01-14      2
2020-01-15      2
2020-01-16      2
2020-01-17      2
2020-01-18      2
2020-01-19      2
2020-01-20      2
2020-01-21      3
2020-01-22      9
2020-01-23     11
2020-01-24     13
2020-01-25     15
2020-01-26     16
2020-01-27     18
2020-01-28     18
2020-01-29     19
2020-01-30     21
2020-01-31     24
2020-02-01     26
2020-02-02     32
2020-02-03     28
2020-02-04     29
2020-02-05     29
2020-02-06     29
2020-02-07     31
2020-02-08     31
2020-02-09     36
2020-02-10     31
2020-02-11     31
2020-02-12     31
2020-02-13     32
2020-02-14     33
2020-02-15     33
2020-02-16     38
2020-02-17     34
2020-02-18     34
2020-02-19     35
2020-02-20     36
2020-02-21     37
2020-02-22     38
2020-02-23     46
2020-02-24     44
2020-02-25

In [27]:
data = covid2['2020-03-27':'2021-02-23']

In [28]:
data # 최종 데이터

Unnamed: 0_level_0,iso_code,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,year,month,day,weekNumber,weekDay
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-27,PAK,Asia,Pakistan,1495.0,122.0,109.286,12.0,1.0,1.286,6.768,...,36.700,59.607,0.600,67.27,0.557,2020,03,27,12,Fri
2020-03-27,VEN,South America,Venezuela,107.0,0.0,9.286,1.0,1.0,0.143,3.763,...,,,0.800,72.06,0.711,2020,03,27,12,Fri
2020-03-27,HUN,Europe,Hungary,300.0,39.0,30.714,10.0,0.0,1.000,31.055,...,34.800,,7.020,76.88,0.854,2020,03,27,12,Fri
2020-03-27,MDA,Europe,Moldova,199.0,22.0,19.000,2.0,1.0,0.143,49.331,...,44.600,86.979,5.800,71.90,0.750,2020,03,27,12,Fri
2020-03-27,NOR,Europe,Norway,3755.0,386.0,263.000,19.0,5.0,1.714,692.646,...,20.700,,3.600,82.40,0.957,2020,03,27,12,Fri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-23,GBR,Europe,United Kingdom,4146760.0,8523.0,10917.714,121536.0,548.0,445.000,61084.167,...,24.700,,2.540,81.32,0.932,2021,02,23,08,Tue
2021-02-23,NPL,Asia,Nepal,273666.0,110.0,103.000,2065.0,4.0,1.429,9392.450,...,37.800,47.782,0.300,70.78,0.602,2021,02,23,08,Tue
2021-02-23,RUS,Europe,Russia,4142126.0,11679.0,12655.857,82666.0,411.0,429.571,28383.467,...,58.300,,8.050,72.58,0.824,2021,02,23,08,Tue
2021-02-23,OWID_WRL,,World,112109753.0,387864.0,371186.429,2485434.0,11256.0,9467.571,14382.636,...,34.635,60.130,2.705,72.58,0.737,2021,02,23,08,Tue


## 3. 데이터 분석

#### [실습 #1] 누적 확진자수가 가장 많은 국가 10개 찾아보기

In [29]:
data[(data.index == data.index.max()) & (data.continent.notnull())].nlargest(10, 'total_cases')

Unnamed: 0_level_0,iso_code,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,year,month,day,weekNumber,weekDay
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-23,USA,North America,United States,28261595.0,71436.0,71562.143,502660.0,2350.0,2030.714,85381.779,...,24.6,,2.77,78.86,0.926,2021,2,23,8,Tue
2021-02-23,IND,Asia,India,11030176.0,13742.0,13265.143,156567.0,104.0,93.429,7992.856,...,20.6,59.55,0.53,69.66,0.645,2021,2,23,8,Tue
2021-02-23,BRA,South America,Brazil,10257875.0,62715.0,47984.857,248529.0,1386.0,1084.143,48258.861,...,17.9,,2.2,75.88,0.765,2021,2,23,8,Tue
2021-02-23,GBR,Europe,United Kingdom,4146760.0,8523.0,10917.714,121536.0,548.0,445.0,61084.167,...,24.7,,2.54,81.32,0.932,2021,2,23,8,Tue
2021-02-23,RUS,Europe,Russia,4142126.0,11679.0,12655.857,82666.0,411.0,429.571,28383.467,...,58.3,,8.05,72.58,0.824,2021,2,23,8,Tue
2021-02-23,FRA,Europe,France,3689534.0,20180.0,20154.571,85195.0,431.0,319.143,56524.215,...,35.6,,5.98,82.66,0.901,2021,2,23,8,Tue
2021-02-23,ESP,Europe,Spain,3161432.0,7461.0,9298.429,68079.0,443.0,300.0,67617.296,...,31.4,,2.97,83.56,0.904,2021,2,23,8,Tue
2021-02-23,ITA,Europe,Italy,2832162.0,13299.0,13224.429,96348.0,356.0,311.0,46842.15,...,27.8,,3.18,83.51,0.892,2021,2,23,8,Tue
2021-02-23,TUR,Asia,Turkey,2655633.0,9107.0,7657.0,28213.0,75.0,80.143,31487.579,...,41.1,,2.81,77.69,0.82,2021,2,23,8,Tue
2021-02-23,DEU,Europe,Germany,2405263.0,5764.0,7499.571,68785.0,422.0,422.286,28707.923,...,33.1,,8.0,81.33,0.947,2021,2,23,8,Tue


#### [실습 #2] 2021-02 한 달 동안(21-02-01~21-02-23) 가장 많은 확진자가 발생한 국가 10개 찾아보기

In [30]:
data

Unnamed: 0_level_0,iso_code,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,year,month,day,weekNumber,weekDay
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-27,PAK,Asia,Pakistan,1495.0,122.0,109.286,12.0,1.0,1.286,6.768,...,36.700,59.607,0.600,67.27,0.557,2020,03,27,12,Fri
2020-03-27,VEN,South America,Venezuela,107.0,0.0,9.286,1.0,1.0,0.143,3.763,...,,,0.800,72.06,0.711,2020,03,27,12,Fri
2020-03-27,HUN,Europe,Hungary,300.0,39.0,30.714,10.0,0.0,1.000,31.055,...,34.800,,7.020,76.88,0.854,2020,03,27,12,Fri
2020-03-27,MDA,Europe,Moldova,199.0,22.0,19.000,2.0,1.0,0.143,49.331,...,44.600,86.979,5.800,71.90,0.750,2020,03,27,12,Fri
2020-03-27,NOR,Europe,Norway,3755.0,386.0,263.000,19.0,5.0,1.714,692.646,...,20.700,,3.600,82.40,0.957,2020,03,27,12,Fri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-23,GBR,Europe,United Kingdom,4146760.0,8523.0,10917.714,121536.0,548.0,445.000,61084.167,...,24.700,,2.540,81.32,0.932,2021,02,23,08,Tue
2021-02-23,NPL,Asia,Nepal,273666.0,110.0,103.000,2065.0,4.0,1.429,9392.450,...,37.800,47.782,0.300,70.78,0.602,2021,02,23,08,Tue
2021-02-23,RUS,Europe,Russia,4142126.0,11679.0,12655.857,82666.0,411.0,429.571,28383.467,...,58.300,,8.050,72.58,0.824,2021,02,23,08,Tue
2021-02-23,OWID_WRL,,World,112109753.0,387864.0,371186.429,2485434.0,11256.0,9467.571,14382.636,...,34.635,60.130,2.705,72.58,0.737,2021,02,23,08,Tue


In [31]:
ex2 = data[data.continent.notnull()]['2021-02']\
    .pivot_table(index = 'location', values = ['new_cases', 'new_deaths'], aggfunc = 'sum')\
    .nlargest(10, 'new_cases')

In [32]:
ex2

Unnamed: 0_level_0,new_cases,new_deaths
location,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,2074560.0,54560.0
Brazil,1053144.0,24025.0
France,433614.0,8994.0
Spain,418313.0,9760.0
Russia,333778.0,10637.0
United Kingdom,318573.0,15169.0
Italy,279130.0,7832.0
India,272566.0,2175.0
Indonesia,220294.0,5016.0
Mexico,188006.0,23273.0


#### [실습 3] 실습 2에서 구한 나라들의 2020년 4월부터 2020년 12월까지의 월별 신규확진자수, 사망자수, 검사자수 구하기

In [33]:
# 1) data에서 실습2에서 구한 나라들의 데이터만 선택
ex3 = data[data.location.isin(ex2.index)]

In [34]:
# 2) 2020년 4월부터 2020년 12월 데이터만 선택
ex3 = ex3['2020-04':'2020-12']

In [35]:
# 3 월별 신규확진자수, 사망자수, 검사자수 구하기
ex3 = ex3.pivot_table(index = 'month', columns = 'location', aggfunc = 'sum', values = ['new_cases', 'new_deaths', 'new_tests'])

In [36]:
ex3

Unnamed: 0_level_0,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,...,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests
location,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,United States,...,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,United States
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
4,81470.0,116583.0,33466.0,8590.0,99671.0,18009.0,104161.0,117512.0,139956.0,888718.0,...,0.0,0.0,683294.0,62214.0,1472249.0,77689.0,3317307.0,0.0,718807.0,5618576.0
5,427662.0,22114.0,155746.0,16355.0,27534.0,71440.0,299345.0,26044.0,78768.0,717694.0,...,0.0,677559.0,2906826.0,136443.0,1899522.0,191581.0,7199301.0,0.0,2520931.0,11899129.0
6,887192.0,13269.0,394872.0,29912.0,7581.0,135425.0,241086.0,9792.0,27677.0,843368.0,...,0.0,1087930.0,4871627.0,215368.0,1511371.0,306893.0,8929059.0,0.0,2656857.0,18014790.0
7,1260444.0,22995.0,1110507.0,51991.0,6959.0,198548.0,191532.0,39251.0,19577.0,1924850.0,...,0.0,2051856.0,10224316.0,265632.0,1423003.0,407014.0,8042857.0,0.0,3949332.0,27454928.0
8,1245787.0,93921.0,1995178.0,66420.0,21677.0,174923.0,153941.0,174336.0,33290.0,1458662.0,...,0.0,3452145.0,23474944.0,299905.0,1831746.0,367906.0,7789748.0,0.0,5230736.0,25348800.0
9,902663.0,284733.0,2621418.0,112212.0,45647.0,143656.0,178397.0,306330.0,117765.0,1206239.0,...,0.0,5490691.0,31888815.0,633641.0,2689063.0,359178.0,9520619.0,0.0,6879501.0,26824516.0
10,724670.0,808471.0,1871498.0,123080.0,364569.0,181746.0,435468.0,416490.0,558947.0,1926939.0,...,0.0,7445662.0,34599335.0,810521.0,4450539.0,411629.0,13747959.0,0.0,8965952.0,35280518.0
11,800273.0,862510.0,1278727.0,128795.0,922124.0,188581.0,669669.0,462509.0,618941.0,4496449.0,...,0.0,6601086.0,31583912.0,907567.0,6160638.0,455048.0,15726155.0,0.0,9332998.0,46214321.0
12,1340095.0,400792.0,803865.0,204315.0,505612.0,312551.0,851411.0,280078.0,862499.0,6406683.0,...,0.0,9067261.0,29731625.0,644905.0,4653508.0,794686.0,12575649.0,0.0,11407092.0,52551708.0


In [38]:
# 4) plotly로 그리기
pd.options.plotting.backend = "plotly"
fig1 = ex3.plot()
fig1.show()
# plotly.express는 아직 계층 색인은 지원하지 않음

TypeError: Data frame columns is a pandas MultiIndex. pandas MultiIndex is not supported by plotly express at the moment.

In [92]:
# 4) plotly로 그리기
pd.options.plotting.backend = "plotly"
fig1 = ex3['new_cases'].plot(width = 700, height = 400)
fig1.show()

In [87]:
fig2 = ex3['new_deaths'].plot()
fig2.show()

In [41]:
fig3 = ex3['new_tests'].plot()
fig3.show()

In [67]:
ex3

Unnamed: 0_level_0,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,...,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests
location,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,United States,...,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,United States
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
4,81470.0,116583.0,33466.0,8590.0,99671.0,18009.0,104161.0,117512.0,139956.0,888718.0,...,0.0,0.0,683294.0,62214.0,1472249.0,77689.0,3317307.0,0.0,718807.0,5618576.0
5,427662.0,22114.0,155746.0,16355.0,27534.0,71440.0,299345.0,26044.0,78768.0,717694.0,...,0.0,677559.0,2906826.0,136443.0,1899522.0,191581.0,7199301.0,0.0,2520931.0,11899129.0
6,887192.0,13269.0,394872.0,29912.0,7581.0,135425.0,241086.0,9792.0,27677.0,843368.0,...,0.0,1087930.0,4871627.0,215368.0,1511371.0,306893.0,8929059.0,0.0,2656857.0,18014790.0
7,1260444.0,22995.0,1110507.0,51991.0,6959.0,198548.0,191532.0,39251.0,19577.0,1924850.0,...,0.0,2051856.0,10224316.0,265632.0,1423003.0,407014.0,8042857.0,0.0,3949332.0,27454928.0
8,1245787.0,93921.0,1995178.0,66420.0,21677.0,174923.0,153941.0,174336.0,33290.0,1458662.0,...,0.0,3452145.0,23474944.0,299905.0,1831746.0,367906.0,7789748.0,0.0,5230736.0,25348800.0
9,902663.0,284733.0,2621418.0,112212.0,45647.0,143656.0,178397.0,306330.0,117765.0,1206239.0,...,0.0,5490691.0,31888815.0,633641.0,2689063.0,359178.0,9520619.0,0.0,6879501.0,26824516.0
10,724670.0,808471.0,1871498.0,123080.0,364569.0,181746.0,435468.0,416490.0,558947.0,1926939.0,...,0.0,7445662.0,34599335.0,810521.0,4450539.0,411629.0,13747959.0,0.0,8965952.0,35280518.0
11,800273.0,862510.0,1278727.0,128795.0,922124.0,188581.0,669669.0,462509.0,618941.0,4496449.0,...,0.0,6601086.0,31583912.0,907567.0,6160638.0,455048.0,15726155.0,0.0,9332998.0,46214321.0
12,1340095.0,400792.0,803865.0,204315.0,505612.0,312551.0,851411.0,280078.0,862499.0,6406683.0,...,0.0,9067261.0,29731625.0,644905.0,4653508.0,794686.0,12575649.0,0.0,11407092.0,52551708.0


In [42]:
ex3.to_excel('data/covid_top10.xlsx')

In [None]:
# 이 부분은 하지 말자..## 

In [57]:
ex3_1 = ex3.stack(0).reset_index()

In [58]:
ex3_1.columns[2:12]

Index(['Brazil', 'France', 'India', 'Indonesia', 'Italy', 'Mexico', 'Russia',
       'Spain', 'United Kingdom', 'United States'],
      dtype='object', name='location')

In [59]:
# 하나의 figure 안에 각각의 subplot으로 그리기
ex3_1.plot(x='month',y = ex_3_1.columns[2:12],
                               facet_row = 'level_1')

In [60]:
# 5) 그래프를 html로 export 하기
fig1.write_html('covid_top10_countries_new_cases.html')
fig2.write_html('covid_top10_countries_new_deaths.html')
fig3.write_html('covid_top10_countries_new_tests.html')

## 4. 데이터 분석 결과로 웹 서버 구축하기 with Dash
 - https://plotly.com/python/getting-started/

In [81]:
# https://plotly.com/python/getting-started/
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objects as go

app = dash.Dash(__name__)

app.layout = html.Div([
    html.P("Color:"),
    dcc.Dropdown(
        id="dropdown",
        options=[
            {'label': x, 'value': x}
            for x in ['Gold', 'MediumTurquoise', 'LightGreen']
        ],
        value='Gold',
        clearable=False,
    ),
    dcc.Graph(id="graph"),
])

@app.callback(
    Output("graph", "figure"), 
    [Input("dropdown", "value")])
def display_color(color):
    fig = go.Figure(
        data=go.Bar(y=[2, 3, 1], marker_color=color))
    return fig

app.run_server(debug=True)

In [None]:
#### []

In [68]:
covid19_top10 = pd.read_excel('data/covid_top10.xlsx', header = [0,1])

In [76]:
covid19_top10

Unnamed: 0_level_0,Unnamed: 0_level_0,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,...,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests
Unnamed: 0_level_1,location,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,...,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,United States
0,month,,,,,,,,,,...,,,,,,,,,,
1,04,81470.0,116583.0,33466.0,8590.0,99671.0,18009.0,104161.0,117512.0,139956.0,...,0.0,0.0,683294.0,62214.0,1472249.0,77689.0,3317307.0,0.0,718807.0,5618576.0
2,05,427662.0,22114.0,155746.0,16355.0,27534.0,71440.0,299345.0,26044.0,78768.0,...,0.0,677559.0,2906826.0,136443.0,1899522.0,191581.0,7199301.0,0.0,2520931.0,11899129.0
3,06,887192.0,13269.0,394872.0,29912.0,7581.0,135425.0,241086.0,9792.0,27677.0,...,0.0,1087930.0,4871627.0,215368.0,1511371.0,306893.0,8929059.0,0.0,2656857.0,18014790.0
4,07,1260444.0,22995.0,1110507.0,51991.0,6959.0,198548.0,191532.0,39251.0,19577.0,...,0.0,2051856.0,10224316.0,265632.0,1423003.0,407014.0,8042857.0,0.0,3949332.0,27454928.0
5,08,1245787.0,93921.0,1995178.0,66420.0,21677.0,174923.0,153941.0,174336.0,33290.0,...,0.0,3452145.0,23474944.0,299905.0,1831746.0,367906.0,7789748.0,0.0,5230736.0,25348800.0
6,09,902663.0,284733.0,2621418.0,112212.0,45647.0,143656.0,178397.0,306330.0,117765.0,...,0.0,5490691.0,31888815.0,633641.0,2689063.0,359178.0,9520619.0,0.0,6879501.0,26824516.0
7,10,724670.0,808471.0,1871498.0,123080.0,364569.0,181746.0,435468.0,416490.0,558947.0,...,0.0,7445662.0,34599335.0,810521.0,4450539.0,411629.0,13747959.0,0.0,8965952.0,35280518.0
8,11,800273.0,862510.0,1278727.0,128795.0,922124.0,188581.0,669669.0,462509.0,618941.0,...,0.0,6601086.0,31583912.0,907567.0,6160638.0,455048.0,15726155.0,0.0,9332998.0,46214321.0
9,12,1340095.0,400792.0,803865.0,204315.0,505612.0,312551.0,851411.0,280078.0,862499.0,...,0.0,9067261.0,29731625.0,644905.0,4653508.0,794686.0,12575649.0,0.0,11407092.0,52551708.0


In [85]:
covid19_top10.set_index(covid19_top10.columns[0])[1:]
# 데이터

Unnamed: 0_level_0,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,new_cases,...,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests,new_tests
Unnamed: 0_level_1,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,United States,...,Brazil,France,India,Indonesia,Italy,Mexico,Russia,Spain,United Kingdom,United States
"(Unnamed: 0_level_0, location)",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
4,81470.0,116583.0,33466.0,8590.0,99671.0,18009.0,104161.0,117512.0,139956.0,888718.0,...,0.0,0.0,683294.0,62214.0,1472249.0,77689.0,3317307.0,0.0,718807.0,5618576.0
5,427662.0,22114.0,155746.0,16355.0,27534.0,71440.0,299345.0,26044.0,78768.0,717694.0,...,0.0,677559.0,2906826.0,136443.0,1899522.0,191581.0,7199301.0,0.0,2520931.0,11899129.0
6,887192.0,13269.0,394872.0,29912.0,7581.0,135425.0,241086.0,9792.0,27677.0,843368.0,...,0.0,1087930.0,4871627.0,215368.0,1511371.0,306893.0,8929059.0,0.0,2656857.0,18014790.0
7,1260444.0,22995.0,1110507.0,51991.0,6959.0,198548.0,191532.0,39251.0,19577.0,1924850.0,...,0.0,2051856.0,10224316.0,265632.0,1423003.0,407014.0,8042857.0,0.0,3949332.0,27454928.0
8,1245787.0,93921.0,1995178.0,66420.0,21677.0,174923.0,153941.0,174336.0,33290.0,1458662.0,...,0.0,3452145.0,23474944.0,299905.0,1831746.0,367906.0,7789748.0,0.0,5230736.0,25348800.0
9,902663.0,284733.0,2621418.0,112212.0,45647.0,143656.0,178397.0,306330.0,117765.0,1206239.0,...,0.0,5490691.0,31888815.0,633641.0,2689063.0,359178.0,9520619.0,0.0,6879501.0,26824516.0
10,724670.0,808471.0,1871498.0,123080.0,364569.0,181746.0,435468.0,416490.0,558947.0,1926939.0,...,0.0,7445662.0,34599335.0,810521.0,4450539.0,411629.0,13747959.0,0.0,8965952.0,35280518.0
11,800273.0,862510.0,1278727.0,128795.0,922124.0,188581.0,669669.0,462509.0,618941.0,4496449.0,...,0.0,6601086.0,31583912.0,907567.0,6160638.0,455048.0,15726155.0,0.0,9332998.0,46214321.0
12,1340095.0,400792.0,803865.0,204315.0,505612.0,312551.0,851411.0,280078.0,862499.0,6406683.0,...,0.0,9067261.0,29731625.0,644905.0,4653508.0,794686.0,12575649.0,0.0,11407092.0,52551708.0


In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import plotly.express as px


import pandas as pd 
pd.options.plotting.backend = "plotly" 

app = dash.Dash(__name__)
data = pd.read_excel('data/covid_top10.xlsx', header = [0,1]) #  분석 결과 가져오기
data = data.set_index(data.columns[0])[1:]                    # 첫번째 컬럼을 로우 인덱스로 변경

app.layout = html.Div([
    html.P("Type:"),
    dcc.Dropdown(
        id="dropdown",
        options=[
            {'label': x, 'value': x}
            for x in ['new_cases','new_deaths','new_tests']
        ],
        value='new_cases',
        clearable=False,
    ),
    dcc.Graph(id="graph"),
])

@app.callback(
    Output("graph", "figure"), 
    [Input("dropdown", "value")])
def display_graph(val):
    fig = data[val].plot()
    return fig

app.run_server(debug=True)