In [52]:
import pandas as pd

# Load the per-region COVID-19 case data and preview the first rows.
# The path is a raw string so the Windows backslashes are taken literally:
# in a regular string "\I", "\S", "\P", "\d" and "\c" are invalid escape
# sequences, which newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path — consider
# building it from a configurable data directory.
covid_data = pd.read_csv(
    r'C:\IDE\Skillfactory\PY-13 Visualisation\data\covid_19\covid-19.csv',
    sep=','
)
display(covid_data.head())

# We will work with the following columns:

# date — observation date;
# province/state — name of the province/state;
# country — name of the country;
# confirmed — total number of confirmed cases as of the given day;
# deaths — total number of recorded deaths as of the given day;
# recovered — total number of recoveries as of the given day.

Unnamed: 0,date,province/state,country,confirmed,deaths,recovered
0,01/22/2020,Anhui,China,1.0,0.0,0.0
1,01/22/2020,Beijing,China,14.0,0.0,0.0
2,01/22/2020,Chongqing,China,6.0,0.0,0.0
3,01/22/2020,Fujian,China,1.0,0.0,0.0
4,01/22/2020,Gansu,China,0.0,0.0,0.0


In [53]:
# Load the per-country vaccination data and keep only the columns used below.
# The path is a raw string so the Windows backslashes are taken literally:
# in a regular string "\I", "\S", "\P", "\d" and "\c" are invalid escape
# sequences, which newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path — consider
# building it from a configurable data directory.
vaccinations_data = pd.read_csv(
    r'C:\IDE\Skillfactory\PY-13 Visualisation\data\covid_19\country_vaccinations.csv'
)
vaccinations_data = vaccinations_data[
    ['country', 'date', 'total_vaccinations', 
     'people_vaccinated', 'people_vaccinated_per_hundred',
     'people_fully_vaccinated', 'people_fully_vaccinated_per_hundred',
     'daily_vaccinations', 'vaccines']
]

vaccinations_data.head()

# This table contains the following columns:

# country — name of the country;
# date — observation date;
# total_vaccinations — total number of vaccine doses administered in the country as of the given day;
# people_vaccinated — total number of people who received the first dose in the country as of the given day;
# people_vaccinated_per_hundred — percentage of people who received the first dose in the country as of the given day
#   (the formula was lost from the original note; presumably people_vaccinated / population * 100 — TODO confirm);
# people_fully_vaccinated — total number of people who received the second dose in the country as of the given day
#   (they had already received the first dose earlier);
# people_fully_vaccinated_per_hundred — percentage of people who received the second dose in the country as of the given day
#   (the formula was lost from the original note; presumably people_fully_vaccinated / population * 100 — TODO confirm);
# daily_vaccinations — daily vaccination (number of people vaccinated on the given day);
# vaccines — combinations of vaccines used in the country.

Unnamed: 0,country,date,total_vaccinations,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred,daily_vaccinations,vaccines
0,Afghanistan,2021-02-22,0.0,0.0,0.0,,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,2021-02-23,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,2021-02-24,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,2021-02-25,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,2021-02-26,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."


In [54]:
# Collapse the regional rows into one row per (date, country) by summing the
# counters over all regions, i.e. move from region-level to country-level data.
grouped_by_day_and_country = covid_data.groupby(['date', 'country'], as_index=False)
covid_data = grouped_by_day_and_country[['confirmed', 'deaths', 'recovered']].sum()

In [55]:
# Parse the date strings into datetime values with pd.to_datetime().
# NOTE(review): no explicit format is passed, so the format is inferred by pandas;
# the preview shows MM/DD/YYYY strings — confirm this holds for the whole file.
observation_dates = pd.to_datetime(covid_data['date'])
covid_data['date'] = observation_dates

In [56]:
# Add the number of currently ill people (active): total confirmed cases
# minus deaths minus recovered patients.
covid_data['active'] = (
    covid_data['confirmed']
    .sub(covid_data['deaths'])
    .sub(covid_data['recovered'])
)

In [57]:
# Add the day-over-day change in confirmed cases, deaths and recoveries.
# Rows are ordered by country and then by date, so that diff() within each
# country group yields the difference between consecutive observations.
# The first row of every country has no predecessor and therefore gets NaN.
covid_data = covid_data.sort_values(by=['country', 'date'])

tracked_metrics = ['confirmed', 'deaths', 'recovered']
per_country_change = covid_data.groupby('country')[tracked_metrics].diff()
for metric in tracked_metrics:
    covid_data['daily_' + metric] = per_country_change[metric]

In [58]:
# Display the country-level table, now including the active and daily_* columns.
covid_data

Unnamed: 0,date,country,confirmed,deaths,recovered,active,daily_confirmed,daily_deaths,daily_recovered
11337,2020-02-24,Afghanistan,1.0,0.0,0.0,1.0,,,
11570,2020-02-25,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0
11807,2020-02-26,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0
12051,2020-02-27,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0
12299,2020-02-28,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
15933,2020-03-12,occupied Palestinian territory,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16595,2020-03-14,occupied Palestinian territory,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16938,2020-03-15,occupied Palestinian territory,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17290,2020-03-16,occupied Palestinian territory,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# For vaccinations_data it is enough to convert the date column to datetime,
# then display the resulting table.

vaccination_dates = pd.to_datetime(vaccinations_data['date'])
vaccinations_data['date'] = vaccination_dates
vaccinations_data

Unnamed: 0,country,date,total_vaccinations,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred,daily_vaccinations,vaccines
0,Afghanistan,2021-02-22,0.0,0.0,0.00,,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,2021-02-23,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,2021-02-24,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,2021-02-25,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,2021-02-26,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
...,...,...,...,...,...,...,...,...,...
42790,Zimbabwe,2021-09-01,4270430.0,2615233.0,17.33,1655197.0,10.97,36416.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."
42791,Zimbabwe,2021-09-02,4323735.0,2649505.0,17.56,1674230.0,11.09,39711.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."
42792,Zimbabwe,2021-09-03,4372216.0,2681657.0,17.77,1690559.0,11.20,42317.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."
42793,Zimbabwe,2021-09-04,4400246.0,2698332.0,17.88,1701914.0,11.28,41413.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."


In [60]:
# Left-join the vaccination figures onto the case data by date and country
# (rows of covid_data without a vaccination match are kept), then show the
# shape of the combined table.
covid_df = pd.merge(
    covid_data,
    vaccinations_data,
    how='left',
    on=['date', 'country'],
)
covid_df.shape

(86785, 16)

In [61]:
# Death and recovery rates as a percentage of confirmed cases.
confirmed_cases = covid_df['confirmed']
covid_df['death_rate'] = covid_df['deaths'].div(confirmed_cases).mul(100)
covid_df['recover_rate'] = covid_df['recovered'].div(confirmed_cases).mul(100)

In [62]:
# Mean recovery rate over all rows for Russia, rounded to two decimals.
russia_rows = covid_df['country'] == 'Russia'
round(covid_df.loc[russia_rows, 'recover_rate'].mean(), 2)

67.06

In [63]:
# Maximum recovery rate among United States rows dated 2021-05-29,
# rounded to two decimals.
is_united_states = covid_df['country'] == 'United States'
is_target_day = covid_df['date'] == '2021-05-29'
round(covid_df.loc[is_united_states & is_target_day, 'recover_rate'].max(), 2)

0.0

In [64]:
# Display every row of the merged table that belongs to the United States.
united_states_rows = covid_df['country'] == 'United States'
covid_df.loc[united_states_rows]

Unnamed: 0,date,country,confirmed,deaths,recovered,active,daily_confirmed,daily_deaths,daily_recovered,total_vaccinations,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred,daily_vaccinations,vaccines,death_rate,recover_rate
82542,2020-01-22,United States,1.0,0.0,0.0,1.0,,,,,,,,,,,0.000000,0.0
82543,2020-01-23,United States,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,0.000000,0.0
82544,2020-01-24,United States,2.0,0.0,0.0,2.0,1.0,0.0,0.0,,,,,,,,0.000000,0.0
82545,2020-01-25,United States,2.0,0.0,0.0,2.0,0.0,0.0,0.0,,,,,,,,0.000000,0.0
82546,2020-01-26,United States,5.0,0.0,0.0,5.0,3.0,0.0,0.0,,,,,,,,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83031,2021-05-25,United States,33166418.0,590941.0,0.0,32575477.0,22756.0,621.0,0.0,287788872.0,164378258.0,48.87,131078608.0,38.97,1750524.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",1.781745,0.0
83032,2021-05-26,United States,33190470.0,591950.0,0.0,32598520.0,24052.0,1009.0,0.0,289212304.0,165074907.0,49.08,131850089.0,39.20,1703162.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",1.783494,0.0
83033,2021-05-27,United States,33217995.0,593288.0,0.0,32624707.0,27525.0,1338.0,0.0,290724607.0,165718717.0,49.27,132769894.0,39.48,1618194.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",1.786044,0.0
83034,2021-05-28,United States,33239963.0,593963.0,0.0,32646000.0,21968.0,675.0,0.0,292099778.0,166388129.0,49.47,133532544.0,39.70,1500632.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",1.786894,0.0


In [65]:
# Sum of the 'confirmed' column over all rows for Russia, rounded to two decimals.
# NOTE(review): 'confirmed' is described as the total number of cases as of each
# day, so this adds running totals from different days together — confirm that
# this is the intended figure.
russia_mask = covid_df['country'] == 'Russia'
round(covid_df.loc[russia_mask, 'confirmed'].sum(), 2)

930548849.0

In [66]:
# Earliest observation date present in the vaccination table.
earliest_vaccination_date = vaccinations_data['date'].min()
earliest_vaccination_date

Timestamp('2020-12-02 00:00:00')